Diffstat (limited to 'dep')
-rw-r--r--  dep/PackageList.txt  |    6
-rw-r--r--  dep/acelite/ace/OS_NS_Thread.cpp  |    8
-rw-r--r--  dep/jemalloc/COPYING  |    4
-rw-r--r--  dep/jemalloc/ChangeLog  |  131
-rw-r--r--  dep/jemalloc/README  |   18
-rw-r--r--  dep/jemalloc/VERSION  |    2
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/arena.h  |  127
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/chunk_dss.h  |    2
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/ckh.h  |    2
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/hash.h  |   22
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/huge.h  |   14
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/jemalloc_internal.h  |  205
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/private_namespace.h  |   35
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/prng.h  |    4
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/prof.h  |   92
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/ql.h  |   36
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/qr.h  |   22
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/rb.h  |    4
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/rtree.h  |   50
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/tcache.h  |    5
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/tsd.h  |   39
-rw-r--r--  dep/jemalloc/include/jemalloc/internal/util.h  |   22
-rw-r--r--  dep/jemalloc/include/jemalloc/jemalloc.h  |  189
-rw-r--r--  dep/jemalloc/jemalloc_defs.h.in.cmake  |    6
-rw-r--r--  dep/jemalloc/src/arena.c  |  974
-rw-r--r--  dep/jemalloc/src/bitmap.c  |    2
-rw-r--r--  dep/jemalloc/src/chunk.c  |   30
-rw-r--r--  dep/jemalloc/src/chunk_dss.c  |   15
-rw-r--r--  dep/jemalloc/src/chunk_mmap.c  |    4
-rw-r--r--  dep/jemalloc/src/ckh.c  |   12
-rw-r--r--  dep/jemalloc/src/ctl.c  |  329
-rw-r--r--  dep/jemalloc/src/huge.c  |   82
-rw-r--r--  dep/jemalloc/src/jemalloc.c  | 1193
-rw-r--r--  dep/jemalloc/src/mutex.c  |    2
-rw-r--r--  dep/jemalloc/src/prof.c  |  645
-rw-r--r--  dep/jemalloc/src/quarantine.c  |   13
-rw-r--r--  dep/jemalloc/src/rtree.c  |   78
-rw-r--r--  dep/jemalloc/src/stats.c  |    8
-rw-r--r--  dep/jemalloc/src/tcache.c  |    9
-rw-r--r--  dep/jemalloc/src/tsd.c  |   36
-rw-r--r--  dep/jemalloc/src/util.c  |   85
-rw-r--r--  dep/jemalloc/src/zone.c  |    2
-rw-r--r--  dep/recastnavigation/Detour/CMakeLists.txt  |   13
-rw-r--r--  dep/recastnavigation/Detour/DetourObstacleAvoidance.cpp  |   96
-rw-r--r--  dep/recastnavigation/Detour/DetourObstacleAvoidance.h  |   72
-rw-r--r--  dep/recastnavigation/Detour/Include/DetourAlloc.h (renamed from dep/recastnavigation/Detour/DetourAlloc.h)  |    0
-rw-r--r--  dep/recastnavigation/Detour/Include/DetourAssert.h (renamed from dep/recastnavigation/Detour/DetourAssert.h)  |    0
-rw-r--r--  dep/recastnavigation/Detour/Include/DetourCommon.h (renamed from dep/recastnavigation/Detour/DetourCommon.h)  |    5
-rw-r--r--  dep/recastnavigation/Detour/Include/DetourNavMesh.h (renamed from dep/recastnavigation/Detour/DetourNavMesh.h)  |   23
-rw-r--r--  dep/recastnavigation/Detour/Include/DetourNavMeshBuilder.h (renamed from dep/recastnavigation/Detour/DetourNavMeshBuilder.h)  |    0
-rw-r--r--  dep/recastnavigation/Detour/Include/DetourNavMeshQuery.h (renamed from dep/recastnavigation/Detour/DetourNavMeshQuery.h)  |   14
-rw-r--r--  dep/recastnavigation/Detour/Include/DetourNode.h (renamed from dep/recastnavigation/Detour/DetourNode.h)  |    0
-rw-r--r--  dep/recastnavigation/Detour/Include/DetourStatus.h (renamed from dep/recastnavigation/Detour/DetourStatus.h)  |    0
-rw-r--r--  dep/recastnavigation/Detour/Source/DetourAlloc.cpp (renamed from dep/recastnavigation/Detour/DetourAlloc.cpp)  |    0
-rw-r--r--  dep/recastnavigation/Detour/Source/DetourCommon.cpp (renamed from dep/recastnavigation/Detour/DetourCommon.cpp)  |    0
-rw-r--r--  dep/recastnavigation/Detour/Source/DetourNavMesh.cpp (renamed from dep/recastnavigation/Detour/DetourNavMesh.cpp)  |   44
-rw-r--r--  dep/recastnavigation/Detour/Source/DetourNavMeshBuilder.cpp (renamed from dep/recastnavigation/Detour/DetourNavMeshBuilder.cpp)  |    0
-rw-r--r--  dep/recastnavigation/Detour/Source/DetourNavMeshQuery.cpp (renamed from dep/recastnavigation/Detour/DetourNavMeshQuery.cpp)  |  117
-rw-r--r--  dep/recastnavigation/Detour/Source/DetourNode.cpp (renamed from dep/recastnavigation/Detour/DetourNode.cpp)  |    0
-rw-r--r--  dep/recastnavigation/Readme.txt  |   76
-rw-r--r--  dep/recastnavigation/Recast/CMakeLists.txt  |   22
-rw-r--r--  dep/recastnavigation/Recast/Include/Recast.h (renamed from dep/recastnavigation/Recast/Recast.h)  |    7
-rw-r--r--  dep/recastnavigation/Recast/Include/RecastAlloc.h (renamed from dep/recastnavigation/Recast/RecastAlloc.h)  |    0
-rw-r--r--  dep/recastnavigation/Recast/Include/RecastAssert.h (renamed from dep/recastnavigation/Recast/RecastAssert.h)  |    0
-rw-r--r--  dep/recastnavigation/Recast/Source/Recast.cpp (renamed from dep/recastnavigation/Recast/Recast.cpp)  |   20
-rw-r--r--  dep/recastnavigation/Recast/Source/RecastAlloc.cpp (renamed from dep/recastnavigation/Recast/RecastAlloc.cpp)  |    0
-rw-r--r--  dep/recastnavigation/Recast/Source/RecastArea.cpp (renamed from dep/recastnavigation/Recast/RecastArea.cpp)  |    0
-rw-r--r--  dep/recastnavigation/Recast/Source/RecastContour.cpp (renamed from dep/recastnavigation/Recast/RecastContour.cpp)  |    0
-rw-r--r--  dep/recastnavigation/Recast/Source/RecastFilter.cpp (renamed from dep/recastnavigation/Recast/RecastFilter.cpp)  |    0
-rw-r--r--  dep/recastnavigation/Recast/Source/RecastLayers.cpp (renamed from dep/recastnavigation/Recast/RecastLayers.cpp)  |    4
-rw-r--r--  dep/recastnavigation/Recast/Source/RecastMesh.cpp (renamed from dep/recastnavigation/Recast/RecastMesh.cpp)  |   47
-rw-r--r--  dep/recastnavigation/Recast/Source/RecastMeshDetail.cpp (renamed from dep/recastnavigation/Recast/RecastMeshDetail.cpp)  |  102
-rw-r--r--  dep/recastnavigation/Recast/Source/RecastRasterization.cpp (renamed from dep/recastnavigation/Recast/RecastRasterization.cpp)  |  113
-rw-r--r--  dep/recastnavigation/Recast/Source/RecastRegion.cpp (renamed from dep/recastnavigation/Recast/RecastRegion.cpp)  |  141
-rw-r--r--  dep/recastnavigation/TODO.txt  |   20
-rw-r--r--  dep/recastnavigation/recast_hotfix1.diff  |   13
-rw-r--r--  dep/recastnavigation/recastnavigation.diff  | 4066
77 files changed, 7439 insertions, 2140 deletions
diff --git a/dep/PackageList.txt b/dep/PackageList.txt
index 7d6d8a0e85c..a8a6a854e97 100644
--- a/dep/PackageList.txt
+++ b/dep/PackageList.txt
@@ -14,7 +14,7 @@ G3D (a commercial-grade C++ 3D engine available as Open Source (BSD License)
jemalloc (a general-purpose scalable concurrent malloc-implementation)
http://www.canonware.com/jemalloc/
- Version: 3.3.1
+ Version: 3.6.0
MySQL (the world's most popular open source database software)
http://www.mysql.com/
@@ -37,8 +37,8 @@ gSOAP (a portable development toolkit for C and C++ XML Web services and XML dat
Version: 2.8.10
recastnavigation (Recast is state of the art navigation mesh construction toolset for games)
- http://code.google.com/p/recastnavigation/
- Version: 1.4
+ https://github.com/memononen/recastnavigation
+ Version: 740a7ba51600a3c87ce5667ae276a38284a1ce75
StormLib (a pack of modules, written in C++, which are able to read and also to write files from/to the MPQ archives)
http://www.zezula.net/en/mpq/stormlib.html
diff --git a/dep/acelite/ace/OS_NS_Thread.cpp b/dep/acelite/ace/OS_NS_Thread.cpp
index 8e36eb9e4f7..c450bc81371 100644
--- a/dep/acelite/ace/OS_NS_Thread.cpp
+++ b/dep/acelite/ace/OS_NS_Thread.cpp
@@ -885,8 +885,12 @@ ACE_TSS_Cleanup::thread_detach_key (ACE_thread_key_t key)
ACE_TSS_CLEANUP_GUARD
u_int key_index = key;
- ACE_ASSERT (key_index < sizeof(this->table_)/sizeof(this->table_[0])
- && this->table_[key_index].key_ == key);
+ ACE_ASSERT (key_index < sizeof(this->table_)/sizeof(this->table_[0]));
+ // If this entry was never set, just bug out. If it is set, but is the
+ // wrong key, assert.
+ if (this->table_[key_index].key_ == 0)
+ return 0;
+ ACE_ASSERT(this->table_[key_index].key_ == key);
ACE_TSS_Info &info = this->table_ [key_index];
// sanity check
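The hunk above splits a compound ACE_ASSERT so that a slot whose key_ was never set makes thread_detach_key() return quietly, while a genuinely mismatched key still asserts. A minimal standalone sketch of that guard-then-assert pattern, with a hypothetical table type rather than ACE's real internals:

    #include <assert.h>
    #include <stddef.h>

    typedef struct { unsigned key_; } tss_info_t;  /* key_ == 0: never set */

    static int
    thread_detach_key(tss_info_t *table, size_t nslots, unsigned key)
    {
        size_t key_index = key;

        assert(key_index < nslots);            /* out of range: a real bug */
        if (table[key_index].key_ == 0)
            return 0;                          /* never set: just bug out */
        assert(table[key_index].key_ == key);  /* set but wrong: a real bug */
        /* ... proceed with the actual detach ... */
        return 0;
    }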
diff --git a/dep/jemalloc/COPYING b/dep/jemalloc/COPYING
index 019e8132275..bdda0feb9e5 100644
--- a/dep/jemalloc/COPYING
+++ b/dep/jemalloc/COPYING
@@ -1,10 +1,10 @@
Unless otherwise specified, files in the jemalloc source distribution are
subject to the following license:
--------------------------------------------------------------------------------
-Copyright (C) 2002-2013 Jason Evans <jasone@canonware.com>.
+Copyright (C) 2002-2014 Jason Evans <jasone@canonware.com>.
All rights reserved.
Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved.
-Copyright (C) 2009-2013 Facebook, Inc. All rights reserved.
+Copyright (C) 2009-2014 Facebook, Inc. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
diff --git a/dep/jemalloc/ChangeLog b/dep/jemalloc/ChangeLog
index fc096d8f42f..d56ee999e69 100644
--- a/dep/jemalloc/ChangeLog
+++ b/dep/jemalloc/ChangeLog
@@ -3,8 +3,131 @@ bug fixes are all mentioned, but internal enhancements are omitted here for
brevity (even though they are more fun to write about). Much more detail can be
found in the git revision history:
- http://www.canonware.com/cgi-bin/gitweb.cgi?p=jemalloc.git
- git://canonware.com/jemalloc.git
+ https://github.com/jemalloc/jemalloc
+
+* 3.6.0 (March 31, 2014)
+
+ This version contains a critical bug fix for a regression present in 3.5.0 and
+ 3.5.1.
+
+ Bug fixes:
+ - Fix a regression in arena_chunk_alloc() that caused crashes during
+ small/large allocation if chunk allocation failed. In the absence of this
+ bug, chunk allocation failure would result in allocation failure, e.g. NULL
+ return from malloc(). This regression was introduced in 3.5.0.
+ - Fix backtracing for gcc intrinsics-based backtracing by specifying
+ -fno-omit-frame-pointer to gcc. Note that the application (and all the
+ libraries it links to) must also be compiled with this option for
+ backtracing to be reliable.
+ - Use dss allocation precedence for huge allocations as well as small/large
+ allocations.
+ - Fix test assertion failure message formatting. This bug did not manifest on
+ x86_64 systems because of implementation subtleties in va_list.
+ - Fix inconsequential test failures for hash and SFMT code.
+
+ New features:
+ - Support heap profiling on FreeBSD. This feature depends on the proc
+ filesystem being mounted during heap profile dumping.
+
+* 3.5.1 (February 25, 2014)
+
+ This version primarily addresses minor bugs in test code.
+
+ Bug fixes:
+ - Configure Solaris/Illumos to use MADV_FREE.
+ - Fix junk filling for mremap(2)-based huge reallocation. This is only
+ relevant if configuring with the --enable-mremap option specified.
+ - Avoid compilation failure if 'restrict' C99 keyword is not supported by the
+ compiler.
+ - Add a configure test for SSE2 rather than assuming it is usable on i686
+ systems. This fixes test compilation errors, especially on 32-bit Linux
+ systems.
+ - Fix mallctl argument size mismatches (size_t vs. uint64_t) in the stats unit
+ test.
+ - Fix/remove flawed alignment-related overflow tests.
+ - Prevent compiler optimizations that could change backtraces in the
+ prof_accum unit test.
+
+* 3.5.0 (January 22, 2014)
+
+ This version focuses on refactoring and automated testing, though it also
+ includes some non-trivial heap profiling optimizations not mentioned below.
+
+ New features:
+ - Add the *allocx() API, which is a successor to the experimental *allocm()
+ API. The *allocx() functions are slightly simpler to use because they have
+ fewer parameters, they directly return the results of primary interest, and
+ mallocx()/rallocx() avoid the strict aliasing pitfall that
+ allocm()/rallocm() share with posix_memalign(). Note that *allocm() is
+ slated for removal in the next non-bugfix release.
+ - Add support for LinuxThreads.
+
+ Bug fixes:
+ - Unless heap profiling is enabled, disable floating point code and don't link
+ with libm. This, in combination with e.g. EXTRA_CFLAGS=-mno-sse on x64
+ systems, makes it possible to completely disable floating point register
+ use. Some versions of glibc neglect to save/restore caller-saved floating
+ point registers during dynamic lazy symbol loading, and the symbol loading
+ code uses whatever malloc the application happens to have linked/loaded
+ with, the result being potential floating point register corruption.
+ - Report ENOMEM rather than EINVAL if an OOM occurs during heap profiling
+ backtrace creation in imemalign(). This bug impacted posix_memalign() and
+ aligned_alloc().
+ - Fix a file descriptor leak in a prof_dump_maps() error path.
+ - Fix prof_dump() to close the dump file descriptor for all relevant error
+ paths.
+ - Fix rallocm() to use the arena specified by the ALLOCM_ARENA(s) flag for
+ allocation, not just deallocation.
+ - Fix a data race for large allocation stats counters.
+ - Fix a potential infinite loop during thread exit. This bug occurred on
+ Solaris, and could affect other platforms with similar pthreads TSD
+ implementations.
+ - Don't junk-fill reallocations unless usable size changes. This fixes a
+ violation of the *allocx()/*allocm() semantics.
+ - Fix growing large reallocation to junk fill new space.
+ - Fix huge deallocation to junk fill when munmap is disabled.
+ - Change the default private namespace prefix from empty to je_, and change
+ --with-private-namespace-prefix so that it prepends an additional prefix
+ rather than replacing je_. This reduces the likelihood of applications
+ which statically link jemalloc experiencing symbol name collisions.
+ - Add missing private namespace mangling (relevant when
+ --with-private-namespace is specified).
+ - Add and use JEMALLOC_INLINE_C so that static inline functions are marked as
+ static even for debug builds.
+ - Add a missing mutex unlock in a malloc_init_hard() error path. In practice
+ this error path is never executed.
+ - Fix numerous bugs in malloc_strtoumax() error handling/reporting. These
+ bugs had no impact except for malformed inputs.
+ - Fix numerous bugs in malloc_snprintf(). These bugs were not exercised by
+ existing calls, so they had no impact.
+
+* 3.4.1 (October 20, 2013)
+
+ Bug fixes:
+ - Fix a race in the "arenas.extend" mallctl that could cause memory corruption
+ of internal data structures and subsequent crashes.
+ - Fix Valgrind integration flaws that caused Valgrind warnings about reads of
+ uninitialized memory in:
+ + arena chunk headers
+ + internal zero-initialized data structures (relevant to tcache and prof
+ code)
+ - Preserve errno during the first allocation. A readlink(2) call during
+ initialization fails unless /etc/malloc.conf exists, so errno was typically
+ set during the first allocation prior to this fix.
+ - Fix compilation warnings reported by gcc 4.8.1.
+
+* 3.4.0 (June 2, 2013)
+
+ This version is essentially a small bugfix release, but the addition of
+ aarch64 support requires that the minor version be incremented.
+
+ Bug fixes:
+ - Fix race-triggered deadlocks in chunk_record(). These deadlocks were
+ typically triggered by multiple threads concurrently deallocating huge
+ objects.
+
+ New features:
+ - Add support for the aarch64 architecture.
* 3.3.1 (March 6, 2013)
@@ -15,7 +138,7 @@ found in the git revision history:
- Fix a locking order bug that could cause deadlock during fork if heap
profiling were enabled.
- Fix a chunk recycling bug that could cause the allocator to lose track of
- whether a chunk was zeroed. On FreeBSD, NetBSD, and OS X, it could cause
+ whether a chunk was zeroed. On FreeBSD, NetBSD, and OS X, it could cause
corruption if allocating via sbrk(2) (unlikely unless running with the
"dss:primary" option specified). This was completely harmless on Linux
unless using mlockall(2) (and unlikely even then, unless the
@@ -47,7 +170,7 @@ found in the git revision history:
Bug fixes:
- Fix "arenas.extend" mallctl to output the number of arenas.
- - Fix chunk_recycyle() to unconditionally inform Valgrind that returned memory
+ - Fix chunk_recycle() to unconditionally inform Valgrind that returned memory
is undefined.
- Fix build break on FreeBSD related to alloca.h.
diff --git a/dep/jemalloc/README b/dep/jemalloc/README
index 7661683bae7..9b268f42288 100644
--- a/dep/jemalloc/README
+++ b/dep/jemalloc/README
@@ -1,10 +1,14 @@
-jemalloc is a general-purpose scalable concurrent malloc(3) implementation.
-This distribution is a "portable" implementation that currently targets
-FreeBSD, Linux, Apple OS X, and MinGW. jemalloc is included as the default
-allocator in the FreeBSD and NetBSD operating systems, and it is used by the
-Mozilla Firefox web browser on Microsoft Windows-related platforms. Depending
-on your needs, one of the other divergent versions may suit your needs better
-than this distribution.
+jemalloc is a general purpose malloc(3) implementation that emphasizes
+fragmentation avoidance and scalable concurrency support. jemalloc first came
+into use as the FreeBSD libc allocator in 2005, and since then it has found its
+way into numerous applications that rely on its predictable behavior. In 2010
+jemalloc development efforts broadened to include developer support features
+such as heap profiling, Valgrind integration, and extensive monitoring/tuning
+hooks. Modern jemalloc releases continue to be integrated back into FreeBSD,
+and therefore versatility remains critical. Ongoing development efforts trend
+toward making jemalloc among the best allocators for a broad range of demanding
+applications, and eliminating/mitigating weaknesses that have practical
+repercussions for real world applications.
The COPYING file contains copyright and licensing information.
diff --git a/dep/jemalloc/VERSION b/dep/jemalloc/VERSION
index 900c82d1043..dace31ba7b6 100644
--- a/dep/jemalloc/VERSION
+++ b/dep/jemalloc/VERSION
@@ -1 +1 @@
-3.3.1-0-g9ef9d9e8c271cdf14f664b871a8f98c827714784
+3.6.0-0-g46c0af68bd248b04df75e4f92d5fb804c3d75340
diff --git a/dep/jemalloc/include/jemalloc/internal/arena.h b/dep/jemalloc/include/jemalloc/internal/arena.h
index f2c18f43543..9d000c03dec 100644
--- a/dep/jemalloc/include/jemalloc/internal/arena.h
+++ b/dep/jemalloc/include/jemalloc/internal/arena.h
@@ -158,6 +158,7 @@ struct arena_chunk_map_s {
};
typedef rb_tree(arena_chunk_map_t) arena_avail_tree_t;
typedef rb_tree(arena_chunk_map_t) arena_run_tree_t;
+typedef ql_head(arena_chunk_map_t) arena_chunk_mapelms_t;
/* Arena chunk header. */
struct arena_chunk_s {
@@ -174,11 +175,12 @@ struct arena_chunk_s {
size_t nruns_avail;
/*
- * Number of available run adjacencies. Clean and dirty available runs
- * are not coalesced, which causes virtual memory fragmentation. The
- * ratio of (nruns_avail-nruns_adjac):nruns_adjac is used for tracking
- * this fragmentation.
- * */
+ * Number of available run adjacencies that purging could coalesce.
+ * Clean and dirty available runs are not coalesced, which causes
+ * virtual memory fragmentation. The ratio of
+ * (nruns_avail-nruns_adjac):nruns_adjac is used for tracking this
+ * fragmentation.
+ */
size_t nruns_adjac;
/*
@@ -404,7 +406,16 @@ void arena_tcache_fill_small(arena_t *arena, tcache_bin_t *tbin,
size_t binind, uint64_t prof_accumbytes);
void arena_alloc_junk_small(void *ptr, arena_bin_info_t *bin_info,
bool zero);
+#ifdef JEMALLOC_JET
+typedef void (arena_redzone_corruption_t)(void *, size_t, bool, size_t,
+ uint8_t);
+extern arena_redzone_corruption_t *arena_redzone_corruption;
+typedef void (arena_dalloc_junk_small_t)(void *, arena_bin_info_t *);
+extern arena_dalloc_junk_small_t *arena_dalloc_junk_small;
+#else
void arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info);
+#endif
+void arena_quarantine_junk_small(void *ptr, size_t usize);
void *arena_malloc_small(arena_t *arena, size_t size, bool zero);
void *arena_malloc_large(arena_t *arena, size_t size, bool zero);
void *arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero);
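The JEMALLOC_JET block above converts arena_dalloc_junk_small() from a plain function into a named function pointer so the test suite can interpose on it. A schematic of the pattern with hypothetical names (not the actual build machinery):

    /* header: tests see a writable pointer, production sees a function */
    #ifdef JEMALLOC_JET
    typedef void (dalloc_junk_t)(void *ptr, size_t usize);
    extern dalloc_junk_t *dalloc_junk;     /* a test may reassign this */
    #else
    void dalloc_junk(void *ptr, size_t usize);
    #endif

A test then saves the original pointer, installs a wrapper that records its arguments, exercises the allocator, and restores the pointer afterwards.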
@@ -415,10 +426,18 @@ void arena_dalloc_bin(arena_t *arena, arena_chunk_t *chunk, void *ptr,
size_t pageind, arena_chunk_map_t *mapelm);
void arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
size_t pageind);
+#ifdef JEMALLOC_JET
+typedef void (arena_dalloc_junk_large_t)(void *, size_t);
+extern arena_dalloc_junk_large_t *arena_dalloc_junk_large;
+#endif
void arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk,
void *ptr);
void arena_dalloc_large(arena_t *arena, arena_chunk_t *chunk, void *ptr);
-void *arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
+#ifdef JEMALLOC_JET
+typedef void (arena_ralloc_junk_large_t)(void *, size_t, size_t);
+extern arena_ralloc_junk_large_t *arena_ralloc_junk_large;
+#endif
+bool arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
size_t extra, bool zero);
void *arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size,
size_t extra, size_t alignment, bool zero, bool try_tcache_alloc,
@@ -441,6 +460,7 @@ void arena_postfork_child(arena_t *arena);
#ifndef JEMALLOC_ENABLE_INLINE
arena_chunk_map_t *arena_mapp_get(arena_chunk_t *chunk, size_t pageind);
size_t *arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind);
+size_t arena_mapbitsp_read(size_t *mapbitsp);
size_t arena_mapbits_get(arena_chunk_t *chunk, size_t pageind);
size_t arena_mapbits_unallocated_size_get(arena_chunk_t *chunk,
size_t pageind);
@@ -451,6 +471,7 @@ size_t arena_mapbits_dirty_get(arena_chunk_t *chunk, size_t pageind);
size_t arena_mapbits_unzeroed_get(arena_chunk_t *chunk, size_t pageind);
size_t arena_mapbits_large_get(arena_chunk_t *chunk, size_t pageind);
size_t arena_mapbits_allocated_get(arena_chunk_t *chunk, size_t pageind);
+void arena_mapbitsp_write(size_t *mapbitsp, size_t mapbits);
void arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind,
size_t size, size_t flags);
void arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind,
@@ -471,7 +492,7 @@ size_t arena_bin_index(arena_t *arena, arena_bin_t *bin);
unsigned arena_run_regind(arena_run_t *run, arena_bin_info_t *bin_info,
const void *ptr);
prof_ctx_t *arena_prof_ctx_get(const void *ptr);
-void arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
+void arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
void *arena_malloc(arena_t *arena, size_t size, bool zero, bool try_tcache);
size_t arena_salloc(const void *ptr, bool demote);
void arena_dalloc(arena_t *arena, arena_chunk_t *chunk, void *ptr,
@@ -498,10 +519,17 @@ arena_mapbitsp_get(arena_chunk_t *chunk, size_t pageind)
}
JEMALLOC_ALWAYS_INLINE size_t
+arena_mapbitsp_read(size_t *mapbitsp)
+{
+
+ return (*mapbitsp);
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
arena_mapbits_get(arena_chunk_t *chunk, size_t pageind)
{
- return (*arena_mapbitsp_get(chunk, pageind));
+ return (arena_mapbitsp_read(arena_mapbitsp_get(chunk, pageind)));
}
JEMALLOC_ALWAYS_INLINE size_t
@@ -585,82 +613,89 @@ arena_mapbits_allocated_get(arena_chunk_t *chunk, size_t pageind)
}
JEMALLOC_ALWAYS_INLINE void
+arena_mapbitsp_write(size_t *mapbitsp, size_t mapbits)
+{
+
+ *mapbitsp = mapbits;
+}
+
+JEMALLOC_ALWAYS_INLINE void
arena_mapbits_unallocated_set(arena_chunk_t *chunk, size_t pageind, size_t size,
size_t flags)
{
- size_t *mapbitsp;
+ size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
- mapbitsp = arena_mapbitsp_get(chunk, pageind);
assert((size & PAGE_MASK) == 0);
assert((flags & ~CHUNK_MAP_FLAGS_MASK) == 0);
assert((flags & (CHUNK_MAP_DIRTY|CHUNK_MAP_UNZEROED)) == flags);
- *mapbitsp = size | CHUNK_MAP_BININD_INVALID | flags;
+ arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags);
}
JEMALLOC_ALWAYS_INLINE void
arena_mapbits_unallocated_size_set(arena_chunk_t *chunk, size_t pageind,
size_t size)
{
- size_t *mapbitsp;
+ size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
+ size_t mapbits = arena_mapbitsp_read(mapbitsp);
- mapbitsp = arena_mapbitsp_get(chunk, pageind);
assert((size & PAGE_MASK) == 0);
- assert((*mapbitsp & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == 0);
- *mapbitsp = size | (*mapbitsp & PAGE_MASK);
+ assert((mapbits & (CHUNK_MAP_LARGE|CHUNK_MAP_ALLOCATED)) == 0);
+ arena_mapbitsp_write(mapbitsp, size | (mapbits & PAGE_MASK));
}
JEMALLOC_ALWAYS_INLINE void
arena_mapbits_large_set(arena_chunk_t *chunk, size_t pageind, size_t size,
size_t flags)
{
- size_t *mapbitsp;
+ size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
+ size_t mapbits = arena_mapbitsp_read(mapbitsp);
size_t unzeroed;
- mapbitsp = arena_mapbitsp_get(chunk, pageind);
assert((size & PAGE_MASK) == 0);
assert((flags & CHUNK_MAP_DIRTY) == flags);
- unzeroed = *mapbitsp & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */
- *mapbitsp = size | CHUNK_MAP_BININD_INVALID | flags | unzeroed |
- CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED;
+ unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */
+ arena_mapbitsp_write(mapbitsp, size | CHUNK_MAP_BININD_INVALID | flags
+ | unzeroed | CHUNK_MAP_LARGE | CHUNK_MAP_ALLOCATED);
}
JEMALLOC_ALWAYS_INLINE void
arena_mapbits_large_binind_set(arena_chunk_t *chunk, size_t pageind,
size_t binind)
{
- size_t *mapbitsp;
+ size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
+ size_t mapbits = arena_mapbitsp_read(mapbitsp);
assert(binind <= BININD_INVALID);
- mapbitsp = arena_mapbitsp_get(chunk, pageind);
assert(arena_mapbits_large_size_get(chunk, pageind) == PAGE);
- *mapbitsp = (*mapbitsp & ~CHUNK_MAP_BININD_MASK) | (binind <<
- CHUNK_MAP_BININD_SHIFT);
+ arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_BININD_MASK) |
+ (binind << CHUNK_MAP_BININD_SHIFT));
}
JEMALLOC_ALWAYS_INLINE void
arena_mapbits_small_set(arena_chunk_t *chunk, size_t pageind, size_t runind,
size_t binind, size_t flags)
{
- size_t *mapbitsp;
+ size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
+ size_t mapbits = arena_mapbitsp_read(mapbitsp);
size_t unzeroed;
assert(binind < BININD_INVALID);
- mapbitsp = arena_mapbitsp_get(chunk, pageind);
assert(pageind - runind >= map_bias);
assert((flags & CHUNK_MAP_DIRTY) == flags);
- unzeroed = *mapbitsp & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */
- *mapbitsp = (runind << LG_PAGE) | (binind << CHUNK_MAP_BININD_SHIFT) |
- flags | unzeroed | CHUNK_MAP_ALLOCATED;
+ unzeroed = mapbits & CHUNK_MAP_UNZEROED; /* Preserve unzeroed. */
+ arena_mapbitsp_write(mapbitsp, (runind << LG_PAGE) | (binind <<
+ CHUNK_MAP_BININD_SHIFT) | flags | unzeroed | CHUNK_MAP_ALLOCATED);
}
JEMALLOC_ALWAYS_INLINE void
arena_mapbits_unzeroed_set(arena_chunk_t *chunk, size_t pageind,
size_t unzeroed)
{
- size_t *mapbitsp;
+ size_t *mapbitsp = arena_mapbitsp_get(chunk, pageind);
+ size_t mapbits = arena_mapbitsp_read(mapbitsp);
- mapbitsp = arena_mapbitsp_get(chunk, pageind);
- *mapbitsp = (*mapbitsp & ~CHUNK_MAP_UNZEROED) | unzeroed;
+ arena_mapbitsp_write(mapbitsp, (mapbits & ~CHUNK_MAP_UNZEROED) |
+ unzeroed);
}
JEMALLOC_INLINE bool
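The hunks above funnel every map-bits access through arena_mapbitsp_read()/arena_mapbitsp_write() instead of dereferencing the pointer inline. The value is a single choke point: making the accesses atomic, or instrumenting them, later requires touching only two one-line functions instead of every call site. Reduced to its essentials (hypothetical names):

    static inline size_t
    bits_read(size_t *bitsp)
    {
        return (*bitsp);     /* one place to swap in an atomic load later */
    }

    static inline void
    bits_write(size_t *bitsp, size_t bits)
    {
        *bitsp = bits;       /* one place to swap in an atomic store later */
    }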
@@ -869,10 +904,10 @@ arena_prof_ctx_get(const void *ptr)
}
JEMALLOC_INLINE void
-arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
+arena_prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
{
arena_chunk_t *chunk;
- size_t pageind, mapbits;
+ size_t pageind;
cassert(config_prof);
assert(ptr != NULL);
@@ -880,10 +915,17 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE;
- mapbits = arena_mapbits_get(chunk, pageind);
- assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
- if ((mapbits & CHUNK_MAP_LARGE) == 0) {
+ assert(arena_mapbits_allocated_get(chunk, pageind) != 0);
+
+ if (usize > SMALL_MAXCLASS || (prof_promote &&
+ ((uintptr_t)ctx != (uintptr_t)1U || arena_mapbits_large_get(chunk,
+ pageind) != 0))) {
+ assert(arena_mapbits_large_get(chunk, pageind) != 0);
+ arena_mapp_get(chunk, pageind)->prof_ctx = ctx;
+ } else {
+ assert(arena_mapbits_large_get(chunk, pageind) == 0);
if (prof_promote == false) {
+ size_t mapbits = arena_mapbits_get(chunk, pageind);
arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
(uintptr_t)((pageind - (mapbits >> LG_PAGE)) <<
LG_PAGE));
@@ -895,12 +937,11 @@ arena_prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
bin_info = &arena_bin_info[binind];
regind = arena_run_regind(run, bin_info, ptr);
- *((prof_ctx_t **)((uintptr_t)run + bin_info->ctx0_offset
- + (regind * sizeof(prof_ctx_t *)))) = ctx;
- } else
- assert((uintptr_t)ctx == (uintptr_t)1U);
- } else
- arena_mapp_get(chunk, pageind)->prof_ctx = ctx;
+ *((prof_ctx_t **)((uintptr_t)run +
+ bin_info->ctx0_offset + (regind * sizeof(prof_ctx_t
+ *)))) = ctx;
+ }
+ }
}
JEMALLOC_ALWAYS_INLINE void *
diff --git a/dep/jemalloc/include/jemalloc/internal/chunk_dss.h b/dep/jemalloc/include/jemalloc/internal/chunk_dss.h
index 6585f071bbe..4535ce09c09 100644
--- a/dep/jemalloc/include/jemalloc/internal/chunk_dss.h
+++ b/dep/jemalloc/include/jemalloc/internal/chunk_dss.h
@@ -7,7 +7,7 @@ typedef enum {
dss_prec_secondary = 2,
dss_prec_limit = 3
-} dss_prec_t ;
+} dss_prec_t;
#define DSS_PREC_DEFAULT dss_prec_secondary
#define DSS_DEFAULT "secondary"
diff --git a/dep/jemalloc/include/jemalloc/internal/ckh.h b/dep/jemalloc/include/jemalloc/internal/ckh.h
index 50c39ed9581..58712a6a763 100644
--- a/dep/jemalloc/include/jemalloc/internal/ckh.h
+++ b/dep/jemalloc/include/jemalloc/internal/ckh.h
@@ -17,7 +17,7 @@ typedef bool ckh_keycomp_t (const void *, const void *);
* There are 2^LG_CKH_BUCKET_CELLS cells in each hash table bucket. Try to fit
* one bucket per L1 cache line.
*/
-#define LG_CKH_BUCKET_CELLS (LG_CACHELINE - LG_SIZEOF_PTR - 1)
+#define LG_CKH_BUCKET_CELLS (LG_CACHELINE - LG_SIZEOF_PTR - 1)
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
diff --git a/dep/jemalloc/include/jemalloc/internal/hash.h b/dep/jemalloc/include/jemalloc/internal/hash.h
index 56ecc793b36..c7183ede82d 100644
--- a/dep/jemalloc/include/jemalloc/internal/hash.h
+++ b/dep/jemalloc/include/jemalloc/internal/hash.h
@@ -19,6 +19,11 @@
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
+uint32_t hash_x86_32(const void *key, int len, uint32_t seed);
+void hash_x86_128(const void *key, const int len, uint32_t seed,
+ uint64_t r_out[2]);
+void hash_x64_128(const void *key, const int len, const uint32_t seed,
+ uint64_t r_out[2]);
void hash(const void *key, size_t len, const uint32_t seed,
size_t r_hash[2]);
#endif
@@ -43,14 +48,14 @@ JEMALLOC_INLINE uint32_t
hash_get_block_32(const uint32_t *p, int i)
{
- return p[i];
+ return (p[i]);
}
JEMALLOC_INLINE uint64_t
hash_get_block_64(const uint64_t *p, int i)
{
- return p[i];
+ return (p[i]);
}
JEMALLOC_INLINE uint32_t
@@ -63,7 +68,7 @@ hash_fmix_32(uint32_t h)
h *= 0xc2b2ae35;
h ^= h >> 16;
- return h;
+ return (h);
}
JEMALLOC_INLINE uint64_t
@@ -76,7 +81,7 @@ hash_fmix_64(uint64_t k)
k *= QU(0xc4ceb9fe1a85ec53LLU);
k ^= k >> 33;
- return k;
+ return (k);
}
JEMALLOC_INLINE uint32_t
@@ -127,12 +132,12 @@ hash_x86_32(const void *key, int len, uint32_t seed)
h1 = hash_fmix_32(h1);
- return h1;
+ return (h1);
}
UNUSED JEMALLOC_INLINE void
hash_x86_128(const void *key, const int len, uint32_t seed,
- uint64_t r_out[2])
+ uint64_t r_out[2])
{
const uint8_t * data = (const uint8_t *) key;
const int nblocks = len / 16;
@@ -234,7 +239,7 @@ hash_x86_128(const void *key, const int len, uint32_t seed,
UNUSED JEMALLOC_INLINE void
hash_x64_128(const void *key, const int len, const uint32_t seed,
- uint64_t r_out[2])
+ uint64_t r_out[2])
{
const uint8_t *data = (const uint8_t *) key;
const int nblocks = len / 16;
@@ -310,13 +315,12 @@ hash_x64_128(const void *key, const int len, const uint32_t seed,
r_out[1] = h2;
}
-
/******************************************************************************/
/* API. */
JEMALLOC_INLINE void
hash(const void *key, size_t len, const uint32_t seed, size_t r_hash[2])
{
-#if (LG_SIZEOF_PTR == 3)
+#if (LG_SIZEOF_PTR == 3 && !defined(JEMALLOC_BIG_ENDIAN))
hash_x64_128(key, len, seed, (uint64_t *)r_hash);
#else
uint64_t hashes[2];
diff --git a/dep/jemalloc/include/jemalloc/internal/huge.h b/dep/jemalloc/include/jemalloc/internal/huge.h
index d987d370767..a2b9c779191 100644
--- a/dep/jemalloc/include/jemalloc/internal/huge.h
+++ b/dep/jemalloc/include/jemalloc/internal/huge.h
@@ -17,14 +17,20 @@ extern size_t huge_allocated;
/* Protects chunk-related data structures. */
extern malloc_mutex_t huge_mtx;
-void *huge_malloc(size_t size, bool zero);
-void *huge_palloc(size_t size, size_t alignment, bool zero);
-void *huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
+void *huge_malloc(size_t size, bool zero, dss_prec_t dss_prec);
+void *huge_palloc(size_t size, size_t alignment, bool zero,
+ dss_prec_t dss_prec);
+bool huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size,
size_t extra);
void *huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
- size_t alignment, bool zero, bool try_tcache_dalloc);
+ size_t alignment, bool zero, bool try_tcache_dalloc, dss_prec_t dss_prec);
+#ifdef JEMALLOC_JET
+typedef void (huge_dalloc_junk_t)(void *, size_t);
+extern huge_dalloc_junk_t *huge_dalloc_junk;
+#endif
void huge_dalloc(void *ptr, bool unmap);
size_t huge_salloc(const void *ptr);
+dss_prec_t huge_dss_prec_get(arena_t *arena);
prof_ctx_t *huge_prof_ctx_get(const void *ptr);
void huge_prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
bool huge_boot(void);
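huge_malloc(), huge_palloc(), and huge_ralloc() now carry a dss_prec_t so that huge allocations obey the same sbrk(2)-versus-mmap precedence as small/large ones (one of the 3.6.0 fixes in the ChangeLog above). At the public API level that precedence is the "dss" setting; a minimal sketch that reads the global default through the long-standing mallctl() interface:

    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    int
    main(void)
    {
        const char *dss;    /* "disabled", "primary", or "secondary" */
        size_t sz = sizeof(dss);

        if (mallctl("opt.dss", &dss, &sz, NULL, 0) == 0)
            printf("dss precedence: %s\n", dss);
        return (0);
    }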
diff --git a/dep/jemalloc/include/jemalloc/internal/jemalloc_internal.h b/dep/jemalloc/include/jemalloc/internal/jemalloc_internal.h
index 80045bda4bd..cf171326c29 100644
--- a/dep/jemalloc/include/jemalloc/internal/jemalloc_internal.h
+++ b/dep/jemalloc/include/jemalloc/internal/jemalloc_internal.h
@@ -1,5 +1,5 @@
#ifndef JEMALLOC_INTERNAL_H
-#define JEMALLOC_INTERNAL_H
+#define JEMALLOC_INTERNAL_H
#include <math.h>
#ifdef _WIN32
# include <windows.h>
@@ -54,8 +54,7 @@ typedef intptr_t ssize_t;
#endif
#include <fcntl.h>
-#define JEMALLOC_NO_DEMANGLE
-#include "../jemalloc.h"
+#include "jemalloc_defs.h"
#ifdef JEMALLOC_UTRACE
#include <sys/ktrace.h>
@@ -66,6 +65,8 @@ typedef intptr_t ssize_t;
#include <valgrind/memcheck.h>
#endif
+#define JEMALLOC_NO_DEMANGLE
+#include "../jemalloc.h"
#include "jemalloc/internal/private_namespace.h"
#ifdef JEMALLOC_CC_SILENCE
@@ -221,8 +222,13 @@ static const bool config_ivsalloc =
* JEMALLOC_H_INLINES : Inline functions.
*/
/******************************************************************************/
-#define JEMALLOC_H_TYPES
+#define JEMALLOC_H_TYPES
+
+#ifndef JEMALLOC_HAS_RESTRICT
+# define restrict
+#endif
+#define MALLOCX_LG_ALIGN_MASK ((int)0x3f)
#define ALLOCM_LG_ALIGN_MASK ((int)0x3f)
#define ZU(z) ((size_t)z)
@@ -232,20 +238,26 @@ static const bool config_ivsalloc =
# define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
#endif
-#ifdef JEMALLOC_DEBUG
+#if defined(JEMALLOC_DEBUG)
/* Disable inlining to make debugging easier. */
# define JEMALLOC_ALWAYS_INLINE
+# define JEMALLOC_ALWAYS_INLINE_C static
# define JEMALLOC_INLINE
+# define JEMALLOC_INLINE_C static
# define inline
#else
# define JEMALLOC_ENABLE_INLINE
# ifdef JEMALLOC_HAVE_ATTR
# define JEMALLOC_ALWAYS_INLINE \
static inline JEMALLOC_ATTR(unused) JEMALLOC_ATTR(always_inline)
+# define JEMALLOC_ALWAYS_INLINE_C \
+ static inline JEMALLOC_ATTR(always_inline)
# else
# define JEMALLOC_ALWAYS_INLINE static inline
+# define JEMALLOC_ALWAYS_INLINE_C static inline
# endif
# define JEMALLOC_INLINE static inline
+# define JEMALLOC_INLINE_C static inline
# ifdef _MSC_VER
# define inline _inline
# endif
@@ -278,6 +290,9 @@ static const bool config_ivsalloc =
# ifdef __arm__
# define LG_QUANTUM 3
# endif
+# ifdef __aarch64__
+# define LG_QUANTUM 4
+# endif
# ifdef __hppa__
# define LG_QUANTUM 4
# endif
@@ -478,7 +493,7 @@ static const bool config_ivsalloc =
#undef JEMALLOC_H_TYPES
/******************************************************************************/
-#define JEMALLOC_H_STRUCTS
+#define JEMALLOC_H_STRUCTS
#include "jemalloc/internal/util.h"
#include "jemalloc/internal/atomic.h"
@@ -507,14 +522,14 @@ typedef struct {
uint64_t deallocated;
} thread_allocated_t;
/*
- * The JEMALLOC_CONCAT() wrapper is necessary to pass {0, 0} via a cpp macro
+ * The JEMALLOC_ARG_CONCAT() wrapper is necessary to pass {0, 0} via a cpp macro
* argument.
*/
-#define THREAD_ALLOCATED_INITIALIZER JEMALLOC_CONCAT({0, 0})
+#define THREAD_ALLOCATED_INITIALIZER JEMALLOC_ARG_CONCAT({0, 0})
#undef JEMALLOC_H_STRUCTS
/******************************************************************************/
-#define JEMALLOC_H_EXTERNS
+#define JEMALLOC_H_EXTERNS
extern bool opt_abort;
extern bool opt_junk;
@@ -574,7 +589,7 @@ void jemalloc_postfork_child(void);
#undef JEMALLOC_H_EXTERNS
/******************************************************************************/
-#define JEMALLOC_H_INLINES
+#define JEMALLOC_H_INLINES
#include "jemalloc/internal/util.h"
#include "jemalloc/internal/atomic.h"
@@ -749,32 +764,36 @@ choose_arena(arena_t *arena)
#include "jemalloc/internal/quarantine.h"
#ifndef JEMALLOC_ENABLE_INLINE
-void *imallocx(size_t size, bool try_tcache, arena_t *arena);
+void *imalloct(size_t size, bool try_tcache, arena_t *arena);
void *imalloc(size_t size);
-void *icallocx(size_t size, bool try_tcache, arena_t *arena);
+void *icalloct(size_t size, bool try_tcache, arena_t *arena);
void *icalloc(size_t size);
-void *ipallocx(size_t usize, size_t alignment, bool zero, bool try_tcache,
+void *ipalloct(size_t usize, size_t alignment, bool zero, bool try_tcache,
arena_t *arena);
void *ipalloc(size_t usize, size_t alignment, bool zero);
size_t isalloc(const void *ptr, bool demote);
size_t ivsalloc(const void *ptr, bool demote);
size_t u2rz(size_t usize);
size_t p2rz(const void *ptr);
-void idallocx(void *ptr, bool try_tcache);
+void idalloct(void *ptr, bool try_tcache);
void idalloc(void *ptr);
-void iqallocx(void *ptr, bool try_tcache);
+void iqalloct(void *ptr, bool try_tcache);
void iqalloc(void *ptr);
-void *irallocx(void *ptr, size_t size, size_t extra, size_t alignment,
- bool zero, bool no_move, bool try_tcache_alloc, bool try_tcache_dalloc,
+void *iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra,
+ size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc,
arena_t *arena);
+void *iralloct(void *ptr, size_t size, size_t extra, size_t alignment,
+ bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena);
void *iralloc(void *ptr, size_t size, size_t extra, size_t alignment,
- bool zero, bool no_move);
+ bool zero);
+bool ixalloc(void *ptr, size_t size, size_t extra, size_t alignment,
+ bool zero);
malloc_tsd_protos(JEMALLOC_ATTR(unused), thread_allocated, thread_allocated_t)
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_))
JEMALLOC_ALWAYS_INLINE void *
-imallocx(size_t size, bool try_tcache, arena_t *arena)
+imalloct(size_t size, bool try_tcache, arena_t *arena)
{
assert(size != 0);
@@ -782,35 +801,35 @@ imallocx(size_t size, bool try_tcache, arena_t *arena)
if (size <= arena_maxclass)
return (arena_malloc(arena, size, false, try_tcache));
else
- return (huge_malloc(size, false));
+ return (huge_malloc(size, false, huge_dss_prec_get(arena)));
}
JEMALLOC_ALWAYS_INLINE void *
imalloc(size_t size)
{
- return (imallocx(size, true, NULL));
+ return (imalloct(size, true, NULL));
}
JEMALLOC_ALWAYS_INLINE void *
-icallocx(size_t size, bool try_tcache, arena_t *arena)
+icalloct(size_t size, bool try_tcache, arena_t *arena)
{
if (size <= arena_maxclass)
return (arena_malloc(arena, size, true, try_tcache));
else
- return (huge_malloc(size, true));
+ return (huge_malloc(size, true, huge_dss_prec_get(arena)));
}
JEMALLOC_ALWAYS_INLINE void *
icalloc(size_t size)
{
- return (icallocx(size, true, NULL));
+ return (icalloct(size, true, NULL));
}
JEMALLOC_ALWAYS_INLINE void *
-ipallocx(size_t usize, size_t alignment, bool zero, bool try_tcache,
+ipalloct(size_t usize, size_t alignment, bool zero, bool try_tcache,
arena_t *arena)
{
void *ret;
@@ -825,9 +844,9 @@ ipallocx(size_t usize, size_t alignment, bool zero, bool try_tcache,
ret = arena_palloc(choose_arena(arena), usize,
alignment, zero);
} else if (alignment <= chunksize)
- ret = huge_malloc(usize, zero);
+ ret = huge_malloc(usize, zero, huge_dss_prec_get(arena));
else
- ret = huge_palloc(usize, alignment, zero);
+ ret = huge_palloc(usize, alignment, zero, huge_dss_prec_get(arena));
}
assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret);
@@ -838,7 +857,7 @@ JEMALLOC_ALWAYS_INLINE void *
ipalloc(size_t usize, size_t alignment, bool zero)
{
- return (ipallocx(usize, alignment, zero, true, NULL));
+ return (ipalloct(usize, alignment, zero, true, NULL));
}
/*
@@ -870,7 +889,7 @@ ivsalloc(const void *ptr, bool demote)
{
/* Return 0 if ptr is not within a chunk managed by jemalloc. */
- if (rtree_get(chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == NULL)
+ if (rtree_get(chunks_rtree, (uintptr_t)CHUNK_ADDR2BASE(ptr)) == 0)
return (0);
return (isalloc(ptr, demote));
@@ -899,7 +918,7 @@ p2rz(const void *ptr)
}
JEMALLOC_ALWAYS_INLINE void
-idallocx(void *ptr, bool try_tcache)
+idalloct(void *ptr, bool try_tcache)
{
arena_chunk_t *chunk;
@@ -916,31 +935,63 @@ JEMALLOC_ALWAYS_INLINE void
idalloc(void *ptr)
{
- idallocx(ptr, true);
+ idalloct(ptr, true);
}
JEMALLOC_ALWAYS_INLINE void
-iqallocx(void *ptr, bool try_tcache)
+iqalloct(void *ptr, bool try_tcache)
{
if (config_fill && opt_quarantine)
quarantine(ptr);
else
- idallocx(ptr, try_tcache);
+ idalloct(ptr, try_tcache);
}
JEMALLOC_ALWAYS_INLINE void
iqalloc(void *ptr)
{
- iqallocx(ptr, true);
+ iqalloct(ptr, true);
}
JEMALLOC_ALWAYS_INLINE void *
-irallocx(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
- bool no_move, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena)
+iralloct_realign(void *ptr, size_t oldsize, size_t size, size_t extra,
+ size_t alignment, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc,
+ arena_t *arena)
+{
+ void *p;
+ size_t usize, copysize;
+
+ usize = sa2u(size + extra, alignment);
+ if (usize == 0)
+ return (NULL);
+ p = ipalloct(usize, alignment, zero, try_tcache_alloc, arena);
+ if (p == NULL) {
+ if (extra == 0)
+ return (NULL);
+ /* Try again, without extra this time. */
+ usize = sa2u(size, alignment);
+ if (usize == 0)
+ return (NULL);
+ p = ipalloct(usize, alignment, zero, try_tcache_alloc, arena);
+ if (p == NULL)
+ return (NULL);
+ }
+ /*
+ * Copy at most size bytes (not size+extra), since the caller has no
+ * expectation that the extra bytes will be reliably preserved.
+ */
+ copysize = (size < oldsize) ? size : oldsize;
+ memcpy(p, ptr, copysize);
+ iqalloct(ptr, try_tcache_dalloc);
+ return (p);
+}
+
+JEMALLOC_ALWAYS_INLINE void *
+iralloct(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
+ bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena)
{
- void *ret;
size_t oldsize;
assert(ptr != NULL);
@@ -950,68 +1001,50 @@ irallocx(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
!= 0) {
- size_t usize, copysize;
-
/*
* Existing object alignment is inadequate; allocate new space
* and copy.
*/
- if (no_move)
- return (NULL);
- usize = sa2u(size + extra, alignment);
- if (usize == 0)
- return (NULL);
- ret = ipallocx(usize, alignment, zero, try_tcache_alloc, arena);
- if (ret == NULL) {
- if (extra == 0)
- return (NULL);
- /* Try again, without extra this time. */
- usize = sa2u(size, alignment);
- if (usize == 0)
- return (NULL);
- ret = ipallocx(usize, alignment, zero, try_tcache_alloc,
- arena);
- if (ret == NULL)
- return (NULL);
- }
- /*
- * Copy at most size bytes (not size+extra), since the caller
- * has no expectation that the extra bytes will be reliably
- * preserved.
- */
- copysize = (size < oldsize) ? size : oldsize;
- memcpy(ret, ptr, copysize);
- iqallocx(ptr, try_tcache_dalloc);
- return (ret);
+ return (iralloct_realign(ptr, oldsize, size, extra, alignment,
+ zero, try_tcache_alloc, try_tcache_dalloc, arena));
}
- if (no_move) {
- if (size <= arena_maxclass) {
- return (arena_ralloc_no_move(ptr, oldsize, size,
- extra, zero));
- } else {
- return (huge_ralloc_no_move(ptr, oldsize, size,
- extra));
- }
+ if (size + extra <= arena_maxclass) {
+ return (arena_ralloc(arena, ptr, oldsize, size, extra,
+ alignment, zero, try_tcache_alloc,
+ try_tcache_dalloc));
} else {
- if (size + extra <= arena_maxclass) {
- return (arena_ralloc(arena, ptr, oldsize, size, extra,
- alignment, zero, try_tcache_alloc,
- try_tcache_dalloc));
- } else {
- return (huge_ralloc(ptr, oldsize, size, extra,
- alignment, zero, try_tcache_dalloc));
- }
+ return (huge_ralloc(ptr, oldsize, size, extra,
+ alignment, zero, try_tcache_dalloc, huge_dss_prec_get(arena)));
}
}
JEMALLOC_ALWAYS_INLINE void *
-iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero,
- bool no_move)
+iralloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero)
{
- return (irallocx(ptr, size, extra, alignment, zero, no_move, true, true,
- NULL));
+ return (iralloct(ptr, size, extra, alignment, zero, true, true, NULL));
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+ixalloc(void *ptr, size_t size, size_t extra, size_t alignment, bool zero)
+{
+ size_t oldsize;
+
+ assert(ptr != NULL);
+ assert(size != 0);
+
+ oldsize = isalloc(ptr, config_prof);
+ if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
+ != 0) {
+ /* Existing object alignment is inadequate. */
+ return (true);
+ }
+
+ if (size <= arena_maxclass)
+ return (arena_ralloc_no_move(ptr, oldsize, size, extra, zero));
+ else
+ return (huge_ralloc_no_move(ptr, oldsize, size, extra));
}
malloc_tsd_externs(thread_allocated, thread_allocated_t)
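The new ixalloc() is the internal engine of the public xallocx(): it only ever resizes in place, returning true on failure rather than moving the allocation (contrast iralloct(), which may allocate, copy, and free). A sketch of the corresponding public-API contract, assuming the 3.5+ *allocx functions:

    #include <jemalloc/jemalloc.h>

    /* Try to grow ptr to at least size bytes without moving it; xallocx()
     * returns the resulting usable size, unchanged if nothing was possible.
     * Returns nonzero on success. */
    static int
    grow_in_place(void *ptr, size_t size)
    {
        return (xallocx(ptr, size, 0, 0) >= size);
    }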
diff --git a/dep/jemalloc/include/jemalloc/internal/private_namespace.h b/dep/jemalloc/include/jemalloc/internal/private_namespace.h
index 65de3163fd3..35c3b0c6c74 100644
--- a/dep/jemalloc/include/jemalloc/internal/private_namespace.h
+++ b/dep/jemalloc/include/jemalloc/internal/private_namespace.h
@@ -8,6 +8,7 @@
#define arena_dalloc JEMALLOC_N(arena_dalloc)
#define arena_dalloc_bin JEMALLOC_N(arena_dalloc_bin)
#define arena_dalloc_bin_locked JEMALLOC_N(arena_dalloc_bin_locked)
+#define arena_dalloc_junk_large JEMALLOC_N(arena_dalloc_junk_large)
#define arena_dalloc_junk_small JEMALLOC_N(arena_dalloc_junk_small)
#define arena_dalloc_large JEMALLOC_N(arena_dalloc_large)
#define arena_dalloc_large_locked JEMALLOC_N(arena_dalloc_large_locked)
@@ -33,6 +34,8 @@
#define arena_mapbits_unzeroed_get JEMALLOC_N(arena_mapbits_unzeroed_get)
#define arena_mapbits_unzeroed_set JEMALLOC_N(arena_mapbits_unzeroed_set)
#define arena_mapbitsp_get JEMALLOC_N(arena_mapbitsp_get)
+#define arena_mapbitsp_read JEMALLOC_N(arena_mapbitsp_read)
+#define arena_mapbitsp_write JEMALLOC_N(arena_mapbitsp_write)
#define arena_mapp_get JEMALLOC_N(arena_mapp_get)
#define arena_maxclass JEMALLOC_N(arena_maxclass)
#define arena_new JEMALLOC_N(arena_new)
@@ -48,8 +51,11 @@
#define arena_prof_promoted JEMALLOC_N(arena_prof_promoted)
#define arena_ptr_small_binind_get JEMALLOC_N(arena_ptr_small_binind_get)
#define arena_purge_all JEMALLOC_N(arena_purge_all)
+#define arena_quarantine_junk_small JEMALLOC_N(arena_quarantine_junk_small)
#define arena_ralloc JEMALLOC_N(arena_ralloc)
+#define arena_ralloc_junk_large JEMALLOC_N(arena_ralloc_junk_large)
#define arena_ralloc_no_move JEMALLOC_N(arena_ralloc_no_move)
+#define arena_redzone_corruption JEMALLOC_N(arena_redzone_corruption)
#define arena_run_regind JEMALLOC_N(arena_run_regind)
#define arena_salloc JEMALLOC_N(arena_salloc)
#define arena_stats_merge JEMALLOC_N(arena_stats_merge)
@@ -66,6 +72,7 @@
#define arenas_tsd_cleanup_wrapper JEMALLOC_N(arenas_tsd_cleanup_wrapper)
#define arenas_tsd_get JEMALLOC_N(arenas_tsd_get)
#define arenas_tsd_get_wrapper JEMALLOC_N(arenas_tsd_get_wrapper)
+#define arenas_tsd_init_head JEMALLOC_N(arenas_tsd_init_head)
#define arenas_tsd_set JEMALLOC_N(arenas_tsd_set)
#define atomic_add_u JEMALLOC_N(atomic_add_u)
#define atomic_add_uint32 JEMALLOC_N(atomic_add_uint32)
@@ -189,6 +196,8 @@
#define huge_allocated JEMALLOC_N(huge_allocated)
#define huge_boot JEMALLOC_N(huge_boot)
#define huge_dalloc JEMALLOC_N(huge_dalloc)
+#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk)
+#define huge_dss_prec_get JEMALLOC_N(huge_dss_prec_get)
#define huge_malloc JEMALLOC_N(huge_malloc)
#define huge_mtx JEMALLOC_N(huge_mtx)
#define huge_ndalloc JEMALLOC_N(huge_ndalloc)
@@ -204,20 +213,22 @@
#define huge_salloc JEMALLOC_N(huge_salloc)
#define iallocm JEMALLOC_N(iallocm)
#define icalloc JEMALLOC_N(icalloc)
-#define icallocx JEMALLOC_N(icallocx)
+#define icalloct JEMALLOC_N(icalloct)
#define idalloc JEMALLOC_N(idalloc)
-#define idallocx JEMALLOC_N(idallocx)
+#define idalloct JEMALLOC_N(idalloct)
#define imalloc JEMALLOC_N(imalloc)
-#define imallocx JEMALLOC_N(imallocx)
+#define imalloct JEMALLOC_N(imalloct)
#define ipalloc JEMALLOC_N(ipalloc)
-#define ipallocx JEMALLOC_N(ipallocx)
+#define ipalloct JEMALLOC_N(ipalloct)
#define iqalloc JEMALLOC_N(iqalloc)
-#define iqallocx JEMALLOC_N(iqallocx)
+#define iqalloct JEMALLOC_N(iqalloct)
#define iralloc JEMALLOC_N(iralloc)
-#define irallocx JEMALLOC_N(irallocx)
+#define iralloct JEMALLOC_N(iralloct)
+#define iralloct_realign JEMALLOC_N(iralloct_realign)
#define isalloc JEMALLOC_N(isalloc)
#define isthreaded JEMALLOC_N(isthreaded)
#define ivsalloc JEMALLOC_N(ivsalloc)
+#define ixalloc JEMALLOC_N(ixalloc)
#define jemalloc_postfork_child JEMALLOC_N(jemalloc_postfork_child)
#define jemalloc_postfork_parent JEMALLOC_N(jemalloc_postfork_parent)
#define jemalloc_prefork JEMALLOC_N(jemalloc_prefork)
@@ -248,6 +259,7 @@
#define ncpus JEMALLOC_N(ncpus)
#define nhbins JEMALLOC_N(nhbins)
#define opt_abort JEMALLOC_N(opt_abort)
+#define opt_dss JEMALLOC_N(opt_dss)
#define opt_junk JEMALLOC_N(opt_junk)
#define opt_lg_chunk JEMALLOC_N(opt_lg_chunk)
#define opt_lg_dirty_mult JEMALLOC_N(opt_lg_dirty_mult)
@@ -277,8 +289,10 @@
#define prof_boot0 JEMALLOC_N(prof_boot0)
#define prof_boot1 JEMALLOC_N(prof_boot1)
#define prof_boot2 JEMALLOC_N(prof_boot2)
+#define prof_bt_count JEMALLOC_N(prof_bt_count)
#define prof_ctx_get JEMALLOC_N(prof_ctx_get)
#define prof_ctx_set JEMALLOC_N(prof_ctx_set)
+#define prof_dump_open JEMALLOC_N(prof_dump_open)
#define prof_free JEMALLOC_N(prof_free)
#define prof_gdump JEMALLOC_N(prof_gdump)
#define prof_idump JEMALLOC_N(prof_idump)
@@ -304,6 +318,7 @@
#define prof_tdata_tsd_cleanup_wrapper JEMALLOC_N(prof_tdata_tsd_cleanup_wrapper)
#define prof_tdata_tsd_get JEMALLOC_N(prof_tdata_tsd_get)
#define prof_tdata_tsd_get_wrapper JEMALLOC_N(prof_tdata_tsd_get_wrapper)
+#define prof_tdata_tsd_init_head JEMALLOC_N(prof_tdata_tsd_init_head)
#define prof_tdata_tsd_set JEMALLOC_N(prof_tdata_tsd_set)
#define quarantine JEMALLOC_N(quarantine)
#define quarantine_alloc_hook JEMALLOC_N(quarantine_alloc_hook)
@@ -317,8 +332,10 @@
#define quarantine_tsd_cleanup_wrapper JEMALLOC_N(quarantine_tsd_cleanup_wrapper)
#define quarantine_tsd_get JEMALLOC_N(quarantine_tsd_get)
#define quarantine_tsd_get_wrapper JEMALLOC_N(quarantine_tsd_get_wrapper)
+#define quarantine_tsd_init_head JEMALLOC_N(quarantine_tsd_init_head)
#define quarantine_tsd_set JEMALLOC_N(quarantine_tsd_set)
#define register_zone JEMALLOC_N(register_zone)
+#define rtree_delete JEMALLOC_N(rtree_delete)
#define rtree_get JEMALLOC_N(rtree_get)
#define rtree_get_locked JEMALLOC_N(rtree_get_locked)
#define rtree_new JEMALLOC_N(rtree_new)
@@ -329,6 +346,7 @@
#define s2u JEMALLOC_N(s2u)
#define sa2u JEMALLOC_N(sa2u)
#define set_errno JEMALLOC_N(set_errno)
+#define small_size2bin JEMALLOC_N(small_size2bin)
#define stats_cactive JEMALLOC_N(stats_cactive)
#define stats_cactive_add JEMALLOC_N(stats_cactive_add)
#define stats_cactive_get JEMALLOC_N(stats_cactive_get)
@@ -361,6 +379,7 @@
#define tcache_enabled_tsd_cleanup_wrapper JEMALLOC_N(tcache_enabled_tsd_cleanup_wrapper)
#define tcache_enabled_tsd_get JEMALLOC_N(tcache_enabled_tsd_get)
#define tcache_enabled_tsd_get_wrapper JEMALLOC_N(tcache_enabled_tsd_get_wrapper)
+#define tcache_enabled_tsd_init_head JEMALLOC_N(tcache_enabled_tsd_init_head)
#define tcache_enabled_tsd_set JEMALLOC_N(tcache_enabled_tsd_set)
#define tcache_event JEMALLOC_N(tcache_event)
#define tcache_event_hard JEMALLOC_N(tcache_event_hard)
@@ -377,6 +396,7 @@
#define tcache_tsd_cleanup_wrapper JEMALLOC_N(tcache_tsd_cleanup_wrapper)
#define tcache_tsd_get JEMALLOC_N(tcache_tsd_get)
#define tcache_tsd_get_wrapper JEMALLOC_N(tcache_tsd_get_wrapper)
+#define tcache_tsd_init_head JEMALLOC_N(tcache_tsd_init_head)
#define tcache_tsd_set JEMALLOC_N(tcache_tsd_set)
#define thread_allocated_booted JEMALLOC_N(thread_allocated_booted)
#define thread_allocated_initialized JEMALLOC_N(thread_allocated_initialized)
@@ -386,5 +406,8 @@
#define thread_allocated_tsd_cleanup_wrapper JEMALLOC_N(thread_allocated_tsd_cleanup_wrapper)
#define thread_allocated_tsd_get JEMALLOC_N(thread_allocated_tsd_get)
#define thread_allocated_tsd_get_wrapper JEMALLOC_N(thread_allocated_tsd_get_wrapper)
+#define thread_allocated_tsd_init_head JEMALLOC_N(thread_allocated_tsd_init_head)
#define thread_allocated_tsd_set JEMALLOC_N(thread_allocated_tsd_set)
+#define tsd_init_check_recursion JEMALLOC_N(tsd_init_check_recursion)
+#define tsd_init_finish JEMALLOC_N(tsd_init_finish)
#define u2rz JEMALLOC_N(u2rz)
diff --git a/dep/jemalloc/include/jemalloc/internal/prng.h b/dep/jemalloc/include/jemalloc/internal/prng.h
index 83a5462b4dd..7b2b06512ff 100644
--- a/dep/jemalloc/include/jemalloc/internal/prng.h
+++ b/dep/jemalloc/include/jemalloc/internal/prng.h
@@ -25,7 +25,7 @@
* uint32_t state : Seed value.
* const uint32_t a, c : See above discussion.
*/
-#define prng32(r, lg_range, state, a, c) do { \
+#define prng32(r, lg_range, state, a, c) do { \
assert(lg_range > 0); \
assert(lg_range <= 32); \
\
@@ -35,7 +35,7 @@
} while (false)
/* Same as prng32(), but 64 bits of pseudo-randomness, using uint64_t. */
-#define prng64(r, lg_range, state, a, c) do { \
+#define prng64(r, lg_range, state, a, c) do { \
assert(lg_range > 0); \
assert(lg_range <= 64); \
\
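prng32()/prng64() are ordinary linear congruential generators: the state advances as state = a*state + c (mod 2^32 or 2^64), and since an LCG's low bits cycle with short periods, the result keeps only the top lg_range bits. A standalone sketch of one prng64() step, using the multiplier and increment that prof.h passes in:

    #include <assert.h>
    #include <stdint.h>

    static uint64_t
    prng64_step(uint64_t *state, unsigned lg_range)
    {
        assert(lg_range > 0 && lg_range <= 64);
        *state = *state * UINT64_C(6364136223846793005) +
            UINT64_C(1442695040888963407);
        return (*state >> (64 - lg_range));   /* keep the strongest bits */
    }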
diff --git a/dep/jemalloc/include/jemalloc/internal/prof.h b/dep/jemalloc/include/jemalloc/internal/prof.h
index 119a5b1bcb7..6f162d21e84 100644
--- a/dep/jemalloc/include/jemalloc/internal/prof.h
+++ b/dep/jemalloc/include/jemalloc/internal/prof.h
@@ -8,7 +8,11 @@ typedef struct prof_ctx_s prof_ctx_t;
typedef struct prof_tdata_s prof_tdata_t;
/* Option defaults. */
-#define PROF_PREFIX_DEFAULT "jeprof"
+#ifdef JEMALLOC_PROF
+# define PROF_PREFIX_DEFAULT "jeprof"
+#else
+# define PROF_PREFIX_DEFAULT ""
+#endif
#define LG_PROF_SAMPLE_DEFAULT 19
#define LG_PROF_INTERVAL_DEFAULT -1
@@ -129,6 +133,7 @@ struct prof_ctx_s {
* limbo due to one of:
* - Initializing per thread counters associated with this ctx.
* - Preparing to destroy this ctx.
+ * - Dumping a heap profile that includes this ctx.
* nlimbo must be 1 (single destroyer) in order to safely destroy the
* ctx.
*/
@@ -145,7 +150,11 @@ struct prof_ctx_s {
* this context.
*/
ql_head(prof_thr_cnt_t) cnts_ql;
+
+ /* Linkage for list of contexts to be dumped. */
+ ql_elm(prof_ctx_t) dump_link;
};
+typedef ql_head(prof_ctx_t) prof_ctx_list_t;
struct prof_tdata_s {
/*
@@ -195,7 +204,12 @@ extern bool opt_prof_gdump; /* High-water memory dumping. */
extern bool opt_prof_final; /* Final profile dumping. */
extern bool opt_prof_leak; /* Dump leak summary at exit. */
extern bool opt_prof_accum; /* Report cumulative bytes. */
-extern char opt_prof_prefix[PATH_MAX + 1];
+extern char opt_prof_prefix[
+ /* Minimize memory bloat for non-prof builds. */
+#ifdef JEMALLOC_PROF
+ PATH_MAX +
+#endif
+ 1];
/*
* Profile dump interval, measured in bytes allocated. Each arena triggers a
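The opt_prof_prefix declaration above is a small preprocessor trick: with JEMALLOC_PROF defined the array is PATH_MAX + 1 bytes, while without it the #ifdef drops the "PATH_MAX +" term and the very same declaration collapses to a one-byte array. An equivalent way to write it without the mid-declaration #ifdef:

    #ifdef JEMALLOC_PROF
    #  define PROF_PREFIX_BUFSIZE (PATH_MAX + 1)  /* full pathname buffer */
    #else
    #  define PROF_PREFIX_BUFSIZE 1               /* minimize memory bloat */
    #endif
    extern char opt_prof_prefix[PROF_PREFIX_BUFSIZE];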
@@ -215,6 +229,11 @@ extern bool prof_promote;
void bt_init(prof_bt_t *bt, void **vec);
void prof_backtrace(prof_bt_t *bt, unsigned nignore);
prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
+#ifdef JEMALLOC_JET
+size_t prof_bt_count(void);
+typedef int (prof_dump_open_t)(bool, const char *);
+extern prof_dump_open_t *prof_dump_open;
+#endif
void prof_idump(void);
bool prof_mdump(const char *filename);
void prof_gdump(void);
@@ -289,11 +308,11 @@ malloc_tsd_protos(JEMALLOC_ATTR(unused), prof_tdata, prof_tdata_t *)
prof_tdata_t *prof_tdata_get(bool create);
void prof_sample_threshold_update(prof_tdata_t *prof_tdata);
prof_ctx_t *prof_ctx_get(const void *ptr);
-void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
+void prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx);
bool prof_sample_accum_update(size_t size);
-void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
-void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
- size_t old_size, prof_ctx_t *old_ctx);
+void prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt);
+void prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
+ size_t old_usize, prof_ctx_t *old_ctx);
void prof_free(const void *ptr, size_t size);
#endif
@@ -320,6 +339,20 @@ prof_tdata_get(bool create)
JEMALLOC_INLINE void
prof_sample_threshold_update(prof_tdata_t *prof_tdata)
{
+ /*
+ * The body of this function is compiled out unless heap profiling is
+ * enabled, so that it is possible to compile jemalloc with floating
+ * point support completely disabled. Avoiding floating point code is
+ * important on memory-constrained systems, but it also enables a
+ * workaround for versions of glibc that don't properly save/restore
+ * floating point registers during dynamic lazy symbol loading (which
+ * internally calls into whatever malloc implementation happens to be
+ * integrated into the application). Note that some compilers (e.g.
+ * gcc 4.8) may use floating point registers for fast memory moves, so
+ * jemalloc must be compiled with such optimizations disabled (e.g.
+ * -mno-sse) in order for the workaround to be complete.
+ */
+#ifdef JEMALLOC_PROF
uint64_t r;
double u;
@@ -341,7 +374,7 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata)
* Luc Devroye
* Springer-Verlag, New York, 1986
* pp 500
- * (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
+ * (http://luc.devroye.org/rnbookindex.html)
*/
prng64(r, 53, prof_tdata->prng_state,
UINT64_C(6364136223846793005), UINT64_C(1442695040888963407));
@@ -349,6 +382,7 @@ prof_sample_threshold_update(prof_tdata_t *prof_tdata)
prof_tdata->threshold = (uint64_t)(log(u) /
log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample))))
+ (uint64_t)1U;
+#endif
}
JEMALLOC_INLINE prof_ctx_t *
@@ -371,7 +405,7 @@ prof_ctx_get(const void *ptr)
}
JEMALLOC_INLINE void
-prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
+prof_ctx_set(const void *ptr, size_t usize, prof_ctx_t *ctx)
{
arena_chunk_t *chunk;
@@ -381,7 +415,7 @@ prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
if (chunk != ptr) {
/* Region. */
- arena_prof_ctx_set(ptr, ctx);
+ arena_prof_ctx_set(ptr, usize, ctx);
} else
huge_prof_ctx_set(ptr, ctx);
}
@@ -416,20 +450,20 @@ prof_sample_accum_update(size_t size)
}
JEMALLOC_INLINE void
-prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
+prof_malloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt)
{
cassert(config_prof);
assert(ptr != NULL);
- assert(size == isalloc(ptr, true));
+ assert(usize == isalloc(ptr, true));
if (opt_lg_prof_sample != 0) {
- if (prof_sample_accum_update(size)) {
+ if (prof_sample_accum_update(usize)) {
/*
* Don't sample. For malloc()-like allocation, it is
* always possible to tell in advance how large an
* object's usable size will be, so there should never
- * be a difference between the size passed to
+ * be a difference between the usize passed to
* PROF_ALLOC_PREP() and prof_malloc().
*/
assert((uintptr_t)cnt == (uintptr_t)1U);
@@ -437,17 +471,17 @@ prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
}
if ((uintptr_t)cnt > (uintptr_t)1U) {
- prof_ctx_set(ptr, cnt->ctx);
+ prof_ctx_set(ptr, usize, cnt->ctx);
cnt->epoch++;
/*********/
mb_write();
/*********/
cnt->cnts.curobjs++;
- cnt->cnts.curbytes += size;
+ cnt->cnts.curbytes += usize;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
- cnt->cnts.accumbytes += size;
+ cnt->cnts.accumbytes += usize;
}
/*********/
mb_write();
@@ -457,12 +491,12 @@ prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
mb_write();
/*********/
} else
- prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
+ prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
}
JEMALLOC_INLINE void
-prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
- size_t old_size, prof_ctx_t *old_ctx)
+prof_realloc(const void *ptr, size_t usize, prof_thr_cnt_t *cnt,
+ size_t old_usize, prof_ctx_t *old_ctx)
{
prof_thr_cnt_t *told_cnt;
@@ -470,15 +504,15 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);
if (ptr != NULL) {
- assert(size == isalloc(ptr, true));
+ assert(usize == isalloc(ptr, true));
if (opt_lg_prof_sample != 0) {
- if (prof_sample_accum_update(size)) {
+ if (prof_sample_accum_update(usize)) {
/*
- * Don't sample. The size passed to
+ * Don't sample. The usize passed to
* PROF_ALLOC_PREP() was larger than what
* actually got allocated, so a backtrace was
* captured for this allocation, even though
- * its actual size was insufficient to cross
+ * its actual usize was insufficient to cross
* the sample threshold.
*/
cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
@@ -495,7 +529,7 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
*/
malloc_mutex_lock(old_ctx->lock);
old_ctx->cnt_merged.curobjs--;
- old_ctx->cnt_merged.curbytes -= old_size;
+ old_ctx->cnt_merged.curbytes -= old_usize;
malloc_mutex_unlock(old_ctx->lock);
told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
}
@@ -505,23 +539,23 @@ prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
if ((uintptr_t)told_cnt > (uintptr_t)1U)
told_cnt->epoch++;
if ((uintptr_t)cnt > (uintptr_t)1U) {
- prof_ctx_set(ptr, cnt->ctx);
+ prof_ctx_set(ptr, usize, cnt->ctx);
cnt->epoch++;
} else if (ptr != NULL)
- prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
+ prof_ctx_set(ptr, usize, (prof_ctx_t *)(uintptr_t)1U);
/*********/
mb_write();
/*********/
if ((uintptr_t)told_cnt > (uintptr_t)1U) {
told_cnt->cnts.curobjs--;
- told_cnt->cnts.curbytes -= old_size;
+ told_cnt->cnts.curbytes -= old_usize;
}
if ((uintptr_t)cnt > (uintptr_t)1U) {
cnt->cnts.curobjs++;
- cnt->cnts.curbytes += size;
+ cnt->cnts.curbytes += usize;
if (opt_prof_accum) {
cnt->cnts.accumobjs++;
- cnt->cnts.accumbytes += size;
+ cnt->cnts.accumbytes += usize;
}
}
/*********/
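
[Note] The sampling math shown in prof_sample_threshold_update() draws a geometrically distributed byte threshold whose mean is about 2^opt_lg_prof_sample (512 KiB at the default of 19, per LG_PROF_SAMPLE_DEFAULT above). A self-contained sketch of one draw, with the prng expanded inline; all names are local, u is assumed nonzero for the given seed, and the program needs -lm.

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t state = 0x9e3779b97f4a7c15ULL;	/* arbitrary seed */
	unsigned lg_sample = 19;		/* LG_PROF_SAMPLE_DEFAULT */
	uint64_t r, threshold;
	double u;

	/* prng64(r, 53, state, a, c), expanded: */
	state = (state * 6364136223846793005ULL) + 1442695040888963407ULL;
	r = state >> (64 - 53);
	u = (double)r * (1.0 / 9007199254740992.0);	/* 2^53; u in [0, 1) */

	/* Geometric draw; mean is ~2^lg_sample bytes of allocation. */
	threshold = (uint64_t)(log(u) /
	    log(1.0 - (1.0 / (double)((uint64_t)1U << lg_sample)))) + 1;
	printf("next sample after %llu bytes\n", (unsigned long long)threshold);
	return (0);
}
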
diff --git a/dep/jemalloc/include/jemalloc/internal/ql.h b/dep/jemalloc/include/jemalloc/internal/ql.h
index a9ed2393f0c..f70c5f6f391 100644
--- a/dep/jemalloc/include/jemalloc/internal/ql.h
+++ b/dep/jemalloc/include/jemalloc/internal/ql.h
@@ -1,61 +1,61 @@
/*
* List definitions.
*/
-#define ql_head(a_type) \
+#define ql_head(a_type) \
struct { \
a_type *qlh_first; \
}
-#define ql_head_initializer(a_head) {NULL}
+#define ql_head_initializer(a_head) {NULL}
-#define ql_elm(a_type) qr(a_type)
+#define ql_elm(a_type) qr(a_type)
/* List functions. */
-#define ql_new(a_head) do { \
+#define ql_new(a_head) do { \
(a_head)->qlh_first = NULL; \
} while (0)
-#define ql_elm_new(a_elm, a_field) qr_new((a_elm), a_field)
+#define ql_elm_new(a_elm, a_field) qr_new((a_elm), a_field)
-#define ql_first(a_head) ((a_head)->qlh_first)
+#define ql_first(a_head) ((a_head)->qlh_first)
-#define ql_last(a_head, a_field) \
+#define ql_last(a_head, a_field) \
((ql_first(a_head) != NULL) \
? qr_prev(ql_first(a_head), a_field) : NULL)
-#define ql_next(a_head, a_elm, a_field) \
+#define ql_next(a_head, a_elm, a_field) \
((ql_last(a_head, a_field) != (a_elm)) \
? qr_next((a_elm), a_field) : NULL)
-#define ql_prev(a_head, a_elm, a_field) \
+#define ql_prev(a_head, a_elm, a_field) \
((ql_first(a_head) != (a_elm)) ? qr_prev((a_elm), a_field) \
: NULL)
-#define ql_before_insert(a_head, a_qlelm, a_elm, a_field) do { \
+#define ql_before_insert(a_head, a_qlelm, a_elm, a_field) do { \
qr_before_insert((a_qlelm), (a_elm), a_field); \
if (ql_first(a_head) == (a_qlelm)) { \
ql_first(a_head) = (a_elm); \
} \
} while (0)
-#define ql_after_insert(a_qlelm, a_elm, a_field) \
+#define ql_after_insert(a_qlelm, a_elm, a_field) \
qr_after_insert((a_qlelm), (a_elm), a_field)
-#define ql_head_insert(a_head, a_elm, a_field) do { \
+#define ql_head_insert(a_head, a_elm, a_field) do { \
if (ql_first(a_head) != NULL) { \
qr_before_insert(ql_first(a_head), (a_elm), a_field); \
} \
ql_first(a_head) = (a_elm); \
} while (0)
-#define ql_tail_insert(a_head, a_elm, a_field) do { \
+#define ql_tail_insert(a_head, a_elm, a_field) do { \
if (ql_first(a_head) != NULL) { \
qr_before_insert(ql_first(a_head), (a_elm), a_field); \
} \
ql_first(a_head) = qr_next((a_elm), a_field); \
} while (0)
-#define ql_remove(a_head, a_elm, a_field) do { \
+#define ql_remove(a_head, a_elm, a_field) do { \
if (ql_first(a_head) == (a_elm)) { \
ql_first(a_head) = qr_next(ql_first(a_head), a_field); \
} \
@@ -66,18 +66,18 @@ struct { \
} \
} while (0)
-#define ql_head_remove(a_head, a_type, a_field) do { \
+#define ql_head_remove(a_head, a_type, a_field) do { \
a_type *t = ql_first(a_head); \
ql_remove((a_head), t, a_field); \
} while (0)
-#define ql_tail_remove(a_head, a_type, a_field) do { \
+#define ql_tail_remove(a_head, a_type, a_field) do { \
a_type *t = ql_last(a_head, a_field); \
ql_remove((a_head), t, a_field); \
} while (0)
-#define ql_foreach(a_var, a_head, a_field) \
+#define ql_foreach(a_var, a_head, a_field) \
qr_foreach((a_var), ql_first(a_head), a_field)
-#define ql_reverse_foreach(a_var, a_head, a_field) \
+#define ql_reverse_foreach(a_var, a_head, a_field) \
qr_reverse_foreach((a_var), ql_first(a_head), a_field)
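
[Note] The ql_* macros define an intrusive circular-list API on top of qr.h; prof.h above uses them for prof_ctx_list_t and the new dump_link field. A minimal usage sketch, assuming qr.h and ql.h are available on the include path:

#include <stdio.h>
#include "qr.h"		/* ql.h builds on the ring macros */
#include "ql.h"

typedef struct widget_s widget_t;
struct widget_s {
	int	id;
	ql_elm(widget_t) link;	/* embedded linkage, as with dump_link */
};

int
main(void)
{
	ql_head(widget_t) head;
	widget_t a, b, *w;

	a.id = 1;
	b.id = 2;
	ql_new(&head);
	ql_elm_new(&a, link);
	ql_elm_new(&b, link);
	ql_tail_insert(&head, &a, link);
	ql_tail_insert(&head, &b, link);
	ql_foreach(w, &head, link)
		printf("widget %d\n", w->id);	/* prints 1 then 2 */
	return (0);
}
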
diff --git a/dep/jemalloc/include/jemalloc/internal/qr.h b/dep/jemalloc/include/jemalloc/internal/qr.h
index fe22352fedd..602944b9b4f 100644
--- a/dep/jemalloc/include/jemalloc/internal/qr.h
+++ b/dep/jemalloc/include/jemalloc/internal/qr.h
@@ -1,28 +1,28 @@
/* Ring definitions. */
-#define qr(a_type) \
+#define qr(a_type) \
struct { \
a_type *qre_next; \
a_type *qre_prev; \
}
/* Ring functions. */
-#define qr_new(a_qr, a_field) do { \
+#define qr_new(a_qr, a_field) do { \
(a_qr)->a_field.qre_next = (a_qr); \
(a_qr)->a_field.qre_prev = (a_qr); \
} while (0)
-#define qr_next(a_qr, a_field) ((a_qr)->a_field.qre_next)
+#define qr_next(a_qr, a_field) ((a_qr)->a_field.qre_next)
-#define qr_prev(a_qr, a_field) ((a_qr)->a_field.qre_prev)
+#define qr_prev(a_qr, a_field) ((a_qr)->a_field.qre_prev)
-#define qr_before_insert(a_qrelm, a_qr, a_field) do { \
+#define qr_before_insert(a_qrelm, a_qr, a_field) do { \
(a_qr)->a_field.qre_prev = (a_qrelm)->a_field.qre_prev; \
(a_qr)->a_field.qre_next = (a_qrelm); \
(a_qr)->a_field.qre_prev->a_field.qre_next = (a_qr); \
(a_qrelm)->a_field.qre_prev = (a_qr); \
} while (0)
-#define qr_after_insert(a_qrelm, a_qr, a_field) \
+#define qr_after_insert(a_qrelm, a_qr, a_field) \
do \
{ \
(a_qr)->a_field.qre_next = (a_qrelm)->a_field.qre_next; \
@@ -31,7 +31,7 @@ struct { \
(a_qrelm)->a_field.qre_next = (a_qr); \
} while (0)
-#define qr_meld(a_qr_a, a_qr_b, a_field) do { \
+#define qr_meld(a_qr_a, a_qr_b, a_field) do { \
void *t; \
(a_qr_a)->a_field.qre_prev->a_field.qre_next = (a_qr_b); \
(a_qr_b)->a_field.qre_prev->a_field.qre_next = (a_qr_a); \
@@ -42,10 +42,10 @@ struct { \
/*
 * qr_meld() and qr_split() are functionally equivalent, so there's no need to
 * have two copies of the code.
 */
-#define qr_split(a_qr_a, a_qr_b, a_field) \
+#define qr_split(a_qr_a, a_qr_b, a_field) \
qr_meld((a_qr_a), (a_qr_b), a_field)
-#define qr_remove(a_qr, a_field) do { \
+#define qr_remove(a_qr, a_field) do { \
(a_qr)->a_field.qre_prev->a_field.qre_next \
= (a_qr)->a_field.qre_next; \
(a_qr)->a_field.qre_next->a_field.qre_prev \
@@ -54,13 +54,13 @@ struct { \
(a_qr)->a_field.qre_prev = (a_qr); \
} while (0)
-#define qr_foreach(var, a_qr, a_field) \
+#define qr_foreach(var, a_qr, a_field) \
for ((var) = (a_qr); \
(var) != NULL; \
(var) = (((var)->a_field.qre_next != (a_qr)) \
? (var)->a_field.qre_next : NULL))
-#define qr_reverse_foreach(var, a_qr, a_field) \
+#define qr_reverse_foreach(var, a_qr, a_field) \
for ((var) = ((a_qr) != NULL) ? qr_prev(a_qr, a_field) : NULL; \
(var) != NULL; \
(var) = (((var) != (a_qr)) \
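
[Note] The qr_split()/qr_meld() aliasing noted above works because the same four pointer swaps both join two rings and cut one apart. A small demonstration, assuming qr.h is on the include path:

#include <stdio.h>
#include "qr.h"

typedef struct node_s node_t;
struct node_s {
	char	name;
	qr(node_t) link;
};

int
main(void)
{
	node_t a, b, *n;

	a.name = 'a';
	b.name = 'b';
	qr_new(&a, link);
	qr_new(&b, link);
	qr_meld(&a, &b, link);		/* one ring: a <-> b */
	qr_foreach(n, &a, link)
		printf("%c ", n->name);	/* a b */
	qr_split(&a, &b, link);		/* identical swaps; two rings again */
	qr_foreach(n, &a, link)
		printf("%c ", n->name);	/* a */
	printf("\n");
	return (0);
}
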
diff --git a/dep/jemalloc/include/jemalloc/internal/rb.h b/dep/jemalloc/include/jemalloc/internal/rb.h
index 7b675f09051..423802eb2dc 100644
--- a/dep/jemalloc/include/jemalloc/internal/rb.h
+++ b/dep/jemalloc/include/jemalloc/internal/rb.h
@@ -22,10 +22,6 @@
#ifndef RB_H_
#define RB_H_
-#if 0
-__FBSDID("$FreeBSD: head/lib/libc/stdlib/rb.h 204493 2010-02-28 22:57:13Z jasone $");
-#endif
-
#ifdef RB_COMPACT
/* Node structure. */
#define rb_node(a_type) \
diff --git a/dep/jemalloc/include/jemalloc/internal/rtree.h b/dep/jemalloc/include/jemalloc/internal/rtree.h
index 9bd98548cfe..bc74769f50e 100644
--- a/dep/jemalloc/include/jemalloc/internal/rtree.h
+++ b/dep/jemalloc/include/jemalloc/internal/rtree.h
@@ -14,17 +14,18 @@ typedef struct rtree_s rtree_t;
* Size of each radix tree node (must be a power of 2). This impacts tree
* depth.
*/
-#if (LG_SIZEOF_PTR == 2)
-# define RTREE_NODESIZE (1U << 14)
-#else
-# define RTREE_NODESIZE CACHELINE
-#endif
+#define RTREE_NODESIZE (1U << 16)
+
+typedef void *(rtree_alloc_t)(size_t);
+typedef void (rtree_dalloc_t)(void *);
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
struct rtree_s {
+ rtree_alloc_t *alloc;
+ rtree_dalloc_t *dalloc;
malloc_mutex_t mutex;
void **root;
unsigned height;
@@ -35,7 +36,8 @@ struct rtree_s {
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
-rtree_t *rtree_new(unsigned bits);
+rtree_t *rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc);
+void rtree_delete(rtree_t *rtree);
void rtree_prefork(rtree_t *rtree);
void rtree_postfork_parent(rtree_t *rtree);
void rtree_postfork_child(rtree_t *rtree);
@@ -45,20 +47,20 @@ void rtree_postfork_child(rtree_t *rtree);
#ifdef JEMALLOC_H_INLINES
#ifndef JEMALLOC_ENABLE_INLINE
-#ifndef JEMALLOC_DEBUG
-void *rtree_get_locked(rtree_t *rtree, uintptr_t key);
+#ifdef JEMALLOC_DEBUG
+uint8_t rtree_get_locked(rtree_t *rtree, uintptr_t key);
#endif
-void *rtree_get(rtree_t *rtree, uintptr_t key);
-bool rtree_set(rtree_t *rtree, uintptr_t key, void *val);
+uint8_t rtree_get(rtree_t *rtree, uintptr_t key);
+bool rtree_set(rtree_t *rtree, uintptr_t key, uint8_t val);
#endif
#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_RTREE_C_))
#define RTREE_GET_GENERATE(f) \
/* The least significant bits of the key are ignored. */ \
-JEMALLOC_INLINE void * \
+JEMALLOC_INLINE uint8_t \
f(rtree_t *rtree, uintptr_t key) \
{ \
- void *ret; \
+ uint8_t ret; \
uintptr_t subkey; \
unsigned i, lshift, height, bits; \
void **node, **child; \
@@ -68,12 +70,12 @@ f(rtree_t *rtree, uintptr_t key) \
i < height - 1; \
i++, lshift += bits, node = child) { \
bits = rtree->level2bits[i]; \
- subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR + \
+ subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR + \
3)) - bits); \
child = (void**)node[subkey]; \
if (child == NULL) { \
RTREE_UNLOCK(&rtree->mutex); \
- return (NULL); \
+ return (0); \
} \
} \
\
@@ -84,7 +86,10 @@ f(rtree_t *rtree, uintptr_t key) \
bits = rtree->level2bits[i]; \
subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - \
bits); \
- ret = node[subkey]; \
+ { \
+ uint8_t *leaf = (uint8_t *)node; \
+ ret = leaf[subkey]; \
+ } \
RTREE_UNLOCK(&rtree->mutex); \
\
RTREE_GET_VALIDATE \
@@ -123,7 +128,7 @@ RTREE_GET_GENERATE(rtree_get)
#undef RTREE_GET_VALIDATE
JEMALLOC_INLINE bool
-rtree_set(rtree_t *rtree, uintptr_t key, void *val)
+rtree_set(rtree_t *rtree, uintptr_t key, uint8_t val)
{
uintptr_t subkey;
unsigned i, lshift, height, bits;
@@ -138,14 +143,14 @@ rtree_set(rtree_t *rtree, uintptr_t key, void *val)
bits);
child = (void**)node[subkey];
if (child == NULL) {
- child = (void**)base_alloc(sizeof(void *) <<
- rtree->level2bits[i+1]);
+ size_t size = ((i + 1 < height - 1) ? sizeof(void *)
+ : (sizeof(uint8_t))) << rtree->level2bits[i+1];
+ child = (void**)rtree->alloc(size);
if (child == NULL) {
malloc_mutex_unlock(&rtree->mutex);
return (true);
}
- memset(child, 0, sizeof(void *) <<
- rtree->level2bits[i+1]);
+ memset(child, 0, size);
node[subkey] = child;
}
}
@@ -153,7 +158,10 @@ rtree_set(rtree_t *rtree, uintptr_t key, void *val)
/* node is a leaf, so it contains values rather than node pointers. */
bits = rtree->level2bits[i];
subkey = (key << lshift) >> ((ZU(1) << (LG_SIZEOF_PTR+3)) - bits);
- node[subkey] = val;
+ {
+ uint8_t *leaf = (uint8_t *)node;
+ leaf[subkey] = val;
+ }
malloc_mutex_unlock(&rtree->mutex);
return (false);
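
[Note] The rtree now stores one-byte values in its leaves and takes caller-supplied alloc/dalloc hooks. A sketch of the per-level subkey arithmetic used by RTREE_GET_GENERATE and rtree_set(); the level widths here are purely illustrative (the real level2bits[] is computed in rtree.c), and the low page-offset bits of a real key are ignored.

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uintptr_t key = (uintptr_t)0x7f2a4c3b2000;	/* example address */
	unsigned level2bits[3] = {16, 16, 16};		/* assumed widths */
	unsigned i, lshift = 0;

	for (i = 0; i < 3; i++) {
		unsigned bits = level2bits[i];
		/* Same shift form as the macro: next `bits` high bits. */
		uintptr_t subkey = (key << lshift) >>
		    ((sizeof(uintptr_t) * 8) - bits);
		printf("level %u index: %#jx\n", i, (uintmax_t)subkey);
		lshift += bits;
	}
	return (0);
}
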
diff --git a/dep/jemalloc/include/jemalloc/internal/tcache.h b/dep/jemalloc/include/jemalloc/internal/tcache.h
index ba36204ff21..c3d4b58d4dc 100644
--- a/dep/jemalloc/include/jemalloc/internal/tcache.h
+++ b/dep/jemalloc/include/jemalloc/internal/tcache.h
@@ -297,6 +297,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
binind = SMALL_SIZE2BIN(size);
assert(binind < NBINS);
tbin = &tcache->tbins[binind];
+ size = arena_bin_info[binind].reg_size;
ret = tcache_alloc_easy(tbin);
if (ret == NULL) {
ret = tcache_alloc_small_hard(tcache, tbin, binind);
@@ -313,6 +314,7 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
} else if (opt_zero)
memset(ret, 0, size);
}
+ VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
} else {
if (config_fill && opt_junk) {
arena_alloc_junk_small(ret, &arena_bin_info[binind],
@@ -321,7 +323,6 @@ tcache_alloc_small(tcache_t *tcache, size_t size, bool zero)
VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
memset(ret, 0, size);
}
- VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
if (config_stats)
tbin->tstats.nrequests++;
@@ -368,11 +369,11 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
else if (opt_zero)
memset(ret, 0, size);
}
+ VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
} else {
VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
memset(ret, 0, size);
}
- VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
if (config_stats)
tbin->tstats.nrequests++;
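
[Note] The tcache hunks move VALGRIND_MAKE_MEM_UNDEFINED() into the non-zeroed branch so that zero-filled allocations stay "defined" to Valgrind instead of being clobbered after the memset. A standalone sketch of the corrected ordering; the no-op fallback macro stands in for valgrind/memcheck.h, and the fill flags are simplified stand-ins for jemalloc's options.

#include <stdlib.h>
#include <string.h>

#ifndef VALGRIND_MAKE_MEM_UNDEFINED
#define VALGRIND_MAKE_MEM_UNDEFINED(p, n) ((void)(p), (void)(n))
#endif

static void *
alloc_annotated(size_t size, int zero, int opt_junk, int opt_zero)
{
	void *ret = malloc(size);

	if (ret == NULL)
		return (NULL);
	if (zero == 0) {
		if (opt_junk)
			memset(ret, 0xa5, size);	/* junk fill */
		else if (opt_zero)
			memset(ret, 0, size);
		/* Caller must treat the contents as uninitialized. */
		VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
	} else {
		/* Mark writable first; the memset then defines it as zeros. */
		VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
		memset(ret, 0, size);
	}
	return (ret);
}
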
diff --git a/dep/jemalloc/include/jemalloc/internal/tsd.h b/dep/jemalloc/include/jemalloc/internal/tsd.h
index 0037cf35e70..9fb4a23ec6b 100644
--- a/dep/jemalloc/include/jemalloc/internal/tsd.h
+++ b/dep/jemalloc/include/jemalloc/internal/tsd.h
@@ -6,6 +6,12 @@
typedef bool (*malloc_tsd_cleanup_t)(void);
+#if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \
+ !defined(_WIN32))
+typedef struct tsd_init_block_s tsd_init_block_t;
+typedef struct tsd_init_head_s tsd_init_head_t;
+#endif
+
/*
* TLS/TSD-agnostic macro-based implementation of thread-specific data. There
* are four macros that support (at least) three use cases: file-private,
@@ -75,12 +81,13 @@ extern __thread a_type a_name##_tls; \
extern pthread_key_t a_name##_tsd; \
extern bool a_name##_booted;
#elif (defined(_WIN32))
-#define malloc_tsd_externs(a_name, a_type) \
+#define malloc_tsd_externs(a_name, a_type) \
extern DWORD a_name##_tsd; \
extern bool a_name##_booted;
#else
#define malloc_tsd_externs(a_name, a_type) \
extern pthread_key_t a_name##_tsd; \
+extern tsd_init_head_t a_name##_tsd_init_head; \
extern bool a_name##_booted;
#endif
@@ -105,6 +112,10 @@ a_attr bool a_name##_booted = false;
#else
#define malloc_tsd_data(a_attr, a_name, a_type, a_initializer) \
a_attr pthread_key_t a_name##_tsd; \
+a_attr tsd_init_head_t a_name##_tsd_init_head = { \
+ ql_head_initializer(blocks), \
+ MALLOC_MUTEX_INITIALIZER \
+}; \
a_attr bool a_name##_booted = false;
#endif
@@ -333,8 +344,14 @@ a_name##_tsd_get_wrapper(void) \
pthread_getspecific(a_name##_tsd); \
\
if (wrapper == NULL) { \
+ tsd_init_block_t block; \
+ wrapper = tsd_init_check_recursion( \
+ &a_name##_tsd_init_head, &block); \
+ if (wrapper) \
+ return (wrapper); \
wrapper = (a_name##_tsd_wrapper_t *) \
malloc_tsd_malloc(sizeof(a_name##_tsd_wrapper_t)); \
+ block.data = wrapper; \
if (wrapper == NULL) { \
malloc_write("<jemalloc>: Error allocating" \
" TSD for "#a_name"\n"); \
@@ -350,6 +367,7 @@ a_name##_tsd_get_wrapper(void) \
" TSD for "#a_name"\n"); \
abort(); \
} \
+ tsd_init_finish(&a_name##_tsd_init_head, &block); \
} \
return (wrapper); \
} \
@@ -379,6 +397,19 @@ a_name##_tsd_set(a_type *val) \
/******************************************************************************/
#ifdef JEMALLOC_H_STRUCTS
+#if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \
+ !defined(_WIN32))
+struct tsd_init_block_s {
+ ql_elm(tsd_init_block_t) link;
+ pthread_t thread;
+ void *data;
+};
+struct tsd_init_head_s {
+ ql_head(tsd_init_block_t) blocks;
+ malloc_mutex_t lock;
+};
+#endif
+
#endif /* JEMALLOC_H_STRUCTS */
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
@@ -388,6 +419,12 @@ void malloc_tsd_dalloc(void *wrapper);
void malloc_tsd_no_cleanup(void *);
void malloc_tsd_cleanup_register(bool (*f)(void));
void malloc_tsd_boot(void);
+#if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \
+ !defined(_WIN32))
+void *tsd_init_check_recursion(tsd_init_head_t *head,
+ tsd_init_block_t *block);
+void tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block);
+#endif
#endif /* JEMALLOC_H_EXTERNS */
/******************************************************************************/
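
[Note] tsd_init_check_recursion()/tsd_init_finish() guard against reentrant TSD bootstrap: lazy symbol binding during pthread calls can call back into malloc before the wrapper is registered. A simplified sketch of the mechanism; the real implementation lives in src/tsd.c, and every name below is local to this note.

#include <pthread.h>
#include <stddef.h>

typedef struct init_block_s {
	struct init_block_s	*next;
	pthread_t		thread;
	void			*data;
} init_block_t;

static init_block_t *init_blocks;	/* guarded by init_lock */
static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Returns the in-flight wrapper if this thread re-entered, else registers
 * `block` as in-flight and returns NULL; the caller fills block->data once
 * the wrapper is allocated, so a recursive call can find it.
 */
static void *
init_check_recursion(init_block_t *block)
{
	pthread_t self = pthread_self();
	init_block_t *b;

	pthread_mutex_lock(&init_lock);
	for (b = init_blocks; b != NULL; b = b->next) {
		if (pthread_equal(b->thread, self)) {
			pthread_mutex_unlock(&init_lock);
			return (b->data);	/* recursive entry */
		}
	}
	block->thread = self;
	block->data = NULL;
	block->next = init_blocks;
	init_blocks = block;
	pthread_mutex_unlock(&init_lock);
	return (NULL);
}

static void
init_finish(init_block_t *block)
{
	init_block_t **bp;

	pthread_mutex_lock(&init_lock);
	for (bp = &init_blocks; *bp != NULL; bp = &(*bp)->next) {
		if (*bp == block) {
			*bp = block->next;
			break;
		}
	}
	pthread_mutex_unlock(&init_lock);
}
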
diff --git a/dep/jemalloc/include/jemalloc/internal/util.h b/dep/jemalloc/include/jemalloc/internal/util.h
index 8479693631a..6b938f74688 100644
--- a/dep/jemalloc/include/jemalloc/internal/util.h
+++ b/dep/jemalloc/include/jemalloc/internal/util.h
@@ -14,7 +14,7 @@
* Wrap a cpp argument that contains commas such that it isn't broken up into
* multiple arguments.
*/
-#define JEMALLOC_CONCAT(...) __VA_ARGS__
+#define JEMALLOC_ARG_CONCAT(...) __VA_ARGS__
/*
* Silence compiler warnings due to uninitialized values. This is used
@@ -42,12 +42,6 @@
} while (0)
#endif
-/* Use to assert a particular configuration, e.g., cassert(config_debug). */
-#define cassert(c) do { \
- if ((c) == false) \
- assert(false); \
-} while (0)
-
#ifndef not_reached
#define not_reached() do { \
if (config_debug) { \
@@ -69,10 +63,18 @@
} while (0)
#endif
+#ifndef assert_not_implemented
#define assert_not_implemented(e) do { \
if (config_debug && !(e)) \
not_implemented(); \
} while (0)
+#endif
+
+/* Use to assert a particular configuration, e.g., cassert(config_debug). */
+#define cassert(c) do { \
+ if ((c) == false) \
+ not_reached(); \
+} while (0)
#endif /* JEMALLOC_H_TYPES */
/******************************************************************************/
@@ -82,8 +84,9 @@
/******************************************************************************/
#ifdef JEMALLOC_H_EXTERNS
-int buferror(char *buf, size_t buflen);
-uintmax_t malloc_strtoumax(const char *nptr, char **endptr, int base);
+int buferror(int err, char *buf, size_t buflen);
+uintmax_t malloc_strtoumax(const char *restrict nptr,
+ char **restrict endptr, int base);
void malloc_write(const char *s);
/*
@@ -107,7 +110,6 @@ void malloc_printf(const char *format, ...)
#ifndef JEMALLOC_ENABLE_INLINE
size_t pow2_ceil(size_t x);
-void malloc_write(const char *s);
void set_errno(int errnum);
int get_errno(void);
#endif
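
[Note] buferror() now takes the saved errno value explicitly instead of reading the global at call time. A hypothetical local equivalent, simplified to strerror() rather than the platform-specific paths the real src/util.c uses:

#include <stdio.h>
#include <string.h>

static int
buferror_sketch(int err, char *buf, size_t buflen)
{
	snprintf(buf, buflen, "%s", strerror(err));	/* simplified */
	return (0);
}

int
main(void)
{
	char buf[64];

	if (buferror_sketch(2 /* ENOENT */, buf, sizeof(buf)) == 0)
		printf("%s\n", buf);
	return (0);
}
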
diff --git a/dep/jemalloc/include/jemalloc/jemalloc.h b/dep/jemalloc/include/jemalloc/jemalloc.h
index 946c73b75e5..b8ea851e525 100644
--- a/dep/jemalloc/include/jemalloc/jemalloc.h
+++ b/dep/jemalloc/include/jemalloc/jemalloc.h
@@ -7,36 +7,45 @@ extern "C" {
#include <limits.h>
#include <strings.h>
-#define JEMALLOC_VERSION "3.3.1-0-g9ef9d9e8c271cdf14f664b871a8f98c827714784"
+#define JEMALLOC_VERSION "3.6.0-0-g46c0af68bd248b04df75e4f92d5fb804c3d75340"
#define JEMALLOC_VERSION_MAJOR 3
-#define JEMALLOC_VERSION_MINOR 3
-#define JEMALLOC_VERSION_BUGFIX 1
+#define JEMALLOC_VERSION_MINOR 6
+#define JEMALLOC_VERSION_BUGFIX 0
#define JEMALLOC_VERSION_NREV 0
-#define JEMALLOC_VERSION_GID "9ef9d9e8c271cdf14f664b871a8f98c827714784"
+#define JEMALLOC_VERSION_GID "46c0af68bd248b04df75e4f92d5fb804c3d75340"
-#include "jemalloc_defs.h"
+# define MALLOCX_LG_ALIGN(la) (la)
+# if LG_SIZEOF_PTR == 2
+# define MALLOCX_ALIGN(a) (ffs(a)-1)
+# else
+# define MALLOCX_ALIGN(a) \
+ ((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31)
+# endif
+# define MALLOCX_ZERO ((int)0x40)
+/* Bias arena index bits so that 0 encodes "MALLOCX_ARENA() unspecified". */
+# define MALLOCX_ARENA(a) ((int)(((a)+1) << 8))
#ifdef JEMALLOC_EXPERIMENTAL
-#define ALLOCM_LG_ALIGN(la) (la)
-#if LG_SIZEOF_PTR == 2
-#define ALLOCM_ALIGN(a) (ffs(a)-1)
-#else
-#define ALLOCM_ALIGN(a) ((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31)
-#endif
-#define ALLOCM_ZERO ((int)0x40)
-#define ALLOCM_NO_MOVE ((int)0x80)
+# define ALLOCM_LG_ALIGN(la) (la)
+# if LG_SIZEOF_PTR == 2
+# define ALLOCM_ALIGN(a) (ffs(a)-1)
+# else
+# define ALLOCM_ALIGN(a) \
+ ((a < (size_t)INT_MAX) ? ffs(a)-1 : ffs(a>>32)+31)
+# endif
+# define ALLOCM_ZERO ((int)0x40)
+# define ALLOCM_NO_MOVE ((int)0x80)
/* Bias arena index bits so that 0 encodes "ALLOCM_ARENA() unspecified". */
-#define ALLOCM_ARENA(a) ((int)(((a)+1) << 8))
-
-#define ALLOCM_SUCCESS 0
-#define ALLOCM_ERR_OOM 1
-#define ALLOCM_ERR_NOT_MOVED 2
+# define ALLOCM_ARENA(a) ((int)(((a)+1) << 8))
+# define ALLOCM_SUCCESS 0
+# define ALLOCM_ERR_OOM 1
+# define ALLOCM_ERR_NOT_MOVED 2
#endif
/*
- * The je_ prefix on the following public symbol declarations is an artifact of
- * namespace management, and should be omitted in application code unless
- * JEMALLOC_NO_DEMANGLE is defined (see below).
+ * The je_ prefix on the following public symbol declarations is an artifact
+ * of namespace management, and should be omitted in application code unless
+ * JEMALLOC_NO_DEMANGLE is defined (see jemalloc_mangle.h).
*/
extern JEMALLOC_EXPORT const char *je_malloc_conf;
extern JEMALLOC_EXPORT void (*je_malloc_message)(void *cbopaque,
@@ -52,6 +61,25 @@ JEMALLOC_EXPORT void *je_aligned_alloc(size_t alignment, size_t size)
JEMALLOC_EXPORT void *je_realloc(void *ptr, size_t size);
JEMALLOC_EXPORT void je_free(void *ptr);
+JEMALLOC_EXPORT void *je_mallocx(size_t size, int flags);
+JEMALLOC_EXPORT void *je_rallocx(void *ptr, size_t size, int flags);
+JEMALLOC_EXPORT size_t je_xallocx(void *ptr, size_t size, size_t extra,
+ int flags);
+JEMALLOC_EXPORT size_t je_sallocx(const void *ptr, int flags);
+JEMALLOC_EXPORT void je_dallocx(void *ptr, int flags);
+JEMALLOC_EXPORT size_t je_nallocx(size_t size, int flags);
+
+JEMALLOC_EXPORT int je_mallctl(const char *name, void *oldp,
+ size_t *oldlenp, void *newp, size_t newlen);
+JEMALLOC_EXPORT int je_mallctlnametomib(const char *name, size_t *mibp,
+ size_t *miblenp);
+JEMALLOC_EXPORT int je_mallctlbymib(const size_t *mib, size_t miblen,
+ void *oldp, size_t *oldlenp, void *newp, size_t newlen);
+JEMALLOC_EXPORT void je_malloc_stats_print(void (*write_cb)(void *,
+ const char *), void *je_cbopaque, const char *opts);
+JEMALLOC_EXPORT size_t je_malloc_usable_size(
+ JEMALLOC_USABLE_SIZE_CONST void *ptr);
+
#ifdef JEMALLOC_OVERRIDE_MEMALIGN
JEMALLOC_EXPORT void * je_memalign(size_t alignment, size_t size)
JEMALLOC_ATTR(malloc);
@@ -61,17 +89,6 @@ JEMALLOC_EXPORT void * je_memalign(size_t alignment, size_t size)
JEMALLOC_EXPORT void * je_valloc(size_t size) JEMALLOC_ATTR(malloc);
#endif
-JEMALLOC_EXPORT size_t je_malloc_usable_size(
- JEMALLOC_USABLE_SIZE_CONST void *ptr);
-JEMALLOC_EXPORT void je_malloc_stats_print(void (*write_cb)(void *,
- const char *), void *je_cbopaque, const char *opts);
-JEMALLOC_EXPORT int je_mallctl(const char *name, void *oldp,
- size_t *oldlenp, void *newp, size_t newlen);
-JEMALLOC_EXPORT int je_mallctlnametomib(const char *name, size_t *mibp,
- size_t *miblenp);
-JEMALLOC_EXPORT int je_mallctlbymib(const size_t *mib, size_t miblen,
- void *oldp, size_t *oldlenp, void *newp, size_t newlen);
-
#ifdef JEMALLOC_EXPERIMENTAL
JEMALLOC_EXPORT int je_allocm(void **ptr, size_t *rsize, size_t size,
int flags) JEMALLOC_ATTR(nonnull(1));
@@ -92,63 +109,71 @@ JEMALLOC_EXPORT int je_nallocm(size_t *rsize, size_t size, int flags);
* --with-mangling and/or --with-jemalloc-prefix configuration settings.
*/
#ifdef JEMALLOC_MANGLE
-#ifndef JEMALLOC_NO_DEMANGLE
-#define JEMALLOC_NO_DEMANGLE
-#endif
-#define malloc_conf je_malloc_conf
-#define malloc_message je_malloc_message
-#define malloc je_malloc
-#define calloc je_calloc
-#define posix_memalign je_posix_memalign
-#define aligned_alloc je_aligned_alloc
-#define realloc je_realloc
-#define free je_free
-#define malloc_usable_size je_malloc_usable_size
-#define malloc_stats_print je_malloc_stats_print
-#define mallctl je_mallctl
-#define mallctlnametomib je_mallctlnametomib
-#define mallctlbymib je_mallctlbymib
-#define memalign je_memalign
-#define valloc je_valloc
-#ifdef JEMALLOC_EXPERIMENTAL
-#define allocm je_allocm
-#define rallocm je_rallocm
-#define sallocm je_sallocm
-#define dallocm je_dallocm
-#define nallocm je_nallocm
-#endif
+# ifndef JEMALLOC_NO_DEMANGLE
+# define JEMALLOC_NO_DEMANGLE
+# endif
+# define malloc_conf je_malloc_conf
+# define malloc_message je_malloc_message
+# define malloc je_malloc
+# define calloc je_calloc
+# define posix_memalign je_posix_memalign
+# define aligned_alloc je_aligned_alloc
+# define realloc je_realloc
+# define free je_free
+# define mallocx je_mallocx
+# define rallocx je_rallocx
+# define xallocx je_xallocx
+# define sallocx je_sallocx
+# define dallocx je_dallocx
+# define nallocx je_nallocx
+# define mallctl je_mallctl
+# define mallctlnametomib je_mallctlnametomib
+# define mallctlbymib je_mallctlbymib
+# define malloc_stats_print je_malloc_stats_print
+# define malloc_usable_size je_malloc_usable_size
+# define memalign je_memalign
+# define valloc je_valloc
+# define allocm je_allocm
+# define dallocm je_dallocm
+# define nallocm je_nallocm
+# define rallocm je_rallocm
+# define sallocm je_sallocm
#endif
/*
- * The je_* macros can be used as stable alternative names for the public
- * jemalloc API if JEMALLOC_NO_DEMANGLE is defined. This is primarily meant
- * for use in jemalloc itself, but it can be used by application code to
+ * The je_* macros can be used as stable alternative names for the
+ * public jemalloc API if JEMALLOC_NO_DEMANGLE is defined. This is primarily
+ * meant for use in jemalloc itself, but it can be used by application code to
* provide isolation from the name mangling specified via --with-mangling
* and/or --with-jemalloc-prefix.
*/
#ifndef JEMALLOC_NO_DEMANGLE
-#undef je_malloc_conf
-#undef je_malloc_message
-#undef je_malloc
-#undef je_calloc
-#undef je_posix_memalign
-#undef je_aligned_alloc
-#undef je_realloc
-#undef je_free
-#undef je_malloc_usable_size
-#undef je_malloc_stats_print
-#undef je_mallctl
-#undef je_mallctlnametomib
-#undef je_mallctlbymib
-#undef je_memalign
-#undef je_valloc
-#ifdef JEMALLOC_EXPERIMENTAL
-#undef je_allocm
-#undef je_rallocm
-#undef je_sallocm
-#undef je_dallocm
-#undef je_nallocm
-#endif
+# undef je_malloc_conf
+# undef je_malloc_message
+# undef je_malloc
+# undef je_calloc
+# undef je_posix_memalign
+# undef je_aligned_alloc
+# undef je_realloc
+# undef je_free
+# undef je_mallocx
+# undef je_rallocx
+# undef je_xallocx
+# undef je_sallocx
+# undef je_dallocx
+# undef je_nallocx
+# undef je_mallctl
+# undef je_mallctlnametomib
+# undef je_mallctlbymib
+# undef je_malloc_stats_print
+# undef je_malloc_usable_size
+# undef je_memalign
+# undef je_valloc
+# undef je_allocm
+# undef je_dallocm
+# undef je_nallocm
+# undef je_rallocm
+# undef je_sallocm
#endif
#ifdef __cplusplus
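
[Note] The *allocx() functions declared above supersede the experimental *allocm() API. A short usage sketch of the public entry points (link with -ljemalloc); MALLOCX_ALIGN(64) requests 64-byte alignment and MALLOCX_ZERO zero-filled memory.

#include <stdio.h>
#include <jemalloc/jemalloc.h>

int
main(void)
{
	void *p, *q;

	p = mallocx(1000, MALLOCX_ALIGN(64) | MALLOCX_ZERO);
	if (p == NULL)
		return (1);
	printf("usable size: %zu\n", sallocx(p, 0));
	q = rallocx(p, 2000, MALLOCX_ALIGN(64));	/* may move */
	if (q != NULL)
		p = q;
	dallocx(p, 0);
	return (0);
}
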
diff --git a/dep/jemalloc/jemalloc_defs.h.in.cmake b/dep/jemalloc/jemalloc_defs.h.in.cmake
index 9fdf53546e3..89e496f4acb 100644
--- a/dep/jemalloc/jemalloc_defs.h.in.cmake
+++ b/dep/jemalloc/jemalloc_defs.h.in.cmake
@@ -266,3 +266,9 @@
/* sizeof(intmax_t) == 2^LG_SIZEOF_INTMAX_T. */
#define LG_SIZEOF_INTMAX_T 3
+
+/* C99 restrict keyword supported. */
+#define JEMALLOC_HAS_RESTRICT
+
+/* JEMALLOC_CODE_COVERAGE enables test code coverage analysis. */
+#undef JEMALLOC_CODE_COVERAGE
diff --git a/dep/jemalloc/src/arena.c b/dep/jemalloc/src/arena.c
index 05a787f89d9..dad707b63d0 100644
--- a/dep/jemalloc/src/arena.c
+++ b/dep/jemalloc/src/arena.c
@@ -38,52 +38,18 @@ const uint8_t small_size2bin[] = {
};
/******************************************************************************/
-/* Function prototypes for non-inline static functions. */
-
-static void arena_avail_insert(arena_t *arena, arena_chunk_t *chunk,
- size_t pageind, size_t npages, bool maybe_adjac_pred,
- bool maybe_adjac_succ);
-static void arena_avail_remove(arena_t *arena, arena_chunk_t *chunk,
- size_t pageind, size_t npages, bool maybe_adjac_pred,
- bool maybe_adjac_succ);
-static void arena_run_split(arena_t *arena, arena_run_t *run, size_t size,
- bool large, size_t binind, bool zero);
-static arena_chunk_t *arena_chunk_alloc(arena_t *arena);
-static void arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk);
-static arena_run_t *arena_run_alloc_helper(arena_t *arena, size_t size,
- bool large, size_t binind, bool zero);
-static arena_run_t *arena_run_alloc(arena_t *arena, size_t size, bool large,
- size_t binind, bool zero);
-static arena_chunk_t *chunks_dirty_iter_cb(arena_chunk_tree_t *tree,
- arena_chunk_t *chunk, void *arg);
+/*
+ * Function prototypes for static functions that are referenced prior to
+ * definition.
+ */
+
static void arena_purge(arena_t *arena, bool all);
static void arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty,
bool cleaned);
-static void arena_run_trim_head(arena_t *arena, arena_chunk_t *chunk,
- arena_run_t *run, size_t oldsize, size_t newsize);
-static void arena_run_trim_tail(arena_t *arena, arena_chunk_t *chunk,
- arena_run_t *run, size_t oldsize, size_t newsize, bool dirty);
-static arena_run_t *arena_bin_runs_first(arena_bin_t *bin);
-static void arena_bin_runs_insert(arena_bin_t *bin, arena_run_t *run);
-static void arena_bin_runs_remove(arena_bin_t *bin, arena_run_t *run);
-static arena_run_t *arena_bin_nonfull_run_tryget(arena_bin_t *bin);
-static arena_run_t *arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin);
-static void *arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin);
-static void arena_dissociate_bin_run(arena_chunk_t *chunk, arena_run_t *run,
- arena_bin_t *bin);
static void arena_dalloc_bin_run(arena_t *arena, arena_chunk_t *chunk,
arena_run_t *run, arena_bin_t *bin);
static void arena_bin_lower_run(arena_t *arena, arena_chunk_t *chunk,
arena_run_t *run, arena_bin_t *bin);
-static void arena_ralloc_large_shrink(arena_t *arena, arena_chunk_t *chunk,
- void *ptr, size_t oldsize, size_t size);
-static bool arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk,
- void *ptr, size_t oldsize, size_t size, size_t extra, bool zero);
-static bool arena_ralloc_large(void *ptr, size_t oldsize, size_t size,
- size_t extra, bool zero);
-static size_t bin_info_run_size_calc(arena_bin_info_t *bin_info,
- size_t min_run_size);
-static void bin_info_init(void);
/******************************************************************************/
@@ -369,62 +335,63 @@ arena_run_zero(arena_chunk_t *chunk, size_t run_ind, size_t npages)
}
static inline void
+arena_run_page_mark_zeroed(arena_chunk_t *chunk, size_t run_ind)
+{
+
+ VALGRIND_MAKE_MEM_DEFINED((void *)((uintptr_t)chunk + (run_ind <<
+ LG_PAGE)), PAGE);
+}
+
+static inline void
arena_run_page_validate_zeroed(arena_chunk_t *chunk, size_t run_ind)
{
size_t i;
UNUSED size_t *p = (size_t *)((uintptr_t)chunk + (run_ind << LG_PAGE));
- VALGRIND_MAKE_MEM_DEFINED((void *)((uintptr_t)chunk + (run_ind <<
- LG_PAGE)), PAGE);
+ arena_run_page_mark_zeroed(chunk, run_ind);
for (i = 0; i < PAGE / sizeof(size_t); i++)
assert(p[i] == 0);
}
static void
-arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large,
- size_t binind, bool zero)
+arena_cactive_update(arena_t *arena, size_t add_pages, size_t sub_pages)
{
- arena_chunk_t *chunk;
- size_t run_ind, total_pages, need_pages, rem_pages, i;
- size_t flag_dirty;
- assert((large && binind == BININD_INVALID) || (large == false && binind
- != BININD_INVALID));
+ if (config_stats) {
+ ssize_t cactive_diff = CHUNK_CEILING((arena->nactive +
+ add_pages) << LG_PAGE) - CHUNK_CEILING((arena->nactive -
+ sub_pages) << LG_PAGE);
+ if (cactive_diff != 0)
+ stats_cactive_add(cactive_diff);
+ }
+}
+
+static void
+arena_run_split_remove(arena_t *arena, arena_chunk_t *chunk, size_t run_ind,
+ size_t flag_dirty, size_t need_pages)
+{
+ size_t total_pages, rem_pages;
- chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
- run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE);
- flag_dirty = arena_mapbits_dirty_get(chunk, run_ind);
total_pages = arena_mapbits_unallocated_size_get(chunk, run_ind) >>
LG_PAGE;
assert(arena_mapbits_dirty_get(chunk, run_ind+total_pages-1) ==
flag_dirty);
- need_pages = (size >> LG_PAGE);
- assert(need_pages > 0);
assert(need_pages <= total_pages);
rem_pages = total_pages - need_pages;
arena_avail_remove(arena, chunk, run_ind, total_pages, true, true);
- if (config_stats) {
- /*
- * Update stats_cactive if nactive is crossing a chunk
- * multiple.
- */
- size_t cactive_diff = CHUNK_CEILING((arena->nactive +
- need_pages) << LG_PAGE) - CHUNK_CEILING(arena->nactive <<
- LG_PAGE);
- if (cactive_diff != 0)
- stats_cactive_add(cactive_diff);
- }
+ arena_cactive_update(arena, need_pages, 0);
arena->nactive += need_pages;
/* Keep track of trailing unused pages for later use. */
if (rem_pages > 0) {
if (flag_dirty != 0) {
- arena_mapbits_unallocated_set(chunk, run_ind+need_pages,
- (rem_pages << LG_PAGE), CHUNK_MAP_DIRTY);
+ arena_mapbits_unallocated_set(chunk,
+ run_ind+need_pages, (rem_pages << LG_PAGE),
+ flag_dirty);
arena_mapbits_unallocated_set(chunk,
run_ind+total_pages-1, (rem_pages << LG_PAGE),
- CHUNK_MAP_DIRTY);
+ flag_dirty);
} else {
arena_mapbits_unallocated_set(chunk, run_ind+need_pages,
(rem_pages << LG_PAGE),
@@ -438,152 +405,219 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large,
arena_avail_insert(arena, chunk, run_ind+need_pages, rem_pages,
false, true);
}
+}
- /*
- * Update the page map separately for large vs. small runs, since it is
- * possible to avoid iteration for large mallocs.
- */
- if (large) {
- if (zero) {
- if (flag_dirty == 0) {
- /*
- * The run is clean, so some pages may be
- * zeroed (i.e. never before touched).
- */
- for (i = 0; i < need_pages; i++) {
- if (arena_mapbits_unzeroed_get(chunk,
- run_ind+i) != 0) {
- arena_run_zero(chunk, run_ind+i,
- 1);
- } else if (config_debug) {
- arena_run_page_validate_zeroed(
- chunk, run_ind+i);
- }
+static void
+arena_run_split_large_helper(arena_t *arena, arena_run_t *run, size_t size,
+ bool remove, bool zero)
+{
+ arena_chunk_t *chunk;
+ size_t flag_dirty, run_ind, need_pages, i;
+
+ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
+ run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE);
+ flag_dirty = arena_mapbits_dirty_get(chunk, run_ind);
+ need_pages = (size >> LG_PAGE);
+ assert(need_pages > 0);
+
+ if (remove) {
+ arena_run_split_remove(arena, chunk, run_ind, flag_dirty,
+ need_pages);
+ }
+
+ if (zero) {
+ if (flag_dirty == 0) {
+ /*
+ * The run is clean, so some pages may be zeroed (i.e.
+ * never before touched).
+ */
+ for (i = 0; i < need_pages; i++) {
+ if (arena_mapbits_unzeroed_get(chunk, run_ind+i)
+ != 0)
+ arena_run_zero(chunk, run_ind+i, 1);
+ else if (config_debug) {
+ arena_run_page_validate_zeroed(chunk,
+ run_ind+i);
+ } else {
+ arena_run_page_mark_zeroed(chunk,
+ run_ind+i);
}
- } else {
- /*
- * The run is dirty, so all pages must be
- * zeroed.
- */
- arena_run_zero(chunk, run_ind, need_pages);
}
+ } else {
+ /* The run is dirty, so all pages must be zeroed. */
+ arena_run_zero(chunk, run_ind, need_pages);
}
-
- /*
- * Set the last element first, in case the run only contains one
- * page (i.e. both statements set the same element).
- */
- arena_mapbits_large_set(chunk, run_ind+need_pages-1, 0,
- flag_dirty);
- arena_mapbits_large_set(chunk, run_ind, size, flag_dirty);
} else {
- assert(zero == false);
- /*
- * Propagate the dirty and unzeroed flags to the allocated
- * small run, so that arena_dalloc_bin_run() has the ability to
- * conditionally trim clean pages.
- */
- arena_mapbits_small_set(chunk, run_ind, 0, binind, flag_dirty);
- /*
- * The first page will always be dirtied during small run
- * initialization, so a validation failure here would not
- * actually cause an observable failure.
- */
- if (config_debug && flag_dirty == 0 &&
- arena_mapbits_unzeroed_get(chunk, run_ind) == 0)
- arena_run_page_validate_zeroed(chunk, run_ind);
- for (i = 1; i < need_pages - 1; i++) {
- arena_mapbits_small_set(chunk, run_ind+i, i, binind, 0);
- if (config_debug && flag_dirty == 0 &&
- arena_mapbits_unzeroed_get(chunk, run_ind+i) == 0) {
- arena_run_page_validate_zeroed(chunk,
- run_ind+i);
- }
- }
- arena_mapbits_small_set(chunk, run_ind+need_pages-1,
- need_pages-1, binind, flag_dirty);
- if (config_debug && flag_dirty == 0 &&
- arena_mapbits_unzeroed_get(chunk, run_ind+need_pages-1) ==
- 0) {
- arena_run_page_validate_zeroed(chunk,
- run_ind+need_pages-1);
- }
+ VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk +
+ (run_ind << LG_PAGE)), (need_pages << LG_PAGE));
}
- VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk + (run_ind <<
- LG_PAGE)), (need_pages << LG_PAGE));
+
+ /*
+ * Set the last element first, in case the run only contains one page
+ * (i.e. both statements set the same element).
+ */
+ arena_mapbits_large_set(chunk, run_ind+need_pages-1, 0, flag_dirty);
+ arena_mapbits_large_set(chunk, run_ind, size, flag_dirty);
+}
+
+static void
+arena_run_split_large(arena_t *arena, arena_run_t *run, size_t size, bool zero)
+{
+
+ arena_run_split_large_helper(arena, run, size, true, zero);
+}
+
+static void
+arena_run_init_large(arena_t *arena, arena_run_t *run, size_t size, bool zero)
+{
+
+ arena_run_split_large_helper(arena, run, size, false, zero);
+}
+
+static void
+arena_run_split_small(arena_t *arena, arena_run_t *run, size_t size,
+ size_t binind)
+{
+ arena_chunk_t *chunk;
+ size_t flag_dirty, run_ind, need_pages, i;
+
+ assert(binind != BININD_INVALID);
+
+ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
+ run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE);
+ flag_dirty = arena_mapbits_dirty_get(chunk, run_ind);
+ need_pages = (size >> LG_PAGE);
+ assert(need_pages > 0);
+
+ arena_run_split_remove(arena, chunk, run_ind, flag_dirty, need_pages);
+
+ /*
+ * Propagate the dirty and unzeroed flags to the allocated small run,
+ * so that arena_dalloc_bin_run() has the ability to conditionally trim
+ * clean pages.
+ */
+ arena_mapbits_small_set(chunk, run_ind, 0, binind, flag_dirty);
+ /*
+ * The first page will always be dirtied during small run
+ * initialization, so a validation failure here would not actually
+ * cause an observable failure.
+ */
+ if (config_debug && flag_dirty == 0 && arena_mapbits_unzeroed_get(chunk,
+ run_ind) == 0)
+ arena_run_page_validate_zeroed(chunk, run_ind);
+ for (i = 1; i < need_pages - 1; i++) {
+ arena_mapbits_small_set(chunk, run_ind+i, i, binind, 0);
+ if (config_debug && flag_dirty == 0 &&
+ arena_mapbits_unzeroed_get(chunk, run_ind+i) == 0)
+ arena_run_page_validate_zeroed(chunk, run_ind+i);
+ }
+ arena_mapbits_small_set(chunk, run_ind+need_pages-1, need_pages-1,
+ binind, flag_dirty);
+ if (config_debug && flag_dirty == 0 && arena_mapbits_unzeroed_get(chunk,
+ run_ind+need_pages-1) == 0)
+ arena_run_page_validate_zeroed(chunk, run_ind+need_pages-1);
+ VALGRIND_MAKE_MEM_UNDEFINED((void *)((uintptr_t)chunk +
+ (run_ind << LG_PAGE)), (need_pages << LG_PAGE));
}
static arena_chunk_t *
-arena_chunk_alloc(arena_t *arena)
+arena_chunk_init_spare(arena_t *arena)
{
arena_chunk_t *chunk;
- size_t i;
- if (arena->spare != NULL) {
- chunk = arena->spare;
- arena->spare = NULL;
+ assert(arena->spare != NULL);
- assert(arena_mapbits_allocated_get(chunk, map_bias) == 0);
- assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0);
- assert(arena_mapbits_unallocated_size_get(chunk, map_bias) ==
- arena_maxclass);
- assert(arena_mapbits_unallocated_size_get(chunk,
- chunk_npages-1) == arena_maxclass);
- assert(arena_mapbits_dirty_get(chunk, map_bias) ==
- arena_mapbits_dirty_get(chunk, chunk_npages-1));
- } else {
- bool zero;
- size_t unzeroed;
+ chunk = arena->spare;
+ arena->spare = NULL;
- zero = false;
- malloc_mutex_unlock(&arena->lock);
- chunk = (arena_chunk_t *)chunk_alloc(chunksize, chunksize,
- false, &zero, arena->dss_prec);
- malloc_mutex_lock(&arena->lock);
- if (chunk == NULL)
- return (NULL);
- if (config_stats)
- arena->stats.mapped += chunksize;
+ assert(arena_mapbits_allocated_get(chunk, map_bias) == 0);
+ assert(arena_mapbits_allocated_get(chunk, chunk_npages-1) == 0);
+ assert(arena_mapbits_unallocated_size_get(chunk, map_bias) ==
+ arena_maxclass);
+ assert(arena_mapbits_unallocated_size_get(chunk, chunk_npages-1) ==
+ arena_maxclass);
+ assert(arena_mapbits_dirty_get(chunk, map_bias) ==
+ arena_mapbits_dirty_get(chunk, chunk_npages-1));
- chunk->arena = arena;
+ return (chunk);
+}
- /*
- * Claim that no pages are in use, since the header is merely
- * overhead.
- */
- chunk->ndirty = 0;
+static arena_chunk_t *
+arena_chunk_init_hard(arena_t *arena)
+{
+ arena_chunk_t *chunk;
+ bool zero;
+ size_t unzeroed, i;
- chunk->nruns_avail = 0;
- chunk->nruns_adjac = 0;
+ assert(arena->spare == NULL);
- /*
- * Initialize the map to contain one maximal free untouched run.
- * Mark the pages as zeroed iff chunk_alloc() returned a zeroed
- * chunk.
- */
- unzeroed = zero ? 0 : CHUNK_MAP_UNZEROED;
- arena_mapbits_unallocated_set(chunk, map_bias, arena_maxclass,
- unzeroed);
- /*
- * There is no need to initialize the internal page map entries
- * unless the chunk is not zeroed.
- */
- if (zero == false) {
- for (i = map_bias+1; i < chunk_npages-1; i++)
- arena_mapbits_unzeroed_set(chunk, i, unzeroed);
- } else if (config_debug) {
- VALGRIND_MAKE_MEM_DEFINED(
- (void *)arena_mapp_get(chunk, map_bias+1),
- (void *)((uintptr_t)
- arena_mapp_get(chunk, chunk_npages-1)
- - (uintptr_t)arena_mapp_get(chunk, map_bias+1)));
+ zero = false;
+ malloc_mutex_unlock(&arena->lock);
+ chunk = (arena_chunk_t *)chunk_alloc(chunksize, chunksize, false,
+ &zero, arena->dss_prec);
+ malloc_mutex_lock(&arena->lock);
+ if (chunk == NULL)
+ return (NULL);
+ if (config_stats)
+ arena->stats.mapped += chunksize;
+
+ chunk->arena = arena;
+
+ /*
+ * Claim that no pages are in use, since the header is merely overhead.
+ */
+ chunk->ndirty = 0;
+
+ chunk->nruns_avail = 0;
+ chunk->nruns_adjac = 0;
+
+ /*
+ * Initialize the map to contain one maximal free untouched run. Mark
+ * the pages as zeroed iff chunk_alloc() returned a zeroed chunk.
+ */
+ unzeroed = zero ? 0 : CHUNK_MAP_UNZEROED;
+ arena_mapbits_unallocated_set(chunk, map_bias, arena_maxclass,
+ unzeroed);
+ /*
+ * There is no need to initialize the internal page map entries unless
+ * the chunk is not zeroed.
+ */
+ if (zero == false) {
+ VALGRIND_MAKE_MEM_UNDEFINED((void *)arena_mapp_get(chunk,
+ map_bias+1), (size_t)((uintptr_t) arena_mapp_get(chunk,
+ chunk_npages-1) - (uintptr_t)arena_mapp_get(chunk,
+ map_bias+1)));
+ for (i = map_bias+1; i < chunk_npages-1; i++)
+ arena_mapbits_unzeroed_set(chunk, i, unzeroed);
+ } else {
+ VALGRIND_MAKE_MEM_DEFINED((void *)arena_mapp_get(chunk,
+ map_bias+1), (size_t)((uintptr_t) arena_mapp_get(chunk,
+ chunk_npages-1) - (uintptr_t)arena_mapp_get(chunk,
+ map_bias+1)));
+ if (config_debug) {
for (i = map_bias+1; i < chunk_npages-1; i++) {
assert(arena_mapbits_unzeroed_get(chunk, i) ==
unzeroed);
}
}
- arena_mapbits_unallocated_set(chunk, chunk_npages-1,
- arena_maxclass, unzeroed);
+ }
+ arena_mapbits_unallocated_set(chunk, chunk_npages-1, arena_maxclass,
+ unzeroed);
+
+ return (chunk);
+}
+
+static arena_chunk_t *
+arena_chunk_alloc(arena_t *arena)
+{
+ arena_chunk_t *chunk;
+
+ if (arena->spare != NULL)
+ chunk = arena_chunk_init_spare(arena);
+ else {
+ chunk = arena_chunk_init_hard(arena);
+ if (chunk == NULL)
+ return (NULL);
}
/* Insert the run into the runs_avail tree. */
@@ -626,8 +660,7 @@ arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk)
}
static arena_run_t *
-arena_run_alloc_helper(arena_t *arena, size_t size, bool large, size_t binind,
- bool zero)
+arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero)
{
arena_run_t *run;
arena_chunk_map_t *mapelm, key;
@@ -642,7 +675,7 @@ arena_run_alloc_helper(arena_t *arena, size_t size, bool large, size_t binind,
run = (arena_run_t *)((uintptr_t)run_chunk + (pageind <<
LG_PAGE));
- arena_run_split(arena, run, size, large, binind, zero);
+ arena_run_split_large(arena, run, size, zero);
return (run);
}
@@ -650,19 +683,16 @@ arena_run_alloc_helper(arena_t *arena, size_t size, bool large, size_t binind,
}
static arena_run_t *
-arena_run_alloc(arena_t *arena, size_t size, bool large, size_t binind,
- bool zero)
+arena_run_alloc_large(arena_t *arena, size_t size, bool zero)
{
arena_chunk_t *chunk;
arena_run_t *run;
assert(size <= arena_maxclass);
assert((size & PAGE_MASK) == 0);
- assert((large && binind == BININD_INVALID) || (large == false && binind
- != BININD_INVALID));
/* Search the arena's chunks for the lowest best fit. */
- run = arena_run_alloc_helper(arena, size, large, binind, zero);
+ run = arena_run_alloc_large_helper(arena, size, zero);
if (run != NULL)
return (run);
@@ -672,7 +702,7 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, size_t binind,
chunk = arena_chunk_alloc(arena);
if (chunk != NULL) {
run = (arena_run_t *)((uintptr_t)chunk + (map_bias << LG_PAGE));
- arena_run_split(arena, run, size, large, binind, zero);
+ arena_run_split_large(arena, run, size, zero);
return (run);
}
@@ -681,7 +711,63 @@ arena_run_alloc(arena_t *arena, size_t size, bool large, size_t binind,
* sufficient memory available while this one dropped arena->lock in
* arena_chunk_alloc(), so search one more time.
*/
- return (arena_run_alloc_helper(arena, size, large, binind, zero));
+ return (arena_run_alloc_large_helper(arena, size, zero));
+}
+
+static arena_run_t *
+arena_run_alloc_small_helper(arena_t *arena, size_t size, size_t binind)
+{
+ arena_run_t *run;
+ arena_chunk_map_t *mapelm, key;
+
+ key.bits = size | CHUNK_MAP_KEY;
+ mapelm = arena_avail_tree_nsearch(&arena->runs_avail, &key);
+ if (mapelm != NULL) {
+ arena_chunk_t *run_chunk = CHUNK_ADDR2BASE(mapelm);
+ size_t pageind = (((uintptr_t)mapelm -
+ (uintptr_t)run_chunk->map) / sizeof(arena_chunk_map_t))
+ + map_bias;
+
+ run = (arena_run_t *)((uintptr_t)run_chunk + (pageind <<
+ LG_PAGE));
+ arena_run_split_small(arena, run, size, binind);
+ return (run);
+ }
+
+ return (NULL);
+}
+
+static arena_run_t *
+arena_run_alloc_small(arena_t *arena, size_t size, size_t binind)
+{
+ arena_chunk_t *chunk;
+ arena_run_t *run;
+
+ assert(size <= arena_maxclass);
+ assert((size & PAGE_MASK) == 0);
+ assert(binind != BININD_INVALID);
+
+ /* Search the arena's chunks for the lowest best fit. */
+ run = arena_run_alloc_small_helper(arena, size, binind);
+ if (run != NULL)
+ return (run);
+
+ /*
+ * No usable runs. Create a new chunk from which to allocate the run.
+ */
+ chunk = arena_chunk_alloc(arena);
+ if (chunk != NULL) {
+ run = (arena_run_t *)((uintptr_t)chunk + (map_bias << LG_PAGE));
+ arena_run_split_small(arena, run, size, binind);
+ return (run);
+ }
+
+ /*
+ * arena_chunk_alloc() failed, but another thread may have made
+ * sufficient memory available while this one dropped arena->lock in
+ * arena_chunk_alloc(), so search one more time.
+ */
+ return (arena_run_alloc_small_helper(arena, size, binind));
}
static inline void
@@ -707,48 +793,42 @@ arena_maybe_purge(arena_t *arena)
arena_purge(arena, false);
}
-static inline size_t
-arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all)
+static arena_chunk_t *
+chunks_dirty_iter_cb(arena_chunk_tree_t *tree, arena_chunk_t *chunk, void *arg)
{
- size_t npurged;
- ql_head(arena_chunk_map_t) mapelms;
- arena_chunk_map_t *mapelm;
- size_t pageind, npages;
- size_t nmadvise;
+ size_t *ndirty = (size_t *)arg;
- ql_new(&mapelms);
+ assert(chunk->ndirty != 0);
+ *ndirty += chunk->ndirty;
+ return (NULL);
+}
+
+static size_t
+arena_compute_npurgatory(arena_t *arena, bool all)
+{
+ size_t npurgatory, npurgeable;
/*
- * If chunk is the spare, temporarily re-allocate it, 1) so that its
- * run is reinserted into runs_avail, and 2) so that it cannot be
- * completely discarded by another thread while arena->lock is dropped
- * by this thread. Note that the arena_run_dalloc() call will
- * implicitly deallocate the chunk, so no explicit action is required
- * in this function to deallocate the chunk.
- *
- * Note that once a chunk contains dirty pages, it cannot again contain
- * a single run unless 1) it is a dirty run, or 2) this function purges
- * dirty pages and causes the transition to a single clean run. Thus
- * (chunk == arena->spare) is possible, but it is not possible for
- * this function to be called on the spare unless it contains a dirty
- * run.
+ * Compute the minimum number of pages that this thread should try to
+ * purge.
*/
- if (chunk == arena->spare) {
- assert(arena_mapbits_dirty_get(chunk, map_bias) != 0);
- assert(arena_mapbits_dirty_get(chunk, chunk_npages-1) != 0);
+ npurgeable = arena->ndirty - arena->npurgatory;
- arena_chunk_alloc(arena);
- }
+ if (all == false) {
+ size_t threshold = (arena->nactive >> opt_lg_dirty_mult);
- if (config_stats)
- arena->stats.purged += chunk->ndirty;
+ npurgatory = npurgeable - threshold;
+ } else
+ npurgatory = npurgeable;
- /*
- * Operate on all dirty runs if there is no clean/dirty run
- * fragmentation.
- */
- if (chunk->nruns_adjac == 0)
- all = true;
+ return (npurgatory);
+}
+
+static void
+arena_chunk_stash_dirty(arena_t *arena, arena_chunk_t *chunk, bool all,
+ arena_chunk_mapelms_t *mapelms)
+{
+ size_t pageind, npages;
/*
* Temporarily allocate free dirty runs within chunk. If all is false,
@@ -756,7 +836,7 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all)
* all dirty runs.
*/
for (pageind = map_bias; pageind < chunk_npages; pageind += npages) {
- mapelm = arena_mapp_get(chunk, pageind);
+ arena_chunk_map_t *mapelm = arena_mapp_get(chunk, pageind);
if (arena_mapbits_allocated_get(chunk, pageind) == 0) {
size_t run_size =
arena_mapbits_unallocated_size_get(chunk, pageind);
@@ -772,11 +852,11 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all)
arena_run_t *run = (arena_run_t *)((uintptr_t)
chunk + (uintptr_t)(pageind << LG_PAGE));
- arena_run_split(arena, run, run_size, true,
- BININD_INVALID, false);
+ arena_run_split_large(arena, run, run_size,
+ false);
/* Append to list for later processing. */
ql_elm_new(mapelm, u.ql_link);
- ql_tail_insert(&mapelms, mapelm, u.ql_link);
+ ql_tail_insert(mapelms, mapelm, u.ql_link);
}
} else {
/* Skip run. */
@@ -800,12 +880,20 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all)
assert(pageind == chunk_npages);
assert(chunk->ndirty == 0 || all == false);
assert(chunk->nruns_adjac == 0);
+}
+
+static size_t
+arena_chunk_purge_stashed(arena_t *arena, arena_chunk_t *chunk,
+ arena_chunk_mapelms_t *mapelms)
+{
+ size_t npurged, pageind, npages, nmadvise;
+ arena_chunk_map_t *mapelm;
malloc_mutex_unlock(&arena->lock);
if (config_stats)
nmadvise = 0;
npurged = 0;
- ql_foreach(mapelm, &mapelms, u.ql_link) {
+ ql_foreach(mapelm, mapelms, u.ql_link) {
bool unzeroed;
size_t flag_unzeroed, i;
@@ -839,30 +927,75 @@ arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all)
if (config_stats)
arena->stats.nmadvise += nmadvise;
+ return (npurged);
+}
+
+static void
+arena_chunk_unstash_purged(arena_t *arena, arena_chunk_t *chunk,
+ arena_chunk_mapelms_t *mapelms)
+{
+ arena_chunk_map_t *mapelm;
+ size_t pageind;
+
/* Deallocate runs. */
- for (mapelm = ql_first(&mapelms); mapelm != NULL;
- mapelm = ql_first(&mapelms)) {
+ for (mapelm = ql_first(mapelms); mapelm != NULL;
+ mapelm = ql_first(mapelms)) {
arena_run_t *run;
pageind = (((uintptr_t)mapelm - (uintptr_t)chunk->map) /
sizeof(arena_chunk_map_t)) + map_bias;
run = (arena_run_t *)((uintptr_t)chunk + (uintptr_t)(pageind <<
LG_PAGE));
- ql_remove(&mapelms, mapelm, u.ql_link);
+ ql_remove(mapelms, mapelm, u.ql_link);
arena_run_dalloc(arena, run, false, true);
}
-
- return (npurged);
}
-static arena_chunk_t *
-chunks_dirty_iter_cb(arena_chunk_tree_t *tree, arena_chunk_t *chunk, void *arg)
+static inline size_t
+arena_chunk_purge(arena_t *arena, arena_chunk_t *chunk, bool all)
{
- size_t *ndirty = (size_t *)arg;
+ size_t npurged;
+ arena_chunk_mapelms_t mapelms;
- assert(chunk->ndirty != 0);
- *ndirty += chunk->ndirty;
- return (NULL);
+ ql_new(&mapelms);
+
+ /*
+ * If chunk is the spare, temporarily re-allocate it, 1) so that its
+ * run is reinserted into runs_avail, and 2) so that it cannot be
+ * completely discarded by another thread while arena->lock is dropped
+ * by this thread. Note that the arena_run_dalloc() call will
+ * implicitly deallocate the chunk, so no explicit action is required
+ * in this function to deallocate the chunk.
+ *
+ * Note that once a chunk contains dirty pages, it cannot again contain
+ * a single run unless 1) it is a dirty run, or 2) this function purges
+ * dirty pages and causes the transition to a single clean run. Thus
+ * (chunk == arena->spare) is possible, but it is not possible for
+ * this function to be called on the spare unless it contains a dirty
+ * run.
+ */
+ if (chunk == arena->spare) {
+ assert(arena_mapbits_dirty_get(chunk, map_bias) != 0);
+ assert(arena_mapbits_dirty_get(chunk, chunk_npages-1) != 0);
+
+ arena_chunk_alloc(arena);
+ }
+
+ if (config_stats)
+ arena->stats.purged += chunk->ndirty;
+
+ /*
+ * Operate on all dirty runs if there is no clean/dirty run
+ * fragmentation.
+ */
+ if (chunk->nruns_adjac == 0)
+ all = true;
+
+ arena_chunk_stash_dirty(arena, chunk, all, &mapelms);
+ npurged = arena_chunk_purge_stashed(arena, chunk, &mapelms);
+ arena_chunk_unstash_purged(arena, chunk, &mapelms);
+
+ return (npurged);
}
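
The monolithic purge loop is now decomposed into arena_chunk_stash_dirty(), arena_chunk_purge_stashed(), and arena_chunk_unstash_purged(), so arena->lock is held only while allocator state is mutated and is dropped around the madvise(2) calls. A minimal sketch of the same three-phase shape, using hypothetical names and a plain pthread mutex rather than jemalloc's internals:

    #include <pthread.h>
    #include <stddef.h>
    #include <sys/mman.h>

    struct range { void *addr; size_t len; struct range *next; };

    static size_t
    purge_dirty(pthread_mutex_t *lock, struct range **dirty_list)
    {
        struct range *stash, *it;
        size_t npurged = 0;

        /* Phase 1: stash -- detach work while the lock is held. */
        pthread_mutex_lock(lock);
        stash = *dirty_list;
        *dirty_list = NULL;
        pthread_mutex_unlock(lock);

        /* Phase 2: purge -- expensive syscalls run without the lock. */
        for (it = stash; it != NULL; it = it->next) {
            madvise(it->addr, it->len, MADV_DONTNEED);
            npurged += it->len;
        }

        /* Phase 3: unstash -- reacquire the lock to publish results
         * (reinserting ranges into the free structures, elided here). */
        pthread_mutex_lock(lock);
        pthread_mutex_unlock(lock);

        return npurged;
    }
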
static void
@@ -885,21 +1018,11 @@ arena_purge(arena_t *arena, bool all)
arena->stats.npurge++;
/*
- * Compute the minimum number of pages that this thread should try to
- * purge, and add the result to arena->npurgatory. This will keep
- * multiple threads from racing to reduce ndirty below the threshold.
+ * Add the minimum number of pages this thread should try to purge to
+ * arena->npurgatory. This will keep multiple threads from racing to
+ * reduce ndirty below the threshold.
*/
- {
- size_t npurgeable = arena->ndirty - arena->npurgatory;
-
- if (all == false) {
- size_t threshold = (arena->nactive >>
- opt_lg_dirty_mult);
-
- npurgatory = npurgeable - threshold;
- } else
- npurgatory = npurgeable;
- }
+ npurgatory = arena_compute_npurgatory(arena, all);
arena->npurgatory += npurgatory;
while (npurgatory > 0) {
@@ -966,61 +1089,12 @@ arena_purge_all(arena_t *arena)
}
static void
-arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned)
+arena_run_coalesce(arena_t *arena, arena_chunk_t *chunk, size_t *p_size,
+ size_t *p_run_ind, size_t *p_run_pages, size_t flag_dirty)
{
- arena_chunk_t *chunk;
- size_t size, run_ind, run_pages, flag_dirty;
-
- chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
- run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE);
- assert(run_ind >= map_bias);
- assert(run_ind < chunk_npages);
- if (arena_mapbits_large_get(chunk, run_ind) != 0) {
- size = arena_mapbits_large_size_get(chunk, run_ind);
- assert(size == PAGE ||
- arena_mapbits_large_size_get(chunk,
- run_ind+(size>>LG_PAGE)-1) == 0);
- } else {
- size_t binind = arena_bin_index(arena, run->bin);
- arena_bin_info_t *bin_info = &arena_bin_info[binind];
- size = bin_info->run_size;
- }
- run_pages = (size >> LG_PAGE);
- if (config_stats) {
- /*
- * Update stats_cactive if nactive is crossing a chunk
- * multiple.
- */
- size_t cactive_diff = CHUNK_CEILING(arena->nactive << LG_PAGE) -
- CHUNK_CEILING((arena->nactive - run_pages) << LG_PAGE);
- if (cactive_diff != 0)
- stats_cactive_sub(cactive_diff);
- }
- arena->nactive -= run_pages;
-
- /*
- * The run is dirty if the caller claims to have dirtied it, as well as
- * if it was already dirty before being allocated and the caller
- * doesn't claim to have cleaned it.
- */
- assert(arena_mapbits_dirty_get(chunk, run_ind) ==
- arena_mapbits_dirty_get(chunk, run_ind+run_pages-1));
- if (cleaned == false && arena_mapbits_dirty_get(chunk, run_ind) != 0)
- dirty = true;
- flag_dirty = dirty ? CHUNK_MAP_DIRTY : 0;
-
- /* Mark pages as unallocated in the chunk map. */
- if (dirty) {
- arena_mapbits_unallocated_set(chunk, run_ind, size,
- CHUNK_MAP_DIRTY);
- arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size,
- CHUNK_MAP_DIRTY);
- } else {
- arena_mapbits_unallocated_set(chunk, run_ind, size,
- arena_mapbits_unzeroed_get(chunk, run_ind));
- arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size,
- arena_mapbits_unzeroed_get(chunk, run_ind+run_pages-1));
- }
+ size_t size = *p_size;
+ size_t run_ind = *p_run_ind;
+ size_t run_pages = *p_run_pages;
/* Try to coalesce forward. */
if (run_ind + run_pages < chunk_npages &&
@@ -1050,8 +1124,9 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned)
}
/* Try to coalesce backward. */
- if (run_ind > map_bias && arena_mapbits_allocated_get(chunk, run_ind-1)
- == 0 && arena_mapbits_dirty_get(chunk, run_ind-1) == flag_dirty) {
+ if (run_ind > map_bias && arena_mapbits_allocated_get(chunk,
+ run_ind-1) == 0 && arena_mapbits_dirty_get(chunk, run_ind-1) ==
+ flag_dirty) {
size_t prun_size = arena_mapbits_unallocated_size_get(chunk,
run_ind-1);
size_t prun_pages = prun_size >> LG_PAGE;
@@ -1076,6 +1151,62 @@ arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned)
size);
}
+ *p_size = size;
+ *p_run_ind = run_ind;
+ *p_run_pages = run_pages;
+}
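
arena_run_coalesce() carries the forward/backward merging out of arena_run_dalloc(); note that a neighbor is absorbed only when it is unallocated and its dirty flag matches, so clean and dirty runs never merge across a purge boundary. The merge test in miniature, with a hypothetical run descriptor:

    #include <stdbool.h>
    #include <stddef.h>

    struct run_desc {
        size_t first_page, npages;
        bool allocated, dirty;
    };

    /* Absorb b into a when b is free, directly follows a, and agrees on
     * dirtiness; mirrors the forward-coalesce test above. */
    static bool
    try_coalesce_forward(struct run_desc *a, const struct run_desc *b)
    {
        if (b->allocated || b->dirty != a->dirty ||
            a->first_page + a->npages != b->first_page)
            return false;
        a->npages += b->npages;
        return true;
    }
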
+
+static void
+arena_run_dalloc(arena_t *arena, arena_run_t *run, bool dirty, bool cleaned)
+{
+ arena_chunk_t *chunk;
+ size_t size, run_ind, run_pages, flag_dirty;
+
+ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
+ run_ind = (size_t)(((uintptr_t)run - (uintptr_t)chunk) >> LG_PAGE);
+ assert(run_ind >= map_bias);
+ assert(run_ind < chunk_npages);
+ if (arena_mapbits_large_get(chunk, run_ind) != 0) {
+ size = arena_mapbits_large_size_get(chunk, run_ind);
+ assert(size == PAGE ||
+ arena_mapbits_large_size_get(chunk,
+ run_ind+(size>>LG_PAGE)-1) == 0);
+ } else {
+ size_t binind = arena_bin_index(arena, run->bin);
+ arena_bin_info_t *bin_info = &arena_bin_info[binind];
+ size = bin_info->run_size;
+ }
+ run_pages = (size >> LG_PAGE);
+ arena_cactive_update(arena, 0, run_pages);
+ arena->nactive -= run_pages;
+
+ /*
+ * The run is dirty if the caller claims to have dirtied it, as well as
+ * if it was already dirty before being allocated and the caller
+ * doesn't claim to have cleaned it.
+ */
+ assert(arena_mapbits_dirty_get(chunk, run_ind) ==
+ arena_mapbits_dirty_get(chunk, run_ind+run_pages-1));
+ if (cleaned == false && arena_mapbits_dirty_get(chunk, run_ind) != 0)
+ dirty = true;
+ flag_dirty = dirty ? CHUNK_MAP_DIRTY : 0;
+
+ /* Mark pages as unallocated in the chunk map. */
+ if (dirty) {
+ arena_mapbits_unallocated_set(chunk, run_ind, size,
+ CHUNK_MAP_DIRTY);
+ arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size,
+ CHUNK_MAP_DIRTY);
+ } else {
+ arena_mapbits_unallocated_set(chunk, run_ind, size,
+ arena_mapbits_unzeroed_get(chunk, run_ind));
+ arena_mapbits_unallocated_set(chunk, run_ind+run_pages-1, size,
+ arena_mapbits_unzeroed_get(chunk, run_ind+run_pages-1));
+ }
+
+ arena_run_coalesce(arena, chunk, &size, &run_ind, &run_pages,
+ flag_dirty);
+
/* Insert into runs_avail, now that coalescing is complete. */
assert(arena_mapbits_unallocated_size_get(chunk, run_ind) ==
arena_mapbits_unallocated_size_get(chunk, run_ind+run_pages-1));
@@ -1243,7 +1374,7 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin)
malloc_mutex_unlock(&bin->lock);
/******************************/
malloc_mutex_lock(&arena->lock);
- run = arena_run_alloc(arena, bin_info->run_size, false, binind, false);
+ run = arena_run_alloc_small(arena, bin_info->run_size, binind);
if (run != NULL) {
bitmap_t *bitmap = (bitmap_t *)((uintptr_t)run +
(uintptr_t)bin_info->bitmap_offset);
@@ -1266,7 +1397,7 @@ arena_bin_nonfull_run_get(arena_t *arena, arena_bin_t *bin)
}
/*
- * arena_run_alloc() failed, but another thread may have made
+ * arena_run_alloc_small() failed, but another thread may have made
* sufficient memory available while this one dropped bin->lock above,
* so search one more time.
*/
@@ -1301,12 +1432,12 @@ arena_bin_malloc_hard(arena_t *arena, arena_bin_t *bin)
arena_chunk_t *chunk;
/*
- * arena_run_alloc() may have allocated run, or it may
- * have pulled run from the bin's run tree. Therefore
- * it is unsafe to make any assumptions about how run
- * has previously been used, and arena_bin_lower_run()
- * must be called, as if a region were just deallocated
- * from the run.
+ * arena_run_alloc_small() may have allocated run, or
+ * it may have pulled run from the bin's run tree.
+ * Therefore it is unsafe to make any assumptions about
+ * how run has previously been used, and
+ * arena_bin_lower_run() must be called, as if a region
+ * were just deallocated from the run.
*/
chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(run);
if (run->nfree == bin_info->nregs)
@@ -1384,8 +1515,28 @@ arena_alloc_junk_small(void *ptr, arena_bin_info_t *bin_info, bool zero)
}
}
-void
-arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info)
+#ifdef JEMALLOC_JET
+#undef arena_redzone_corruption
+#define arena_redzone_corruption JEMALLOC_N(arena_redzone_corruption_impl)
+#endif
+static void
+arena_redzone_corruption(void *ptr, size_t usize, bool after,
+ size_t offset, uint8_t byte)
+{
+
+ malloc_printf("<jemalloc>: Corrupt redzone %zu byte%s %s %p "
+ "(size %zu), byte=%#x\n", offset, (offset == 1) ? "" : "s",
+ after ? "after" : "before", ptr, usize, byte);
+}
+#ifdef JEMALLOC_JET
+#undef arena_redzone_corruption
+#define arena_redzone_corruption JEMALLOC_N(arena_redzone_corruption)
+arena_redzone_corruption_t *arena_redzone_corruption =
+ JEMALLOC_N(arena_redzone_corruption_impl);
+#endif
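
The #undef/#define dance above is jemalloc's JEMALLOC_JET testing hook: in test builds the static definition is compiled under an *_impl name and exported through a writable function pointer that a unit test can replace. A reduced sketch of the idiom, with hypothetical names and a stand-in TEST_BUILD macro:

    #include <stddef.h>

    #ifdef TEST_BUILD
    #  define report_corruption report_corruption_impl
    #endif
    static void
    report_corruption(void *ptr, size_t usize)
    {
        /* ... real diagnostic work ... */
        (void)ptr; (void)usize;
    }
    #ifdef TEST_BUILD
    #  undef report_corruption
    typedef void (report_corruption_t)(void *, size_t);
    /* Tests may point this at a mock to intercept calls. */
    report_corruption_t *report_corruption = report_corruption_impl;
    #endif
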
+
+static void
+arena_redzones_validate(void *ptr, arena_bin_info_t *bin_info, bool reset)
{
size_t size = bin_info->reg_size;
size_t redzone_size = bin_info->redzone_size;
@@ -1393,29 +1544,61 @@ arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info)
bool error = false;
for (i = 1; i <= redzone_size; i++) {
- unsigned byte;
- if ((byte = *(uint8_t *)((uintptr_t)ptr - i)) != 0xa5) {
+ uint8_t *byte = (uint8_t *)((uintptr_t)ptr - i);
+ if (*byte != 0xa5) {
error = true;
- malloc_printf("<jemalloc>: Corrupt redzone "
- "%zu byte%s before %p (size %zu), byte=%#x\n", i,
- (i == 1) ? "" : "s", ptr, size, byte);
+ arena_redzone_corruption(ptr, size, false, i, *byte);
+ if (reset)
+ *byte = 0xa5;
}
}
for (i = 0; i < redzone_size; i++) {
- unsigned byte;
- if ((byte = *(uint8_t *)((uintptr_t)ptr + size + i)) != 0xa5) {
+ uint8_t *byte = (uint8_t *)((uintptr_t)ptr + size + i);
+ if (*byte != 0xa5) {
error = true;
- malloc_printf("<jemalloc>: Corrupt redzone "
- "%zu byte%s after end of %p (size %zu), byte=%#x\n",
- i, (i == 1) ? "" : "s", ptr, size, byte);
+ arena_redzone_corruption(ptr, size, true, i, *byte);
+ if (reset)
+ *byte = 0xa5;
}
}
if (opt_abort && error)
abort();
+}
+#ifdef JEMALLOC_JET
+#undef arena_dalloc_junk_small
+#define arena_dalloc_junk_small JEMALLOC_N(arena_dalloc_junk_small_impl)
+#endif
+void
+arena_dalloc_junk_small(void *ptr, arena_bin_info_t *bin_info)
+{
+ size_t redzone_size = bin_info->redzone_size;
+
+ arena_redzones_validate(ptr, bin_info, false);
memset((void *)((uintptr_t)ptr - redzone_size), 0x5a,
bin_info->reg_interval);
}
+#ifdef JEMALLOC_JET
+#undef arena_dalloc_junk_small
+#define arena_dalloc_junk_small JEMALLOC_N(arena_dalloc_junk_small)
+arena_dalloc_junk_small_t *arena_dalloc_junk_small =
+ JEMALLOC_N(arena_dalloc_junk_small_impl);
+#endif
+
+void
+arena_quarantine_junk_small(void *ptr, size_t usize)
+{
+ size_t binind;
+ arena_bin_info_t *bin_info;
+ cassert(config_fill);
+ assert(opt_junk);
+ assert(opt_quarantine);
+ assert(usize <= SMALL_MAXCLASS);
+
+ binind = SMALL_SIZE2BIN(usize);
+ bin_info = &arena_bin_info[binind];
+ arena_redzones_validate(ptr, bin_info, true);
+}
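
arena_redzones_validate() now backs both the free-path junk check and the new quarantine check, with reset=true repairing the 0xa5 pattern so a quarantined region can be re-validated later. The check itself reduces to scanning the guard bytes on both sides of the region; a standalone sketch (layout and names hypothetical):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    #define REDZONE_BYTE 0xa5

    static bool
    redzones_intact(const uint8_t *region, size_t size, size_t redzone_size)
    {
        size_t i;

        for (i = 1; i <= redzone_size; i++) {
            if (region[-(ptrdiff_t)i] != REDZONE_BYTE)
                return false;    /* corruption before the region */
        }
        for (i = 0; i < redzone_size; i++) {
            if (region[size + i] != REDZONE_BYTE)
                return false;    /* corruption past the region end */
        }
        return true;
    }
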
void *
arena_malloc_small(arena_t *arena, size_t size, bool zero)
@@ -1458,6 +1641,7 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero)
} else if (opt_zero)
memset(ret, 0, size);
}
+ VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
} else {
if (config_fill && opt_junk) {
arena_alloc_junk_small(ret, &arena_bin_info[binind],
@@ -1466,7 +1650,6 @@ arena_malloc_small(arena_t *arena, size_t size, bool zero)
VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
memset(ret, 0, size);
}
- VALGRIND_MAKE_MEM_UNDEFINED(ret, size);
return (ret);
}
@@ -1480,7 +1663,7 @@ arena_malloc_large(arena_t *arena, size_t size, bool zero)
/* Large allocation. */
size = PAGE_CEILING(size);
malloc_mutex_lock(&arena->lock);
- ret = (void *)arena_run_alloc(arena, size, true, BININD_INVALID, zero);
+ ret = (void *)arena_run_alloc_large(arena, size, zero);
if (ret == NULL) {
malloc_mutex_unlock(&arena->lock);
return (NULL);
@@ -1526,7 +1709,7 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero)
alloc_size = size + alignment - PAGE;
malloc_mutex_lock(&arena->lock);
- run = arena_run_alloc(arena, alloc_size, true, BININD_INVALID, zero);
+ run = arena_run_alloc_large(arena, alloc_size, false);
if (run == NULL) {
malloc_mutex_unlock(&arena->lock);
return (NULL);
@@ -1546,6 +1729,7 @@ arena_palloc(arena_t *arena, size_t size, size_t alignment, bool zero)
arena_run_trim_tail(arena, chunk, ret, size + trailsize, size,
false);
}
+ arena_run_init_large(arena, (arena_run_t *)ret, size, zero);
if (config_stats) {
arena->stats.nmalloc_large++;
@@ -1749,21 +1933,38 @@ arena_dalloc_small(arena_t *arena, arena_chunk_t *chunk, void *ptr,
arena_dalloc_bin(arena, chunk, ptr, pageind, mapelm);
}
+#ifdef JEMALLOC_JET
+#undef arena_dalloc_junk_large
+#define arena_dalloc_junk_large JEMALLOC_N(arena_dalloc_junk_large_impl)
+#endif
+static void
+arena_dalloc_junk_large(void *ptr, size_t usize)
+{
+
+ if (config_fill && opt_junk)
+ memset(ptr, 0x5a, usize);
+}
+#ifdef JEMALLOC_JET
+#undef arena_dalloc_junk_large
+#define arena_dalloc_junk_large JEMALLOC_N(arena_dalloc_junk_large)
+arena_dalloc_junk_large_t *arena_dalloc_junk_large =
+ JEMALLOC_N(arena_dalloc_junk_large_impl);
+#endif
+
void
arena_dalloc_large_locked(arena_t *arena, arena_chunk_t *chunk, void *ptr)
{
if (config_fill || config_stats) {
size_t pageind = ((uintptr_t)ptr - (uintptr_t)chunk) >> LG_PAGE;
- size_t size = arena_mapbits_large_size_get(chunk, pageind);
+ size_t usize = arena_mapbits_large_size_get(chunk, pageind);
- if (config_fill && config_stats && opt_junk)
- memset(ptr, 0x5a, size);
+ arena_dalloc_junk_large(ptr, usize);
if (config_stats) {
arena->stats.ndalloc_large++;
- arena->stats.allocated_large -= size;
- arena->stats.lstats[(size >> LG_PAGE) - 1].ndalloc++;
- arena->stats.lstats[(size >> LG_PAGE) - 1].curruns--;
+ arena->stats.allocated_large -= usize;
+ arena->stats.lstats[(usize >> LG_PAGE) - 1].ndalloc++;
+ arena->stats.lstats[(usize >> LG_PAGE) - 1].curruns--;
}
}
@@ -1834,9 +2035,8 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr,
size_t flag_dirty;
size_t splitsize = (oldsize + followsize <= size + extra)
? followsize : size + extra - oldsize;
- arena_run_split(arena, (arena_run_t *)((uintptr_t)chunk +
- ((pageind+npages) << LG_PAGE)), splitsize, true,
- BININD_INVALID, zero);
+ arena_run_split_large(arena, (arena_run_t *)((uintptr_t)chunk +
+ ((pageind+npages) << LG_PAGE)), splitsize, zero);
size = oldsize + splitsize;
npages = size >> LG_PAGE;
@@ -1875,6 +2075,26 @@ arena_ralloc_large_grow(arena_t *arena, arena_chunk_t *chunk, void *ptr,
return (true);
}
+#ifdef JEMALLOC_JET
+#undef arena_ralloc_junk_large
+#define arena_ralloc_junk_large JEMALLOC_N(arena_ralloc_junk_large_impl)
+#endif
+static void
+arena_ralloc_junk_large(void *ptr, size_t old_usize, size_t usize)
+{
+
+ if (config_fill && opt_junk) {
+ memset((void *)((uintptr_t)ptr + usize), 0x5a,
+ old_usize - usize);
+ }
+}
+#ifdef JEMALLOC_JET
+#undef arena_ralloc_junk_large
+#define arena_ralloc_junk_large JEMALLOC_N(arena_ralloc_junk_large)
+arena_ralloc_junk_large_t *arena_ralloc_junk_large =
+ JEMALLOC_N(arena_ralloc_junk_large_impl);
+#endif
+
/*
 * Try to resize a large allocation, in order to avoid copying. This will
 * always fail when growing an object if the following run is already in use.
@@ -1888,10 +2108,6 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra,
psize = PAGE_CEILING(size + extra);
if (psize == oldsize) {
/* Same size class. */
- if (config_fill && opt_junk && size < oldsize) {
- memset((void *)((uintptr_t)ptr + size), 0x5a, oldsize -
- size);
- }
return (false);
} else {
arena_chunk_t *chunk;
@@ -1902,10 +2118,7 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra,
if (psize < oldsize) {
 /* Fill before shrinking in order to avoid a race. */
- if (config_fill && opt_junk) {
- memset((void *)((uintptr_t)ptr + size), 0x5a,
- oldsize - size);
- }
+ arena_ralloc_junk_large(ptr, oldsize, psize);
arena_ralloc_large_shrink(arena, chunk, ptr, oldsize,
psize);
return (false);
@@ -1913,17 +2126,23 @@ arena_ralloc_large(void *ptr, size_t oldsize, size_t size, size_t extra,
bool ret = arena_ralloc_large_grow(arena, chunk, ptr,
oldsize, PAGE_CEILING(size),
psize - PAGE_CEILING(size), zero);
- if (config_fill && ret == false && zero == false &&
- opt_zero) {
- memset((void *)((uintptr_t)ptr + oldsize), 0,
- size - oldsize);
+ if (config_fill && ret == false && zero == false) {
+ if (opt_junk) {
+ memset((void *)((uintptr_t)ptr +
+ oldsize), 0xa5, isalloc(ptr,
+ config_prof) - oldsize);
+ } else if (opt_zero) {
+ memset((void *)((uintptr_t)ptr +
+ oldsize), 0, isalloc(ptr,
+ config_prof) - oldsize);
+ }
}
return (ret);
}
}
}
-void *
+bool
arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra,
bool zero)
{
@@ -1938,25 +2157,20 @@ arena_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra,
if ((size + extra <= SMALL_MAXCLASS &&
SMALL_SIZE2BIN(size + extra) ==
SMALL_SIZE2BIN(oldsize)) || (size <= oldsize &&
- size + extra >= oldsize)) {
- if (config_fill && opt_junk && size < oldsize) {
- memset((void *)((uintptr_t)ptr + size),
- 0x5a, oldsize - size);
- }
- return (ptr);
- }
+ size + extra >= oldsize))
+ return (false);
} else {
assert(size <= arena_maxclass);
if (size + extra > SMALL_MAXCLASS) {
if (arena_ralloc_large(ptr, oldsize, size,
extra, zero) == false)
- return (ptr);
+ return (false);
}
}
}
/* Reallocation would require a move. */
- return (NULL);
+ return (true);
}
void *
@@ -1968,9 +2182,8 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size,
size_t copysize;
/* Try to avoid moving the allocation. */
- ret = arena_ralloc_no_move(ptr, oldsize, size, extra, zero);
- if (ret != NULL)
- return (ret);
+ if (arena_ralloc_no_move(ptr, oldsize, size, extra, zero) == false)
+ return (ptr);
/*
* size and oldsize are different enough that we need to move the
@@ -1981,7 +2194,7 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size,
size_t usize = sa2u(size + extra, alignment);
if (usize == 0)
return (NULL);
- ret = ipallocx(usize, alignment, zero, try_tcache_alloc, arena);
+ ret = ipalloct(usize, alignment, zero, try_tcache_alloc, arena);
} else
ret = arena_malloc(arena, size + extra, zero, try_tcache_alloc);
@@ -1993,7 +2206,7 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size,
size_t usize = sa2u(size, alignment);
if (usize == 0)
return (NULL);
- ret = ipallocx(usize, alignment, zero, try_tcache_alloc,
+ ret = ipalloct(usize, alignment, zero, try_tcache_alloc,
arena);
} else
ret = arena_malloc(arena, size, zero, try_tcache_alloc);
@@ -2011,7 +2224,7 @@ arena_ralloc(arena_t *arena, void *ptr, size_t oldsize, size_t size,
copysize = (size < oldsize) ? size : oldsize;
VALGRIND_MAKE_MEM_UNDEFINED(ret, copysize);
memcpy(ret, ptr, copysize);
- iqallocx(ptr, try_tcache_dalloc);
+ iqalloct(ptr, try_tcache_dalloc);
return (ret);
}
@@ -2266,7 +2479,6 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info, size_t min_run_size)
bin_info->reg_interval) - pad_size;
} while (try_hdr_size > try_redzone0_offset);
} while (try_run_size <= arena_maxclass
- && try_run_size <= arena_maxclass
&& RUN_MAX_OVRHD * (bin_info->reg_interval << 3) >
RUN_MAX_OVRHD_RELAX
&& (try_redzone0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size
diff --git a/dep/jemalloc/src/bitmap.c b/dep/jemalloc/src/bitmap.c
index b47e2629093..e2bd907d558 100644
--- a/dep/jemalloc/src/bitmap.c
+++ b/dep/jemalloc/src/bitmap.c
@@ -1,4 +1,4 @@
-#define JEMALLOC_BITMAP_C_
+#define JEMALLOC_BITMAP_C_
#include "jemalloc/internal/jemalloc_internal.h"
/******************************************************************************/
diff --git a/dep/jemalloc/src/chunk.c b/dep/jemalloc/src/chunk.c
index 044f76be96c..90ab116ae5f 100644
--- a/dep/jemalloc/src/chunk.c
+++ b/dep/jemalloc/src/chunk.c
@@ -180,7 +180,7 @@ chunk_alloc(size_t size, size_t alignment, bool base, bool *zero,
label_return:
if (ret != NULL) {
if (config_ivsalloc && base == false) {
- if (rtree_set(chunks_rtree, (uintptr_t)ret, ret)) {
+ if (rtree_set(chunks_rtree, (uintptr_t)ret, 1)) {
chunk_dealloc(ret, size, true);
return (NULL);
}
@@ -214,7 +214,7 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
size_t size)
{
bool unzeroed;
- extent_node_t *xnode, *node, *prev, key;
+ extent_node_t *xnode, *node, *prev, *xprev, key;
unzeroed = pages_purge(chunk, size);
VALGRIND_MAKE_MEM_NOACCESS(chunk, size);
@@ -226,6 +226,8 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
* held.
*/
xnode = base_node_alloc();
+ /* Use xprev to implement conditional deferred deallocation of prev. */
+ xprev = NULL;
malloc_mutex_lock(&chunks_mtx);
key.addr = (void *)((uintptr_t)chunk + size);
@@ -242,8 +244,6 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
node->size += size;
node->zeroed = (node->zeroed && (unzeroed == false));
extent_tree_szad_insert(chunks_szad, node);
- if (xnode != NULL)
- base_node_dealloc(xnode);
} else {
/* Coalescing forward failed, so insert a new node. */
if (xnode == NULL) {
@@ -253,10 +253,10 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
* already been purged, so this is only a virtual
* memory leak.
*/
- malloc_mutex_unlock(&chunks_mtx);
- return;
+ goto label_return;
}
node = xnode;
+ xnode = NULL; /* Prevent deallocation below. */
node->addr = chunk;
node->size = size;
node->zeroed = (unzeroed == false);
@@ -282,9 +282,19 @@ chunk_record(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, void *chunk,
node->zeroed = (node->zeroed && prev->zeroed);
extent_tree_szad_insert(chunks_szad, node);
- base_node_dealloc(prev);
+ xprev = prev;
}
+
+label_return:
malloc_mutex_unlock(&chunks_mtx);
+ /*
+ * Deallocate xnode and/or xprev after unlocking chunks_mtx in order to
+ * avoid potential deadlock.
+ */
+ if (xnode != NULL)
+ base_node_dealloc(xnode);
+ if (xprev != NULL)
+ base_node_dealloc(xprev);
}
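
Both xnode and xprev follow the same discipline: base_node_dealloc() must not run while chunks_mtx is held, so candidates are stashed and freed after the unlock. The idiom in isolation, with a hypothetical list type and free() standing in for the allocator call that could deadlock:

    #include <pthread.h>
    #include <stdlib.h>

    struct node { struct node *next; };

    static pthread_mutex_t list_mtx = PTHREAD_MUTEX_INITIALIZER;
    static struct node *head;

    static void
    remove_first(void)
    {
        struct node *deferred = NULL;

        pthread_mutex_lock(&list_mtx);
        if (head != NULL) {
            deferred = head;
            head = head->next;    /* unlink under the lock */
        }
        pthread_mutex_unlock(&list_mtx);

        /* Deallocate only after dropping the lock, in case free()'s
         * internals take a mutex that other lock holders also need. */
        free(deferred);
    }
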
void
@@ -311,7 +321,7 @@ chunk_dealloc(void *chunk, size_t size, bool unmap)
assert((size & chunksize_mask) == 0);
if (config_ivsalloc)
- rtree_set(chunks_rtree, (uintptr_t)chunk, NULL);
+ rtree_set(chunks_rtree, (uintptr_t)chunk, 0);
if (config_stats || config_prof) {
malloc_mutex_lock(&chunks_mtx);
assert(stats_chunks.curchunks >= (size / chunksize));
@@ -346,7 +356,7 @@ chunk_boot(void)
extent_tree_ad_new(&chunks_ad_dss);
if (config_ivsalloc) {
chunks_rtree = rtree_new((ZU(1) << (LG_SIZEOF_PTR+3)) -
- opt_lg_chunk);
+ opt_lg_chunk, base_alloc, NULL);
if (chunks_rtree == NULL)
return (true);
}
@@ -358,7 +368,7 @@ void
chunk_prefork(void)
{
- malloc_mutex_lock(&chunks_mtx);
+ malloc_mutex_prefork(&chunks_mtx);
if (config_ivsalloc)
rtree_prefork(chunks_rtree);
chunk_dss_prefork();
diff --git a/dep/jemalloc/src/chunk_dss.c b/dep/jemalloc/src/chunk_dss.c
index 24781cc52dc..510bb8bee85 100644
--- a/dep/jemalloc/src/chunk_dss.c
+++ b/dep/jemalloc/src/chunk_dss.c
@@ -28,16 +28,17 @@ static void *dss_max;
/******************************************************************************/
-#ifndef JEMALLOC_HAVE_SBRK
static void *
-sbrk(intptr_t increment)
+chunk_dss_sbrk(intptr_t increment)
{
+#ifdef JEMALLOC_HAVE_SBRK
+ return (sbrk(increment));
+#else
not_implemented();
-
return (NULL);
-}
#endif
+}
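
Previously, platforms without sbrk(2) defined a local stub that reused the name sbrk, shadowing the libc prototype; the rewrite routes every caller through one wrapper and branches on the feature macro inside it. The shape of the fix, with a stand-in detection macro:

    #include <stdint.h>
    #include <unistd.h>

    static void *
    dss_sbrk(intptr_t increment)
    {
    #ifdef HAVE_SBRK    /* stand-in for the build system's feature test */
        return sbrk(increment);
    #else
        (void)increment;
        return NULL;    /* no data segment growth on this platform */
    #endif
    }
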
dss_prec_t
chunk_dss_prec_get(void)
@@ -93,7 +94,7 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero)
*/
do {
/* Get the current end of the DSS. */
- dss_max = sbrk(0);
+ dss_max = chunk_dss_sbrk(0);
/*
* Calculate how much padding is necessary to
* chunk-align the end of the DSS.
@@ -117,7 +118,7 @@ chunk_alloc_dss(size_t size, size_t alignment, bool *zero)
return (NULL);
}
incr = gap_size + cpad_size + size;
- dss_prev = sbrk(incr);
+ dss_prev = chunk_dss_sbrk(incr);
if (dss_prev == dss_max) {
/* Success. */
dss_max = dss_next;
@@ -163,7 +164,7 @@ chunk_dss_boot(void)
if (malloc_mutex_init(&dss_mtx))
return (true);
- dss_base = sbrk(0);
+ dss_base = chunk_dss_sbrk(0);
dss_prev = dss_base;
dss_max = dss_base;
diff --git a/dep/jemalloc/src/chunk_mmap.c b/dep/jemalloc/src/chunk_mmap.c
index 8a42e75915f..2056d793f05 100644
--- a/dep/jemalloc/src/chunk_mmap.c
+++ b/dep/jemalloc/src/chunk_mmap.c
@@ -43,7 +43,7 @@ pages_map(void *addr, size_t size)
if (munmap(ret, size) == -1) {
char buf[BUFERROR_BUF];
- buferror(buf, sizeof(buf));
+ buferror(get_errno(), buf, sizeof(buf));
malloc_printf("<jemalloc: Error in munmap(): %s\n",
buf);
if (opt_abort)
@@ -69,7 +69,7 @@ pages_unmap(void *addr, size_t size)
{
char buf[BUFERROR_BUF];
- buferror(buf, sizeof(buf));
+ buferror(get_errno(), buf, sizeof(buf));
malloc_printf("<jemalloc>: Error in "
#ifdef _WIN32
"VirtualFree"
diff --git a/dep/jemalloc/src/ckh.c b/dep/jemalloc/src/ckh.c
index 2f38348bb85..04c52966193 100644
--- a/dep/jemalloc/src/ckh.c
+++ b/dep/jemalloc/src/ckh.c
@@ -49,7 +49,7 @@ static void ckh_shrink(ckh_t *ckh);
* Search bucket for key and return the cell number if found; SIZE_T_MAX
* otherwise.
*/
-JEMALLOC_INLINE size_t
+JEMALLOC_INLINE_C size_t
ckh_bucket_search(ckh_t *ckh, size_t bucket, const void *key)
{
ckhc_t *cell;
@@ -67,7 +67,7 @@ ckh_bucket_search(ckh_t *ckh, size_t bucket, const void *key)
/*
* Search table for key and return cell number if found; SIZE_T_MAX otherwise.
*/
-JEMALLOC_INLINE size_t
+JEMALLOC_INLINE_C size_t
ckh_isearch(ckh_t *ckh, const void *key)
{
size_t hashes[2], bucket, cell;
@@ -88,7 +88,7 @@ ckh_isearch(ckh_t *ckh, const void *key)
return (cell);
}
-JEMALLOC_INLINE bool
+JEMALLOC_INLINE_C bool
ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key,
const void *data)
{
@@ -120,7 +120,7 @@ ckh_try_bucket_insert(ckh_t *ckh, size_t bucket, const void *key,
* eviction/relocation procedure until either success or detection of an
* eviction/relocation bucket cycle.
*/
-JEMALLOC_INLINE bool
+JEMALLOC_INLINE_C bool
ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey,
void const **argdata)
{
@@ -190,7 +190,7 @@ ckh_evict_reloc_insert(ckh_t *ckh, size_t argbucket, void const **argkey,
}
}
-JEMALLOC_INLINE bool
+JEMALLOC_INLINE_C bool
ckh_try_insert(ckh_t *ckh, void const**argkey, void const**argdata)
{
size_t hashes[2], bucket;
@@ -219,7 +219,7 @@ ckh_try_insert(ckh_t *ckh, void const**argkey, void const**argdata)
* Try to rebuild the hash table from scratch by inserting all items from the
* old table into the new.
*/
-JEMALLOC_INLINE bool
+JEMALLOC_INLINE_C bool
ckh_rebuild(ckh_t *ckh, ckhc_t *aTab)
{
size_t count, i, nins;
diff --git a/dep/jemalloc/src/ctl.c b/dep/jemalloc/src/ctl.c
index f2ef4e60611..cc2c5aef570 100644
--- a/dep/jemalloc/src/ctl.c
+++ b/dep/jemalloc/src/ctl.c
@@ -546,43 +546,30 @@ ctl_arena_refresh(arena_t *arena, unsigned i)
static bool
ctl_grow(void)
{
- size_t astats_size;
ctl_arena_stats_t *astats;
arena_t **tarenas;
- /* Extend arena stats and arenas arrays. */
- astats_size = (ctl_stats.narenas + 2) * sizeof(ctl_arena_stats_t);
- if (ctl_stats.narenas == narenas_auto) {
- /* ctl_stats.arenas and arenas came from base_alloc(). */
- astats = (ctl_arena_stats_t *)imalloc(astats_size);
- if (astats == NULL)
- return (true);
- memcpy(astats, ctl_stats.arenas, (ctl_stats.narenas + 1) *
- sizeof(ctl_arena_stats_t));
-
- tarenas = (arena_t **)imalloc((ctl_stats.narenas + 1) *
- sizeof(arena_t *));
- if (tarenas == NULL) {
- idalloc(astats);
- return (true);
- }
- memcpy(tarenas, arenas, ctl_stats.narenas * sizeof(arena_t *));
- } else {
- astats = (ctl_arena_stats_t *)iralloc(ctl_stats.arenas,
- astats_size, 0, 0, false, false);
- if (astats == NULL)
- return (true);
-
- tarenas = (arena_t **)iralloc(arenas, (ctl_stats.narenas + 1) *
- sizeof(arena_t *), 0, 0, false, false);
- if (tarenas == NULL)
- return (true);
+ /* Allocate extended arena stats and arenas arrays. */
+ astats = (ctl_arena_stats_t *)imalloc((ctl_stats.narenas + 2) *
+ sizeof(ctl_arena_stats_t));
+ if (astats == NULL)
+ return (true);
+ tarenas = (arena_t **)imalloc((ctl_stats.narenas + 1) *
+ sizeof(arena_t *));
+ if (tarenas == NULL) {
+ idalloc(astats);
+ return (true);
}
- /* Initialize the new astats and arenas elements. */
+
+ /* Initialize the new astats element. */
+ memcpy(astats, ctl_stats.arenas, (ctl_stats.narenas + 1) *
+ sizeof(ctl_arena_stats_t));
memset(&astats[ctl_stats.narenas + 1], 0, sizeof(ctl_arena_stats_t));
- if (ctl_arena_init(&astats[ctl_stats.narenas + 1]))
+ if (ctl_arena_init(&astats[ctl_stats.narenas + 1])) {
+ idalloc(tarenas);
+ idalloc(astats);
return (true);
- tarenas[ctl_stats.narenas] = NULL;
+ }
/* Swap merged stats to their new location. */
{
ctl_arena_stats_t tstats;
@@ -593,13 +580,34 @@ ctl_grow(void)
memcpy(&astats[ctl_stats.narenas + 1], &tstats,
sizeof(ctl_arena_stats_t));
}
+ /* Initialize the new arenas element. */
+ tarenas[ctl_stats.narenas] = NULL;
+ {
+ arena_t **arenas_old = arenas;
+ /*
+ * Swap extended arenas array into place. Although ctl_mtx
+ * protects this function from other threads extending the
+ * array, it does not protect from other threads mutating it
+ * (i.e. initializing arenas and setting array elements to
+ * point to them). Therefore, array copying must happen under
+ * the protection of arenas_lock.
+ */
+ malloc_mutex_lock(&arenas_lock);
+ arenas = tarenas;
+ memcpy(arenas, arenas_old, ctl_stats.narenas *
+ sizeof(arena_t *));
+ narenas_total++;
+ arenas_extend(narenas_total - 1);
+ malloc_mutex_unlock(&arenas_lock);
+ /*
+ * Deallocate arenas_old only if it came from imalloc() (not
+ * base_alloc()).
+ */
+ if (ctl_stats.narenas != narenas_auto)
+ idalloc(arenas_old);
+ }
ctl_stats.arenas = astats;
ctl_stats.narenas++;
- malloc_mutex_lock(&arenas_lock);
- arenas = tarenas;
- narenas_total++;
- arenas_extend(narenas_total - 1);
- malloc_mutex_unlock(&arenas_lock);
return (false);
}
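
The rewritten ctl_grow() allocates both new arrays up front so every failure path can unwind cleanly, and it performs the swap-and-copy of the arenas array strictly under arenas_lock, per the comment above. A condensed sketch of that publication step (names hypothetical):

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
    static void **table;
    static size_t table_len;

    /* bigger must have room for table_len + 1 entries. */
    static void
    publish_grown_table(void **bigger)
    {
        void **old;

        bigger[table_len] = NULL;    /* initialize the new slot first */

        pthread_mutex_lock(&table_lock);
        old = table;
        table = bigger;
        /* Copy under the lock: concurrent writers may still be
         * mutating the old array's existing slots. */
        if (old != NULL)
            memcpy(bigger, old, table_len * sizeof(void *));
        table_len++;
        pthread_mutex_unlock(&table_lock);

        free(old);    /* only when old was heap-allocated */
    }
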
@@ -921,7 +929,7 @@ void
ctl_prefork(void)
{
- malloc_mutex_lock(&ctl_mtx);
+ malloc_mutex_prefork(&ctl_mtx);
}
void
@@ -1102,6 +1110,8 @@ label_return: \
return (ret); \
}
+/******************************************************************************/
+
CTL_RO_NL_GEN(version, JEMALLOC_VERSION, const char *)
static int
@@ -1109,7 +1119,7 @@ epoch_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
void *newp, size_t newlen)
{
int ret;
- uint64_t newval;
+ UNUSED uint64_t newval;
malloc_mutex_lock(&ctl_mtx);
WRITE(newval, uint64_t);
@@ -1123,49 +1133,52 @@ label_return:
return (ret);
}
-static int
-thread_tcache_enabled_ctl(const size_t *mib, size_t miblen, void *oldp,
- size_t *oldlenp, void *newp, size_t newlen)
-{
- int ret;
- bool oldval;
-
- if (config_tcache == false)
- return (ENOENT);
-
- oldval = tcache_enabled_get();
- if (newp != NULL) {
- if (newlen != sizeof(bool)) {
- ret = EINVAL;
- goto label_return;
- }
- tcache_enabled_set(*(bool *)newp);
- }
- READ(oldval, bool);
-
- ret = 0;
-label_return:
- return (ret);
-}
-
-static int
-thread_tcache_flush_ctl(const size_t *mib, size_t miblen, void *oldp,
- size_t *oldlenp, void *newp, size_t newlen)
-{
- int ret;
+/******************************************************************************/
- if (config_tcache == false)
- return (ENOENT);
+CTL_RO_BOOL_CONFIG_GEN(config_debug)
+CTL_RO_BOOL_CONFIG_GEN(config_dss)
+CTL_RO_BOOL_CONFIG_GEN(config_fill)
+CTL_RO_BOOL_CONFIG_GEN(config_lazy_lock)
+CTL_RO_BOOL_CONFIG_GEN(config_mremap)
+CTL_RO_BOOL_CONFIG_GEN(config_munmap)
+CTL_RO_BOOL_CONFIG_GEN(config_prof)
+CTL_RO_BOOL_CONFIG_GEN(config_prof_libgcc)
+CTL_RO_BOOL_CONFIG_GEN(config_prof_libunwind)
+CTL_RO_BOOL_CONFIG_GEN(config_stats)
+CTL_RO_BOOL_CONFIG_GEN(config_tcache)
+CTL_RO_BOOL_CONFIG_GEN(config_tls)
+CTL_RO_BOOL_CONFIG_GEN(config_utrace)
+CTL_RO_BOOL_CONFIG_GEN(config_valgrind)
+CTL_RO_BOOL_CONFIG_GEN(config_xmalloc)
- READONLY();
- WRITEONLY();
+/******************************************************************************/
- tcache_flush();
+CTL_RO_NL_GEN(opt_abort, opt_abort, bool)
+CTL_RO_NL_GEN(opt_dss, opt_dss, const char *)
+CTL_RO_NL_GEN(opt_lg_chunk, opt_lg_chunk, size_t)
+CTL_RO_NL_GEN(opt_narenas, opt_narenas, size_t)
+CTL_RO_NL_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t)
+CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool)
+CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, bool)
+CTL_RO_NL_CGEN(config_fill, opt_quarantine, opt_quarantine, size_t)
+CTL_RO_NL_CGEN(config_fill, opt_redzone, opt_redzone, bool)
+CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool)
+CTL_RO_NL_CGEN(config_utrace, opt_utrace, opt_utrace, bool)
+CTL_RO_NL_CGEN(config_valgrind, opt_valgrind, opt_valgrind, bool)
+CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool)
+CTL_RO_NL_CGEN(config_tcache, opt_tcache, opt_tcache, bool)
+CTL_RO_NL_CGEN(config_tcache, opt_lg_tcache_max, opt_lg_tcache_max, ssize_t)
+CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool)
+CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *)
+CTL_RO_CGEN(config_prof, opt_prof_active, opt_prof_active, bool) /* Mutable. */
+CTL_RO_NL_CGEN(config_prof, opt_lg_prof_sample, opt_lg_prof_sample, size_t)
+CTL_RO_NL_CGEN(config_prof, opt_prof_accum, opt_prof_accum, bool)
+CTL_RO_NL_CGEN(config_prof, opt_lg_prof_interval, opt_lg_prof_interval, ssize_t)
+CTL_RO_NL_CGEN(config_prof, opt_prof_gdump, opt_prof_gdump, bool)
+CTL_RO_NL_CGEN(config_prof, opt_prof_final, opt_prof_final, bool)
+CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool)
- ret = 0;
-label_return:
- return (ret);
-}
+/******************************************************************************/
static int
thread_arena_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
@@ -1227,50 +1240,49 @@ CTL_RO_NL_CGEN(config_stats, thread_deallocated,
CTL_RO_NL_CGEN(config_stats, thread_deallocatedp,
&thread_allocated_tsd_get()->deallocated, uint64_t *)
-/******************************************************************************/
+static int
+thread_tcache_enabled_ctl(const size_t *mib, size_t miblen, void *oldp,
+ size_t *oldlenp, void *newp, size_t newlen)
+{
+ int ret;
+ bool oldval;
-CTL_RO_BOOL_CONFIG_GEN(config_debug)
-CTL_RO_BOOL_CONFIG_GEN(config_dss)
-CTL_RO_BOOL_CONFIG_GEN(config_fill)
-CTL_RO_BOOL_CONFIG_GEN(config_lazy_lock)
-CTL_RO_BOOL_CONFIG_GEN(config_mremap)
-CTL_RO_BOOL_CONFIG_GEN(config_munmap)
-CTL_RO_BOOL_CONFIG_GEN(config_prof)
-CTL_RO_BOOL_CONFIG_GEN(config_prof_libgcc)
-CTL_RO_BOOL_CONFIG_GEN(config_prof_libunwind)
-CTL_RO_BOOL_CONFIG_GEN(config_stats)
-CTL_RO_BOOL_CONFIG_GEN(config_tcache)
-CTL_RO_BOOL_CONFIG_GEN(config_tls)
-CTL_RO_BOOL_CONFIG_GEN(config_utrace)
-CTL_RO_BOOL_CONFIG_GEN(config_valgrind)
-CTL_RO_BOOL_CONFIG_GEN(config_xmalloc)
+ if (config_tcache == false)
+ return (ENOENT);
-/******************************************************************************/
+ oldval = tcache_enabled_get();
+ if (newp != NULL) {
+ if (newlen != sizeof(bool)) {
+ ret = EINVAL;
+ goto label_return;
+ }
+ tcache_enabled_set(*(bool *)newp);
+ }
+ READ(oldval, bool);
-CTL_RO_NL_GEN(opt_abort, opt_abort, bool)
-CTL_RO_NL_GEN(opt_dss, opt_dss, const char *)
-CTL_RO_NL_GEN(opt_lg_chunk, opt_lg_chunk, size_t)
-CTL_RO_NL_GEN(opt_narenas, opt_narenas, size_t)
-CTL_RO_NL_GEN(opt_lg_dirty_mult, opt_lg_dirty_mult, ssize_t)
-CTL_RO_NL_GEN(opt_stats_print, opt_stats_print, bool)
-CTL_RO_NL_CGEN(config_fill, opt_junk, opt_junk, bool)
-CTL_RO_NL_CGEN(config_fill, opt_zero, opt_zero, bool)
-CTL_RO_NL_CGEN(config_fill, opt_quarantine, opt_quarantine, size_t)
-CTL_RO_NL_CGEN(config_fill, opt_redzone, opt_redzone, bool)
-CTL_RO_NL_CGEN(config_utrace, opt_utrace, opt_utrace, bool)
-CTL_RO_NL_CGEN(config_valgrind, opt_valgrind, opt_valgrind, bool)
-CTL_RO_NL_CGEN(config_xmalloc, opt_xmalloc, opt_xmalloc, bool)
-CTL_RO_NL_CGEN(config_tcache, opt_tcache, opt_tcache, bool)
-CTL_RO_NL_CGEN(config_tcache, opt_lg_tcache_max, opt_lg_tcache_max, ssize_t)
-CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool)
-CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *)
-CTL_RO_CGEN(config_prof, opt_prof_active, opt_prof_active, bool) /* Mutable. */
-CTL_RO_NL_CGEN(config_prof, opt_lg_prof_sample, opt_lg_prof_sample, size_t)
-CTL_RO_NL_CGEN(config_prof, opt_lg_prof_interval, opt_lg_prof_interval, ssize_t)
-CTL_RO_NL_CGEN(config_prof, opt_prof_gdump, opt_prof_gdump, bool)
-CTL_RO_NL_CGEN(config_prof, opt_prof_final, opt_prof_final, bool)
-CTL_RO_NL_CGEN(config_prof, opt_prof_leak, opt_prof_leak, bool)
-CTL_RO_NL_CGEN(config_prof, opt_prof_accum, opt_prof_accum, bool)
+ ret = 0;
+label_return:
+ return (ret);
+}
+
+static int
+thread_tcache_flush_ctl(const size_t *mib, size_t miblen, void *oldp,
+ size_t *oldlenp, void *newp, size_t newlen)
+{
+ int ret;
+
+ if (config_tcache == false)
+ return (ENOENT);
+
+ READONLY();
+ WRITEONLY();
+
+ tcache_flush();
+
+ ret = 0;
+label_return:
+ return (ret);
+}
/******************************************************************************/
@@ -1382,31 +1394,8 @@ label_return:
return (ret);
}
-
/******************************************************************************/
-CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t)
-CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t)
-CTL_RO_NL_GEN(arenas_bin_i_run_size, arena_bin_info[mib[2]].run_size, size_t)
-static const ctl_named_node_t *
-arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i)
-{
-
- if (i > NBINS)
- return (NULL);
- return (super_arenas_bin_i_node);
-}
-
-CTL_RO_NL_GEN(arenas_lrun_i_size, ((mib[2]+1) << LG_PAGE), size_t)
-static const ctl_named_node_t *
-arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i)
-{
-
- if (i > nlclasses)
- return (NULL);
- return (super_arenas_lrun_i_node);
-}
-
static int
arenas_narenas_ctl(const size_t *mib, size_t miblen, void *oldp,
size_t *oldlenp, void *newp, size_t newlen)
@@ -1460,7 +1449,28 @@ CTL_RO_NL_GEN(arenas_page, PAGE, size_t)
CTL_RO_NL_CGEN(config_tcache, arenas_tcache_max, tcache_maxclass, size_t)
CTL_RO_NL_GEN(arenas_nbins, NBINS, unsigned)
CTL_RO_NL_CGEN(config_tcache, arenas_nhbins, nhbins, unsigned)
+CTL_RO_NL_GEN(arenas_bin_i_size, arena_bin_info[mib[2]].reg_size, size_t)
+CTL_RO_NL_GEN(arenas_bin_i_nregs, arena_bin_info[mib[2]].nregs, uint32_t)
+CTL_RO_NL_GEN(arenas_bin_i_run_size, arena_bin_info[mib[2]].run_size, size_t)
+static const ctl_named_node_t *
+arenas_bin_i_index(const size_t *mib, size_t miblen, size_t i)
+{
+
+ if (i > NBINS)
+ return (NULL);
+ return (super_arenas_bin_i_node);
+}
+
CTL_RO_NL_GEN(arenas_nlruns, nlclasses, size_t)
+CTL_RO_NL_GEN(arenas_lrun_i_size, ((mib[2]+1) << LG_PAGE), size_t)
+static const ctl_named_node_t *
+arenas_lrun_i_index(const size_t *mib, size_t miblen, size_t i)
+{
+
+ if (i > nlclasses)
+ return (NULL);
+ return (super_arenas_lrun_i_node);
+}
static int
arenas_purge_ctl(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
@@ -1567,6 +1577,11 @@ CTL_RO_NL_CGEN(config_prof, prof_interval, prof_interval, uint64_t)
/******************************************************************************/
+CTL_RO_CGEN(config_stats, stats_cactive, &stats_cactive, size_t *)
+CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats.allocated, size_t)
+CTL_RO_CGEN(config_stats, stats_active, ctl_stats.active, size_t)
+CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats.mapped, size_t)
+
CTL_RO_CGEN(config_stats, stats_chunks_current, ctl_stats.chunks.current,
size_t)
CTL_RO_CGEN(config_stats, stats_chunks_total, ctl_stats.chunks.total, uint64_t)
@@ -1574,6 +1589,20 @@ CTL_RO_CGEN(config_stats, stats_chunks_high, ctl_stats.chunks.high, size_t)
CTL_RO_CGEN(config_stats, stats_huge_allocated, huge_allocated, size_t)
CTL_RO_CGEN(config_stats, stats_huge_nmalloc, huge_nmalloc, uint64_t)
CTL_RO_CGEN(config_stats, stats_huge_ndalloc, huge_ndalloc, uint64_t)
+
+CTL_RO_GEN(stats_arenas_i_dss, ctl_stats.arenas[mib[2]].dss, const char *)
+CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned)
+CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t)
+CTL_RO_GEN(stats_arenas_i_pdirty, ctl_stats.arenas[mib[2]].pdirty, size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_mapped,
+ ctl_stats.arenas[mib[2]].astats.mapped, size_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_npurge,
+ ctl_stats.arenas[mib[2]].astats.npurge, uint64_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_nmadvise,
+ ctl_stats.arenas[mib[2]].astats.nmadvise, uint64_t)
+CTL_RO_CGEN(config_stats, stats_arenas_i_purged,
+ ctl_stats.arenas[mib[2]].astats.purged, uint64_t)
+
CTL_RO_CGEN(config_stats, stats_arenas_i_small_allocated,
ctl_stats.arenas[mib[2]].allocated_small, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_small_nmalloc,
@@ -1637,19 +1666,6 @@ stats_arenas_i_lruns_j_index(const size_t *mib, size_t miblen, size_t j)
return (super_stats_arenas_i_lruns_j_node);
}
-CTL_RO_GEN(stats_arenas_i_nthreads, ctl_stats.arenas[mib[2]].nthreads, unsigned)
-CTL_RO_GEN(stats_arenas_i_dss, ctl_stats.arenas[mib[2]].dss, const char *)
-CTL_RO_GEN(stats_arenas_i_pactive, ctl_stats.arenas[mib[2]].pactive, size_t)
-CTL_RO_GEN(stats_arenas_i_pdirty, ctl_stats.arenas[mib[2]].pdirty, size_t)
-CTL_RO_CGEN(config_stats, stats_arenas_i_mapped,
- ctl_stats.arenas[mib[2]].astats.mapped, size_t)
-CTL_RO_CGEN(config_stats, stats_arenas_i_npurge,
- ctl_stats.arenas[mib[2]].astats.npurge, uint64_t)
-CTL_RO_CGEN(config_stats, stats_arenas_i_nmadvise,
- ctl_stats.arenas[mib[2]].astats.nmadvise, uint64_t)
-CTL_RO_CGEN(config_stats, stats_arenas_i_purged,
- ctl_stats.arenas[mib[2]].astats.purged, uint64_t)
-
static const ctl_named_node_t *
stats_arenas_i_index(const size_t *mib, size_t miblen, size_t i)
{
@@ -1666,8 +1682,3 @@ label_return:
malloc_mutex_unlock(&ctl_mtx);
return (ret);
}
-
-CTL_RO_CGEN(config_stats, stats_cactive, &stats_cactive, size_t *)
-CTL_RO_CGEN(config_stats, stats_allocated, ctl_stats.allocated, size_t)
-CTL_RO_CGEN(config_stats, stats_active, ctl_stats.active, size_t)
-CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats.mapped, size_t)
diff --git a/dep/jemalloc/src/huge.c b/dep/jemalloc/src/huge.c
index aa08d43d362..d72f2135702 100644
--- a/dep/jemalloc/src/huge.c
+++ b/dep/jemalloc/src/huge.c
@@ -16,14 +16,14 @@ malloc_mutex_t huge_mtx;
static extent_tree_t huge;
void *
-huge_malloc(size_t size, bool zero)
+huge_malloc(size_t size, bool zero, dss_prec_t dss_prec)
{
- return (huge_palloc(size, chunksize, zero));
+ return (huge_palloc(size, chunksize, zero, dss_prec));
}
void *
-huge_palloc(size_t size, size_t alignment, bool zero)
+huge_palloc(size_t size, size_t alignment, bool zero, dss_prec_t dss_prec)
{
void *ret;
size_t csize;
@@ -48,8 +48,7 @@ huge_palloc(size_t size, size_t alignment, bool zero)
* it is possible to make correct junk/zero fill decisions below.
*/
is_zeroed = zero;
- ret = chunk_alloc(csize, alignment, false, &is_zeroed,
- chunk_dss_prec_get());
+ ret = chunk_alloc(csize, alignment, false, &is_zeroed, dss_prec);
if (ret == NULL) {
base_node_dealloc(node);
return (NULL);
@@ -78,7 +77,7 @@ huge_palloc(size_t size, size_t alignment, bool zero)
return (ret);
}
-void *
+bool
huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra)
{
@@ -89,28 +88,23 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra)
&& CHUNK_CEILING(oldsize) >= CHUNK_CEILING(size)
&& CHUNK_CEILING(oldsize) <= CHUNK_CEILING(size+extra)) {
assert(CHUNK_CEILING(oldsize) == oldsize);
- if (config_fill && opt_junk && size < oldsize) {
- memset((void *)((uintptr_t)ptr + size), 0x5a,
- oldsize - size);
- }
- return (ptr);
+ return (false);
}
/* Reallocation would require a move. */
- return (NULL);
+ return (true);
}
void *
huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
- size_t alignment, bool zero, bool try_tcache_dalloc)
+ size_t alignment, bool zero, bool try_tcache_dalloc, dss_prec_t dss_prec)
{
void *ret;
size_t copysize;
/* Try to avoid moving the allocation. */
- ret = huge_ralloc_no_move(ptr, oldsize, size, extra);
- if (ret != NULL)
- return (ret);
+ if (huge_ralloc_no_move(ptr, oldsize, size, extra) == false)
+ return (ptr);
/*
* size and oldsize are different enough that we need to use a
@@ -118,18 +112,18 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
* space and copying.
*/
if (alignment > chunksize)
- ret = huge_palloc(size + extra, alignment, zero);
+ ret = huge_palloc(size + extra, alignment, zero, dss_prec);
else
- ret = huge_malloc(size + extra, zero);
+ ret = huge_malloc(size + extra, zero, dss_prec);
if (ret == NULL) {
if (extra == 0)
return (NULL);
/* Try again, this time without extra. */
if (alignment > chunksize)
- ret = huge_palloc(size, alignment, zero);
+ ret = huge_palloc(size, alignment, zero, dss_prec);
else
- ret = huge_malloc(size, zero);
+ ret = huge_malloc(size, zero, dss_prec);
if (ret == NULL)
return (NULL);
@@ -169,23 +163,56 @@ huge_ralloc(void *ptr, size_t oldsize, size_t size, size_t extra,
*/
char buf[BUFERROR_BUF];
- buferror(buf, sizeof(buf));
+ buferror(get_errno(), buf, sizeof(buf));
malloc_printf("<jemalloc>: Error in mremap(): %s\n",
buf);
if (opt_abort)
abort();
memcpy(ret, ptr, copysize);
chunk_dealloc_mmap(ptr, oldsize);
+ } else if (config_fill && zero == false && opt_junk && oldsize
+ < newsize) {
+ /*
+ * mremap(2) clobbers the original mapping, so
+ * junk/zero filling is not preserved. There is no
+ * need to zero fill here, since any trailing
+			 * uninitialized memory is demand-zeroed by the
+ * kernel, but junk filling must be redone.
+ */
+ memset(ret + oldsize, 0xa5, newsize - oldsize);
}
} else
#endif
{
memcpy(ret, ptr, copysize);
- iqallocx(ptr, try_tcache_dalloc);
+ iqalloct(ptr, try_tcache_dalloc);
}
return (ret);
}
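
The new junk-refill branch covers a Linux-specific subtlety: mremap(2) preserves the old contents, but any bytes beyond the old size arrive demand-zeroed, so an allocator that junk-fills must redo the fill for the grown tail. A minimal sketch of the call it is guarding:

    #define _GNU_SOURCE
    #include <stddef.h>
    #include <sys/mman.h>

    static void *
    grow_mapping(void *old, size_t old_size, size_t new_size)
    {
        /* MREMAP_MAYMOVE lets the kernel relocate the mapping if it
         * cannot be extended in place; contents up to old_size move
         * with it, the rest is fresh zero pages. */
        void *p = mremap(old, old_size, new_size, MREMAP_MAYMOVE);

        return (p == MAP_FAILED) ? NULL : p;
    }
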
+#ifdef JEMALLOC_JET
+#undef huge_dalloc_junk
+#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk_impl)
+#endif
+static void
+huge_dalloc_junk(void *ptr, size_t usize)
+{
+
+ if (config_fill && config_dss && opt_junk) {
+ /*
+ * Only bother junk filling if the chunk isn't about to be
+ * unmapped.
+ */
+ if (config_munmap == false || (config_dss && chunk_in_dss(ptr)))
+ memset(ptr, 0x5a, usize);
+ }
+}
+#ifdef JEMALLOC_JET
+#undef huge_dalloc_junk
+#define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk)
+huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl);
+#endif
+
void
huge_dalloc(void *ptr, bool unmap)
{
@@ -208,8 +235,8 @@ huge_dalloc(void *ptr, bool unmap)
malloc_mutex_unlock(&huge_mtx);
- if (unmap && config_fill && config_dss && opt_junk)
- memset(node->addr, 0x5a, node->size);
+ if (unmap)
+ huge_dalloc_junk(node->addr, node->size);
chunk_dealloc(node->addr, node->size, unmap);
@@ -236,6 +263,13 @@ huge_salloc(const void *ptr)
return (ret);
}
+dss_prec_t
+huge_dss_prec_get(arena_t *arena)
+{
+
+ return (arena_dss_prec_get(choose_arena(arena)));
+}
+
prof_ctx_t *
huge_prof_ctx_get(const void *ptr)
{
diff --git a/dep/jemalloc/src/jemalloc.c b/dep/jemalloc/src/jemalloc.c
index bc350ed953b..204778bc89d 100644
--- a/dep/jemalloc/src/jemalloc.c
+++ b/dep/jemalloc/src/jemalloc.c
@@ -100,18 +100,12 @@ typedef struct {
#endif
/******************************************************************************/
-/* Function prototypes for non-inline static functions. */
-
-static void stats_print_atexit(void);
-static unsigned malloc_ncpus(void);
-static bool malloc_conf_next(char const **opts_p, char const **k_p,
- size_t *klen_p, char const **v_p, size_t *vlen_p);
-static void malloc_conf_error(const char *msg, const char *k, size_t klen,
- const char *v, size_t vlen);
-static void malloc_conf_init(void);
+/*
+ * Function prototypes for static functions that are referenced prior to
+ * definition.
+ */
+
static bool malloc_init_hard(void);
-static int imemalign(void **memptr, size_t alignment, size_t size,
- size_t min_alignment);
/******************************************************************************/
/*
@@ -252,7 +246,6 @@ stats_print_atexit(void)
static unsigned
malloc_ncpus(void)
{
- unsigned ret;
long result;
#ifdef _WIN32
@@ -262,14 +255,7 @@ malloc_ncpus(void)
#else
result = sysconf(_SC_NPROCESSORS_ONLN);
#endif
- if (result == -1) {
- /* Error. */
- ret = 1;
- } else {
- ret = (unsigned)result;
- }
-
- return (ret);
+ return ((result == -1) ? 1 : (unsigned)result);
}
void
@@ -282,7 +268,7 @@ arenas_cleanup(void *arg)
malloc_mutex_unlock(&arenas_lock);
}
-static JEMALLOC_ATTR(always_inline) void
+JEMALLOC_ALWAYS_INLINE_C void
malloc_thread_init(void)
{
@@ -299,7 +285,7 @@ malloc_thread_init(void)
quarantine_alloc_hook();
}
-static JEMALLOC_ATTR(always_inline) bool
+JEMALLOC_ALWAYS_INLINE_C bool
malloc_init(void)
{
@@ -436,8 +422,9 @@ malloc_conf_init(void)
}
break;
case 1: {
+ int linklen = 0;
#ifndef _WIN32
- int linklen;
+ int saved_errno = errno;
const char *linkname =
# ifdef JEMALLOC_PREFIX
"/etc/"JEMALLOC_PREFIX"malloc.conf"
@@ -446,21 +433,20 @@ malloc_conf_init(void)
# endif
;
- if ((linklen = readlink(linkname, buf,
- sizeof(buf) - 1)) != -1) {
- /*
- * Use the contents of the "/etc/malloc.conf"
- * symbolic link's name.
- */
- buf[linklen] = '\0';
- opts = buf;
- } else
-#endif
- {
+ /*
+ * Try to use the contents of the "/etc/malloc.conf"
+ * symbolic link's name.
+ */
+ linklen = readlink(linkname, buf, sizeof(buf) - 1);
+ if (linklen == -1) {
/* No configuration specified. */
- buf[0] = '\0';
- opts = buf;
+ linklen = 0;
+				/* Restore errno. */
+ set_errno(saved_errno);
}
+#endif
+ buf[linklen] = '\0';
+ opts = buf;
break;
} case 2: {
const char *envname =
@@ -484,8 +470,7 @@ malloc_conf_init(void)
}
break;
} default:
- /* NOTREACHED */
- assert(false);
+ not_reached();
buf[0] = '\0';
opts = buf;
}
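
The config-file probe now saves and restores errno around readlink(2): a missing /etc/malloc.conf symlink is the normal case, and a malloc implementation must not leak a spurious ENOENT into the calling program. The same pattern in isolation (caller supplies a buffer of at least one byte):

    #include <errno.h>
    #include <unistd.h>

    static void
    read_optional_link(const char *path, char *buf, size_t buflen)
    {
        int saved_errno = errno;
        ssize_t linklen = readlink(path, buf, buflen - 1);

        if (linklen == -1) {
            linklen = 0;           /* absent link == empty config */
            errno = saved_errno;   /* hide ENOENT from the caller */
        }
        buf[linklen] = '\0';
    }
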
@@ -522,14 +507,15 @@ malloc_conf_init(void)
"Invalid conf value", \
k, klen, v, vlen); \
} else if (clip) { \
- if (um < min) \
+ if (min != 0 && um < min) \
o = min; \
else if (um > max) \
o = max; \
else \
o = um; \
} else { \
- if (um < min || um > max) { \
+ if ((min != 0 && um < min) || \
+ um > max) { \
malloc_conf_error( \
"Out-of-range " \
"conf value", \
@@ -695,17 +681,6 @@ malloc_init_hard(void)
malloc_conf_init();
-#if (!defined(JEMALLOC_MUTEX_INIT_CB) && !defined(JEMALLOC_ZONE) \
- && !defined(_WIN32))
- /* Register fork handlers. */
- if (pthread_atfork(jemalloc_prefork, jemalloc_postfork_parent,
- jemalloc_postfork_child) != 0) {
- malloc_write("<jemalloc>: Error in pthread_atfork()\n");
- if (opt_abort)
- abort();
- }
-#endif
-
if (opt_stats_print) {
/* Print statistics at exit. */
if (atexit(stats_print_atexit) != 0) {
@@ -745,8 +720,10 @@ malloc_init_hard(void)
return (true);
}
- if (malloc_mutex_init(&arenas_lock))
+ if (malloc_mutex_init(&arenas_lock)) {
+ malloc_mutex_unlock(&init_lock);
return (true);
+ }
/*
* Create enough scaffolding to allow recursive allocation in
@@ -792,9 +769,25 @@ malloc_init_hard(void)
return (true);
}
- /* Get number of CPUs. */
malloc_mutex_unlock(&init_lock);
+ /**********************************************************************/
+ /* Recursive allocation may follow. */
+
ncpus = malloc_ncpus();
+
+#if (!defined(JEMALLOC_MUTEX_INIT_CB) && !defined(JEMALLOC_ZONE) \
+ && !defined(_WIN32))
+ /* LinuxThreads's pthread_atfork() allocates. */
+ if (pthread_atfork(jemalloc_prefork, jemalloc_postfork_parent,
+ jemalloc_postfork_child) != 0) {
+ malloc_write("<jemalloc>: Error in pthread_atfork()\n");
+ if (opt_abort)
+ abort();
+ }
+#endif
+
+ /* Done recursively allocating. */
+ /**********************************************************************/
malloc_mutex_lock(&init_lock);
if (mutex_boot()) {
@@ -841,6 +834,7 @@ malloc_init_hard(void)
malloc_initialized = true;
malloc_mutex_unlock(&init_lock);
+
return (false);
}
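
Fork-handler registration moved from early in malloc_init_hard() into the window where init_lock is dropped, because pthread_atfork() can itself call malloc() (LinuxThreads does), which would recurse into the initializer. A sketch of the registration, with hypothetical handler bodies:

    #include <pthread.h>

    static void prefork(void)         { /* acquire every allocator mutex */ }
    static void postfork_parent(void) { /* release them in the parent */ }
    static void postfork_child(void)  { /* reinitialize them in the child */ }

    static int
    register_fork_handlers(void)
    {
        /* May allocate internally, so only call this once recursive
         * allocation into the allocator is safe. */
        return pthread_atfork(prefork, postfork_parent, postfork_child);
    }
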
@@ -852,42 +846,88 @@ malloc_init_hard(void)
* Begin malloc(3)-compatible functions.
*/
+static void *
+imalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt)
+{
+ void *p;
+
+ if (cnt == NULL)
+ return (NULL);
+ if (prof_promote && usize <= SMALL_MAXCLASS) {
+ p = imalloc(SMALL_MAXCLASS+1);
+ if (p == NULL)
+ return (NULL);
+ arena_prof_promoted(p, usize);
+ } else
+ p = imalloc(usize);
+
+ return (p);
+}
+
+JEMALLOC_ALWAYS_INLINE_C void *
+imalloc_prof(size_t usize, prof_thr_cnt_t *cnt)
+{
+ void *p;
+
+ if ((uintptr_t)cnt != (uintptr_t)1U)
+ p = imalloc_prof_sample(usize, cnt);
+ else
+ p = imalloc(usize);
+ if (p == NULL)
+ return (NULL);
+ prof_malloc(p, usize, cnt);
+
+ return (p);
+}
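
imalloc_prof() splits on the sentinel (uintptr_t)1U, which PROF_ALLOC_PREP() uses to mean "allocation not sampled": the unsampled fast path stays a single compare, while sampled small requests are promoted past SMALL_MAXCLASS so their run metadata can carry a profiling context. A self-contained sketch of the split (sizes and hooks hypothetical; malloc() stands in for the internal allocator):

    #include <stdio.h>
    #include <stdlib.h>

    #define NOT_SAMPLED ((void *)1)    /* sentinel: no backtrace captured */
    #define SMALL_MAX 3584             /* hypothetical small/large boundary */

    static void *
    alloc_profiled(size_t usize, void *sample_ctx)
    {
        void *p;

        if (sample_ctx != NOT_SAMPLED && usize <= SMALL_MAX) {
            /* Promote: a large-class allocation has per-run metadata
             * that can point at the captured profiling context. */
            p = malloc(SMALL_MAX + 1);
        } else
            p = malloc(usize);
        if (p != NULL && sample_ctx != NOT_SAMPLED)
            printf("sampled %zu bytes at %p\n", usize, p);
        return p;
    }
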
+
+/*
+ * MALLOC_BODY() is a macro rather than a function because its contents are in
+ * the fast path, but inlining would cause reliability issues when determining
+ * how many frames to discard from heap profiling backtraces.
+ */
+#define MALLOC_BODY(ret, size, usize) do { \
+ if (malloc_init()) \
+ ret = NULL; \
+ else { \
+ if (config_prof && opt_prof) { \
+ prof_thr_cnt_t *cnt; \
+ \
+ usize = s2u(size); \
+ /* \
+ * Call PROF_ALLOC_PREP() here rather than in \
+ * imalloc_prof() so that imalloc_prof() can be \
+ * inlined without introducing uncertainty \
+ * about the number of backtrace frames to \
+ * ignore. imalloc_prof() is in the fast path \
+ * when heap profiling is enabled, so inlining \
+ * is critical to performance. (For \
+ * consistency all callers of PROF_ALLOC_PREP() \
+ * are structured similarly, even though e.g. \
+ * realloc() isn't called enough for inlining \
+ * to be critical.) \
+ */ \
+ PROF_ALLOC_PREP(1, usize, cnt); \
+ ret = imalloc_prof(usize, cnt); \
+ } else { \
+ if (config_stats || (config_valgrind && \
+ opt_valgrind)) \
+ usize = s2u(size); \
+ ret = imalloc(size); \
+ } \
+ } \
+} while (0)
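
The reason MALLOC_BODY() stays a macro: heap-profiling backtraces discard a fixed number of leading frames to hide allocator internals, and that count is only stable if the compiler cannot add or remove call frames between malloc() and the capture point. A toy illustration of the fragility (the frame count here is illustrative, not jemalloc's):

    #include <execinfo.h>    /* glibc backtrace(3) */
    #include <stdio.h>

    #define FRAMES_TO_SKIP 2    /* this function + its direct caller */

    static void
    log_call_site(void)
    {
        void *frames[16];
        int n = backtrace(frames, 16);

        /* If the compiler inlined (or out-lined) an intermediate
         * helper, FRAMES_TO_SKIP would point at the wrong frame. */
        if (n > FRAMES_TO_SKIP)
            printf("called from %p\n", frames[FRAMES_TO_SKIP]);
    }
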
+
void *
je_malloc(size_t size)
{
void *ret;
size_t usize JEMALLOC_CC_SILENCE_INIT(0);
- prof_thr_cnt_t *cnt JEMALLOC_CC_SILENCE_INIT(NULL);
-
- if (malloc_init()) {
- ret = NULL;
- goto label_oom;
- }
if (size == 0)
size = 1;
- if (config_prof && opt_prof) {
- usize = s2u(size);
- PROF_ALLOC_PREP(1, usize, cnt);
- if (cnt == NULL) {
- ret = NULL;
- goto label_oom;
- }
- if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <=
- SMALL_MAXCLASS) {
- ret = imalloc(SMALL_MAXCLASS+1);
- if (ret != NULL)
- arena_prof_promoted(ret, usize);
- } else
- ret = imalloc(size);
- } else {
- if (config_stats || (config_valgrind && opt_valgrind))
- usize = s2u(size);
- ret = imalloc(size);
- }
+ MALLOC_BODY(ret, size, usize);
-label_oom:
if (ret == NULL) {
if (config_xmalloc && opt_xmalloc) {
malloc_write("<jemalloc>: Error in malloc(): "
@@ -896,8 +936,6 @@ label_oom:
}
set_errno(ENOMEM);
}
- if (config_prof && opt_prof && ret != NULL)
- prof_malloc(ret, usize, cnt);
if (config_stats && ret != NULL) {
assert(usize == isalloc(ret, config_prof));
thread_allocated_tsd_get()->allocated += usize;
@@ -907,6 +945,42 @@ label_oom:
return (ret);
}
+static void *
+imemalign_prof_sample(size_t alignment, size_t usize, prof_thr_cnt_t *cnt)
+{
+ void *p;
+
+ if (cnt == NULL)
+ return (NULL);
+ if (prof_promote && usize <= SMALL_MAXCLASS) {
+ assert(sa2u(SMALL_MAXCLASS+1, alignment) != 0);
+ p = ipalloc(sa2u(SMALL_MAXCLASS+1, alignment), alignment,
+ false);
+ if (p == NULL)
+ return (NULL);
+ arena_prof_promoted(p, usize);
+ } else
+ p = ipalloc(usize, alignment, false);
+
+ return (p);
+}
+
+JEMALLOC_ALWAYS_INLINE_C void *
+imemalign_prof(size_t alignment, size_t usize, prof_thr_cnt_t *cnt)
+{
+ void *p;
+
+ if ((uintptr_t)cnt != (uintptr_t)1U)
+ p = imemalign_prof_sample(alignment, usize, cnt);
+ else
+ p = ipalloc(usize, alignment, false);
+ if (p == NULL)
+ return (NULL);
+ prof_malloc(p, usize, cnt);
+
+ return (p);
+}
+
JEMALLOC_ATTR(nonnull(1))
#ifdef JEMALLOC_PROF
/*
@@ -916,19 +990,18 @@ JEMALLOC_ATTR(nonnull(1))
JEMALLOC_NOINLINE
#endif
static int
-imemalign(void **memptr, size_t alignment, size_t size,
- size_t min_alignment)
+imemalign(void **memptr, size_t alignment, size_t size, size_t min_alignment)
{
int ret;
size_t usize;
void *result;
- prof_thr_cnt_t *cnt JEMALLOC_CC_SILENCE_INIT(NULL);
assert(min_alignment != 0);
- if (malloc_init())
+ if (malloc_init()) {
result = NULL;
- else {
+ goto label_oom;
+ } else {
if (size == 0)
size = 1;
@@ -948,57 +1021,38 @@ imemalign(void **memptr, size_t alignment, size_t size,
usize = sa2u(size, alignment);
if (usize == 0) {
result = NULL;
- ret = ENOMEM;
- goto label_return;
+ goto label_oom;
}
if (config_prof && opt_prof) {
+ prof_thr_cnt_t *cnt;
+
PROF_ALLOC_PREP(2, usize, cnt);
- if (cnt == NULL) {
- result = NULL;
- ret = EINVAL;
- } else {
- if (prof_promote && (uintptr_t)cnt !=
- (uintptr_t)1U && usize <= SMALL_MAXCLASS) {
- assert(sa2u(SMALL_MAXCLASS+1,
- alignment) != 0);
- result = ipalloc(sa2u(SMALL_MAXCLASS+1,
- alignment), alignment, false);
- if (result != NULL) {
- arena_prof_promoted(result,
- usize);
- }
- } else {
- result = ipalloc(usize, alignment,
- false);
- }
- }
+ result = imemalign_prof(alignment, usize, cnt);
} else
result = ipalloc(usize, alignment, false);
- }
-
- if (result == NULL) {
- if (config_xmalloc && opt_xmalloc) {
- malloc_write("<jemalloc>: Error allocating aligned "
- "memory: out of memory\n");
- abort();
- }
- ret = ENOMEM;
- goto label_return;
+ if (result == NULL)
+ goto label_oom;
}
*memptr = result;
ret = 0;
-
label_return:
if (config_stats && result != NULL) {
assert(usize == isalloc(result, config_prof));
thread_allocated_tsd_get()->allocated += usize;
}
- if (config_prof && opt_prof && result != NULL)
- prof_malloc(result, usize, cnt);
UTRACE(0, size, result);
return (ret);
+label_oom:
+ assert(result == NULL);
+ if (config_xmalloc && opt_xmalloc) {
+ malloc_write("<jemalloc>: Error allocating aligned memory: "
+ "out of memory\n");
+ abort();
+ }
+ ret = ENOMEM;
+ goto label_return;
}
int
@@ -1025,13 +1079,46 @@ je_aligned_alloc(size_t alignment, size_t size)
return (ret);
}
+static void *
+icalloc_prof_sample(size_t usize, prof_thr_cnt_t *cnt)
+{
+ void *p;
+
+ if (cnt == NULL)
+ return (NULL);
+ if (prof_promote && usize <= SMALL_MAXCLASS) {
+ p = icalloc(SMALL_MAXCLASS+1);
+ if (p == NULL)
+ return (NULL);
+ arena_prof_promoted(p, usize);
+ } else
+ p = icalloc(usize);
+
+ return (p);
+}
+
+JEMALLOC_ALWAYS_INLINE_C void *
+icalloc_prof(size_t usize, prof_thr_cnt_t *cnt)
+{
+ void *p;
+
+ if ((uintptr_t)cnt != (uintptr_t)1U)
+ p = icalloc_prof_sample(usize, cnt);
+ else
+ p = icalloc(usize);
+ if (p == NULL)
+ return (NULL);
+ prof_malloc(p, usize, cnt);
+
+ return (p);
+}
+
void *
je_calloc(size_t num, size_t size)
{
void *ret;
size_t num_size;
size_t usize JEMALLOC_CC_SILENCE_INIT(0);
- prof_thr_cnt_t *cnt JEMALLOC_CC_SILENCE_INIT(NULL);
if (malloc_init()) {
num_size = 0;
@@ -1060,19 +1147,11 @@ je_calloc(size_t num, size_t size)
}
if (config_prof && opt_prof) {
+ prof_thr_cnt_t *cnt;
+
usize = s2u(num_size);
PROF_ALLOC_PREP(1, usize, cnt);
- if (cnt == NULL) {
- ret = NULL;
- goto label_return;
- }
- if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize
- <= SMALL_MAXCLASS) {
- ret = icalloc(SMALL_MAXCLASS+1);
- if (ret != NULL)
- arena_prof_promoted(ret, usize);
- } else
- ret = icalloc(num_size);
+ ret = icalloc_prof(usize, cnt);
} else {
if (config_stats || (config_valgrind && opt_valgrind))
usize = s2u(num_size);
@@ -1088,9 +1167,6 @@ label_return:
}
set_errno(ENOMEM);
}
-
- if (config_prof && opt_prof && ret != NULL)
- prof_malloc(ret, usize, cnt);
if (config_stats && ret != NULL) {
assert(usize == isalloc(ret, config_prof));
thread_allocated_tsd_get()->allocated += usize;
@@ -1100,152 +1176,126 @@ label_return:
return (ret);
}
+static void *
+irealloc_prof_sample(void *oldptr, size_t usize, prof_thr_cnt_t *cnt)
+{
+ void *p;
+
+ if (cnt == NULL)
+ return (NULL);
+ if (prof_promote && usize <= SMALL_MAXCLASS) {
+ p = iralloc(oldptr, SMALL_MAXCLASS+1, 0, 0, false);
+ if (p == NULL)
+ return (NULL);
+ arena_prof_promoted(p, usize);
+ } else
+ p = iralloc(oldptr, usize, 0, 0, false);
+
+ return (p);
+}
+
+JEMALLOC_ALWAYS_INLINE_C void *
+irealloc_prof(void *oldptr, size_t old_usize, size_t usize, prof_thr_cnt_t *cnt)
+{
+ void *p;
+ prof_ctx_t *old_ctx;
+
+ old_ctx = prof_ctx_get(oldptr);
+ if ((uintptr_t)cnt != (uintptr_t)1U)
+ p = irealloc_prof_sample(oldptr, usize, cnt);
+ else
+ p = iralloc(oldptr, usize, 0, 0, false);
+ if (p == NULL)
+ return (NULL);
+ prof_realloc(p, usize, cnt, old_usize, old_ctx);
+
+ return (p);
+}
+
+JEMALLOC_INLINE_C void
+ifree(void *ptr)
+{
+ size_t usize;
+ UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0);
+
+ assert(ptr != NULL);
+ assert(malloc_initialized || IS_INITIALIZER);
+
+ if (config_prof && opt_prof) {
+ usize = isalloc(ptr, config_prof);
+ prof_free(ptr, usize);
+ } else if (config_stats || config_valgrind)
+ usize = isalloc(ptr, config_prof);
+ if (config_stats)
+ thread_allocated_tsd_get()->deallocated += usize;
+ if (config_valgrind && opt_valgrind)
+ rzsize = p2rz(ptr);
+ iqalloc(ptr);
+ JEMALLOC_VALGRIND_FREE(ptr, rzsize);
+}
+
void *
je_realloc(void *ptr, size_t size)
{
void *ret;
size_t usize JEMALLOC_CC_SILENCE_INIT(0);
- size_t old_size = 0;
- size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0);
- prof_thr_cnt_t *cnt JEMALLOC_CC_SILENCE_INIT(NULL);
- prof_ctx_t *old_ctx JEMALLOC_CC_SILENCE_INIT(NULL);
+ size_t old_usize = 0;
+ UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0);
if (size == 0) {
if (ptr != NULL) {
- /* realloc(ptr, 0) is equivalent to free(p). */
- assert(malloc_initialized || IS_INITIALIZER);
- if (config_prof) {
- old_size = isalloc(ptr, true);
- if (config_valgrind && opt_valgrind)
- old_rzsize = p2rz(ptr);
- } else if (config_stats) {
- old_size = isalloc(ptr, false);
- if (config_valgrind && opt_valgrind)
- old_rzsize = u2rz(old_size);
- } else if (config_valgrind && opt_valgrind) {
- old_size = isalloc(ptr, false);
- old_rzsize = u2rz(old_size);
- }
- if (config_prof && opt_prof) {
- old_ctx = prof_ctx_get(ptr);
- cnt = NULL;
- }
- iqalloc(ptr);
- ret = NULL;
- goto label_return;
- } else
- size = 1;
+ /* realloc(ptr, 0) is equivalent to free(ptr). */
+ UTRACE(ptr, 0, 0);
+ ifree(ptr);
+ return (NULL);
+ }
+ size = 1;
}
if (ptr != NULL) {
assert(malloc_initialized || IS_INITIALIZER);
malloc_thread_init();
- if (config_prof) {
- old_size = isalloc(ptr, true);
- if (config_valgrind && opt_valgrind)
- old_rzsize = p2rz(ptr);
- } else if (config_stats) {
- old_size = isalloc(ptr, false);
- if (config_valgrind && opt_valgrind)
- old_rzsize = u2rz(old_size);
- } else if (config_valgrind && opt_valgrind) {
- old_size = isalloc(ptr, false);
- old_rzsize = u2rz(old_size);
- }
+ if ((config_prof && opt_prof) || config_stats ||
+ (config_valgrind && opt_valgrind))
+ old_usize = isalloc(ptr, config_prof);
+ if (config_valgrind && opt_valgrind)
+ old_rzsize = config_prof ? p2rz(ptr) : u2rz(old_usize);
+
if (config_prof && opt_prof) {
+ prof_thr_cnt_t *cnt;
+
usize = s2u(size);
- old_ctx = prof_ctx_get(ptr);
PROF_ALLOC_PREP(1, usize, cnt);
- if (cnt == NULL) {
- old_ctx = NULL;
- ret = NULL;
- goto label_oom;
- }
- if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U &&
- usize <= SMALL_MAXCLASS) {
- ret = iralloc(ptr, SMALL_MAXCLASS+1, 0, 0,
- false, false);
- if (ret != NULL)
- arena_prof_promoted(ret, usize);
- else
- old_ctx = NULL;
- } else {
- ret = iralloc(ptr, size, 0, 0, false, false);
- if (ret == NULL)
- old_ctx = NULL;
- }
+ ret = irealloc_prof(ptr, old_usize, usize, cnt);
} else {
if (config_stats || (config_valgrind && opt_valgrind))
usize = s2u(size);
- ret = iralloc(ptr, size, 0, 0, false, false);
- }
-
-label_oom:
- if (ret == NULL) {
- if (config_xmalloc && opt_xmalloc) {
- malloc_write("<jemalloc>: Error in realloc(): "
- "out of memory\n");
- abort();
- }
- set_errno(ENOMEM);
+ ret = iralloc(ptr, size, 0, 0, false);
}
} else {
/* realloc(NULL, size) is equivalent to malloc(size). */
- if (config_prof && opt_prof)
- old_ctx = NULL;
- if (malloc_init()) {
- if (config_prof && opt_prof)
- cnt = NULL;
- ret = NULL;
- } else {
- if (config_prof && opt_prof) {
- usize = s2u(size);
- PROF_ALLOC_PREP(1, usize, cnt);
- if (cnt == NULL)
- ret = NULL;
- else {
- if (prof_promote && (uintptr_t)cnt !=
- (uintptr_t)1U && usize <=
- SMALL_MAXCLASS) {
- ret = imalloc(SMALL_MAXCLASS+1);
- if (ret != NULL) {
- arena_prof_promoted(ret,
- usize);
- }
- } else
- ret = imalloc(size);
- }
- } else {
- if (config_stats || (config_valgrind &&
- opt_valgrind))
- usize = s2u(size);
- ret = imalloc(size);
- }
- }
+ MALLOC_BODY(ret, size, usize);
+ }
- if (ret == NULL) {
- if (config_xmalloc && opt_xmalloc) {
- malloc_write("<jemalloc>: Error in realloc(): "
- "out of memory\n");
- abort();
- }
- set_errno(ENOMEM);
+ if (ret == NULL) {
+ if (config_xmalloc && opt_xmalloc) {
+ malloc_write("<jemalloc>: Error in realloc(): "
+ "out of memory\n");
+ abort();
}
+ set_errno(ENOMEM);
}
-
-label_return:
- if (config_prof && opt_prof)
- prof_realloc(ret, usize, cnt, old_size, old_ctx);
if (config_stats && ret != NULL) {
thread_allocated_t *ta;
assert(usize == isalloc(ret, config_prof));
ta = thread_allocated_tsd_get();
ta->allocated += usize;
- ta->deallocated += old_size;
+ ta->deallocated += old_usize;
}
UTRACE(ptr, size, ret);
- JEMALLOC_VALGRIND_REALLOC(ret, usize, ptr, old_size, old_rzsize, false);
+ JEMALLOC_VALGRIND_REALLOC(ret, usize, ptr, old_usize, old_rzsize,
+ false);
return (ret);
}
@@ -1254,24 +1304,8 @@ je_free(void *ptr)
{
UTRACE(ptr, 0, 0);
- if (ptr != NULL) {
- size_t usize;
- size_t rzsize JEMALLOC_CC_SILENCE_INIT(0);
-
- assert(malloc_initialized || IS_INITIALIZER);
-
- if (config_prof && opt_prof) {
- usize = isalloc(ptr, config_prof);
- prof_free(ptr, usize);
- } else if (config_stats || config_valgrind)
- usize = isalloc(ptr, config_prof);
- if (config_stats)
- thread_allocated_tsd_get()->deallocated += usize;
- if (config_valgrind && opt_valgrind)
- rzsize = p2rz(ptr);
- iqalloc(ptr);
- JEMALLOC_VALGRIND_FREE(ptr, rzsize);
- }
+ if (ptr != NULL)
+ ifree(ptr);
}
/*
@@ -1337,208 +1371,344 @@ JEMALLOC_EXPORT void *(* __memalign_hook)(size_t alignment, size_t size) =
* Begin non-standard functions.
*/
-size_t
-je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr)
+JEMALLOC_ALWAYS_INLINE_C void *
+imallocx(size_t usize, size_t alignment, bool zero, bool try_tcache,
+ arena_t *arena)
{
- size_t ret;
- assert(malloc_initialized || IS_INITIALIZER);
- malloc_thread_init();
+ assert(usize == ((alignment == 0) ? s2u(usize) : sa2u(usize,
+ alignment)));
- if (config_ivsalloc)
- ret = ivsalloc(ptr, config_prof);
+ if (alignment != 0)
+ return (ipalloct(usize, alignment, zero, try_tcache, arena));
+ else if (zero)
+ return (icalloct(usize, try_tcache, arena));
else
- ret = (ptr != NULL) ? isalloc(ptr, config_prof) : 0;
-
- return (ret);
+ return (imalloct(usize, try_tcache, arena));
}
-void
-je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
- const char *opts)
+static void *
+imallocx_prof_sample(size_t usize, size_t alignment, bool zero, bool try_tcache,
+ arena_t *arena, prof_thr_cnt_t *cnt)
{
+ void *p;
- stats_print(write_cb, cbopaque, opts);
+ if (cnt == NULL)
+ return (NULL);
+ if (prof_promote && usize <= SMALL_MAXCLASS) {
+ size_t usize_promoted = (alignment == 0) ?
+ s2u(SMALL_MAXCLASS+1) : sa2u(SMALL_MAXCLASS+1, alignment);
+ assert(usize_promoted != 0);
+ p = imallocx(usize_promoted, alignment, zero, try_tcache,
+ arena);
+ if (p == NULL)
+ return (NULL);
+ arena_prof_promoted(p, usize);
+ } else
+ p = imallocx(usize, alignment, zero, try_tcache, arena);
+
+ return (p);
}
-int
-je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp,
- size_t newlen)
+JEMALLOC_ALWAYS_INLINE_C void *
+imallocx_prof(size_t usize, size_t alignment, bool zero, bool try_tcache,
+ arena_t *arena, prof_thr_cnt_t *cnt)
{
+ void *p;
- if (malloc_init())
- return (EAGAIN);
+ if ((uintptr_t)cnt != (uintptr_t)1U) {
+ p = imallocx_prof_sample(usize, alignment, zero, try_tcache,
+ arena, cnt);
+ } else
+ p = imallocx(usize, alignment, zero, try_tcache, arena);
+ if (p == NULL)
+ return (NULL);
+ prof_malloc(p, usize, cnt);
- return (ctl_byname(name, oldp, oldlenp, newp, newlen));
+ return (p);
}
-int
-je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp)
+void *
+je_mallocx(size_t size, int flags)
{
+ void *p;
+ size_t usize;
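+	/*
+	 * Decode flags: the MALLOCX_LG_ALIGN bits hold lg(alignment), with
+	 * the all-zero encoding mapping to an alignment of 0; bits 8 and up
+	 * hold the arena index plus 1, where 0 means unspecified.
+	 */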
+ size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)
+ & (SIZE_T_MAX-1));
+ bool zero = flags & MALLOCX_ZERO;
+ unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
+ arena_t *arena;
+ bool try_tcache;
+
+ assert(size != 0);
if (malloc_init())
- return (EAGAIN);
+ goto label_oom;
- return (ctl_nametomib(name, mibp, miblenp));
+ if (arena_ind != UINT_MAX) {
+ arena = arenas[arena_ind];
+ try_tcache = false;
+ } else {
+ arena = NULL;
+ try_tcache = true;
+ }
+
+ usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment);
+ assert(usize != 0);
+
+ if (config_prof && opt_prof) {
+ prof_thr_cnt_t *cnt;
+
+ PROF_ALLOC_PREP(1, usize, cnt);
+ p = imallocx_prof(usize, alignment, zero, try_tcache, arena,
+ cnt);
+ } else
+ p = imallocx(usize, alignment, zero, try_tcache, arena);
+ if (p == NULL)
+ goto label_oom;
+
+ if (config_stats) {
+ assert(usize == isalloc(p, config_prof));
+ thread_allocated_tsd_get()->allocated += usize;
+ }
+ UTRACE(0, size, p);
+ JEMALLOC_VALGRIND_MALLOC(true, p, usize, zero);
+ return (p);
+label_oom:
+ if (config_xmalloc && opt_xmalloc) {
+ malloc_write("<jemalloc>: Error in mallocx(): out of memory\n");
+ abort();
+ }
+ UTRACE(0, size, 0);
+ return (NULL);
}
-int
-je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
- void *newp, size_t newlen)
+static void *
+irallocx_prof_sample(void *oldptr, size_t size, size_t alignment, size_t usize,
+ bool zero, bool try_tcache_alloc, bool try_tcache_dalloc, arena_t *arena,
+ prof_thr_cnt_t *cnt)
{
+ void *p;
- if (malloc_init())
- return (EAGAIN);
+ if (cnt == NULL)
+ return (NULL);
+ if (prof_promote && usize <= SMALL_MAXCLASS) {
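+		/*
+		 * Allocate the promoted size class, passing the remainder of
+		 * the requested size as extra so that the result still
+		 * satisfies the original request.
+		 */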
+ p = iralloct(oldptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >=
+ size) ? 0 : size - (SMALL_MAXCLASS+1), alignment, zero,
+ try_tcache_alloc, try_tcache_dalloc, arena);
+ if (p == NULL)
+ return (NULL);
+ arena_prof_promoted(p, usize);
+ } else {
+ p = iralloct(oldptr, size, 0, alignment, zero,
+ try_tcache_alloc, try_tcache_dalloc, arena);
+ }
- return (ctl_bymib(mib, miblen, oldp, oldlenp, newp, newlen));
+ return (p);
}
-/*
- * End non-standard functions.
- */
-/******************************************************************************/
-/*
- * Begin experimental functions.
- */
-#ifdef JEMALLOC_EXPERIMENTAL
-
-static JEMALLOC_ATTR(always_inline) void *
-iallocm(size_t usize, size_t alignment, bool zero, bool try_tcache,
- arena_t *arena)
+JEMALLOC_ALWAYS_INLINE_C void *
+irallocx_prof(void *oldptr, size_t old_usize, size_t size, size_t alignment,
+ size_t *usize, bool zero, bool try_tcache_alloc, bool try_tcache_dalloc,
+ arena_t *arena, prof_thr_cnt_t *cnt)
{
+ void *p;
+ prof_ctx_t *old_ctx;
- assert(usize == ((alignment == 0) ? s2u(usize) : sa2u(usize,
- alignment)));
+ old_ctx = prof_ctx_get(oldptr);
+ if ((uintptr_t)cnt != (uintptr_t)1U)
+ p = irallocx_prof_sample(oldptr, size, alignment, *usize, zero,
+ try_tcache_alloc, try_tcache_dalloc, arena, cnt);
+ else {
+ p = iralloct(oldptr, size, 0, alignment, zero,
+ try_tcache_alloc, try_tcache_dalloc, arena);
+ }
+ if (p == NULL)
+ return (NULL);
- if (alignment != 0)
- return (ipallocx(usize, alignment, zero, try_tcache, arena));
- else if (zero)
- return (icallocx(usize, try_tcache, arena));
- else
- return (imallocx(usize, try_tcache, arena));
+ if (p == oldptr && alignment != 0) {
+ /*
+ * The allocation did not move, so it is possible that the size
+ * class is smaller than would guarantee the requested
+ * alignment, and that the alignment constraint was
+ * serendipitously satisfied. Additionally, old_usize may not
+ * be the same as the current usize because of in-place large
+ * reallocation. Therefore, query the actual value of usize.
+ */
+ *usize = isalloc(p, config_prof);
+ }
+ prof_realloc(p, *usize, cnt, old_usize, old_ctx);
+
+ return (p);
}
-int
-je_allocm(void **ptr, size_t *rsize, size_t size, int flags)
+void *
+je_rallocx(void *ptr, size_t size, int flags)
{
void *p;
- size_t usize;
- size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK)
+ size_t usize, old_usize;
+ UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0);
+ size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)
& (SIZE_T_MAX-1));
- bool zero = flags & ALLOCM_ZERO;
+ bool zero = flags & MALLOCX_ZERO;
unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
+ bool try_tcache_alloc, try_tcache_dalloc;
arena_t *arena;
- bool try_tcache;
assert(ptr != NULL);
assert(size != 0);
-
- if (malloc_init())
- goto label_oom;
+ assert(malloc_initialized || IS_INITIALIZER);
+ malloc_thread_init();
if (arena_ind != UINT_MAX) {
+ arena_chunk_t *chunk;
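+		/*
+		 * Permit tcache-based deallocation only if ptr is huge
+		 * (chunk == ptr) or belongs to an arena other than the one
+		 * explicitly specified.
+		 */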
+ try_tcache_alloc = false;
+ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+ try_tcache_dalloc = (chunk == ptr || chunk->arena !=
+ arenas[arena_ind]);
arena = arenas[arena_ind];
- try_tcache = false;
} else {
+ try_tcache_alloc = true;
+ try_tcache_dalloc = true;
arena = NULL;
- try_tcache = true;
}
- usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment);
- if (usize == 0)
- goto label_oom;
+ if ((config_prof && opt_prof) || config_stats ||
+ (config_valgrind && opt_valgrind))
+ old_usize = isalloc(ptr, config_prof);
+ if (config_valgrind && opt_valgrind)
+ old_rzsize = u2rz(old_usize);
if (config_prof && opt_prof) {
prof_thr_cnt_t *cnt;
+ usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment);
+ assert(usize != 0);
PROF_ALLOC_PREP(1, usize, cnt);
- if (cnt == NULL)
+ p = irallocx_prof(ptr, old_usize, size, alignment, &usize, zero,
+ try_tcache_alloc, try_tcache_dalloc, arena, cnt);
+ if (p == NULL)
goto label_oom;
- if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && usize <=
- SMALL_MAXCLASS) {
- size_t usize_promoted = (alignment == 0) ?
- s2u(SMALL_MAXCLASS+1) : sa2u(SMALL_MAXCLASS+1,
- alignment);
- assert(usize_promoted != 0);
- p = iallocm(usize_promoted, alignment, zero,
- try_tcache, arena);
- if (p == NULL)
- goto label_oom;
- arena_prof_promoted(p, usize);
- } else {
- p = iallocm(usize, alignment, zero, try_tcache, arena);
- if (p == NULL)
- goto label_oom;
- }
- prof_malloc(p, usize, cnt);
} else {
- p = iallocm(usize, alignment, zero, try_tcache, arena);
+ p = iralloct(ptr, size, 0, alignment, zero, try_tcache_alloc,
+ try_tcache_dalloc, arena);
if (p == NULL)
goto label_oom;
+ if (config_stats || (config_valgrind && opt_valgrind))
+ usize = isalloc(p, config_prof);
}
- if (rsize != NULL)
- *rsize = usize;
- *ptr = p;
if (config_stats) {
- assert(usize == isalloc(p, config_prof));
- thread_allocated_tsd_get()->allocated += usize;
+ thread_allocated_t *ta;
+ ta = thread_allocated_tsd_get();
+ ta->allocated += usize;
+ ta->deallocated += old_usize;
}
- UTRACE(0, size, p);
- JEMALLOC_VALGRIND_MALLOC(true, p, usize, zero);
- return (ALLOCM_SUCCESS);
+ UTRACE(ptr, size, p);
+ JEMALLOC_VALGRIND_REALLOC(p, usize, ptr, old_usize, old_rzsize, zero);
+ return (p);
label_oom:
if (config_xmalloc && opt_xmalloc) {
- malloc_write("<jemalloc>: Error in allocm(): "
- "out of memory\n");
+ malloc_write("<jemalloc>: Error in rallocx(): out of memory\n");
abort();
}
- *ptr = NULL;
- UTRACE(0, size, 0);
- return (ALLOCM_ERR_OOM);
+ UTRACE(ptr, size, 0);
+ return (NULL);
}
-int
-je_rallocm(void **ptr, size_t *rsize, size_t size, size_t extra, int flags)
+JEMALLOC_ALWAYS_INLINE_C size_t
+ixallocx_helper(void *ptr, size_t old_usize, size_t size, size_t extra,
+ size_t alignment, bool zero, arena_t *arena)
+{
+ size_t usize;
+
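+	/* ixalloc() returns true if it fails to resize in place. */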
+ if (ixalloc(ptr, size, extra, alignment, zero))
+ return (old_usize);
+ usize = isalloc(ptr, config_prof);
+
+ return (usize);
+}
+
+static size_t
+ixallocx_prof_sample(void *ptr, size_t old_usize, size_t size, size_t extra,
+ size_t alignment, size_t max_usize, bool zero, arena_t *arena,
+ prof_thr_cnt_t *cnt)
{
- void *p, *q;
size_t usize;
- size_t old_size;
- size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0);
- size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK)
+
+ if (cnt == NULL)
+ return (old_usize);
+ /* Use minimum usize to determine whether promotion may happen. */
+ if (prof_promote && ((alignment == 0) ? s2u(size) : sa2u(size,
+ alignment)) <= SMALL_MAXCLASS) {
+ if (ixalloc(ptr, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >=
+ size+extra) ? 0 : size+extra - (SMALL_MAXCLASS+1),
+ alignment, zero))
+ return (old_usize);
+ usize = isalloc(ptr, config_prof);
+ if (max_usize < PAGE)
+ arena_prof_promoted(ptr, usize);
+ } else {
+ usize = ixallocx_helper(ptr, old_usize, size, extra, alignment,
+ zero, arena);
+ }
+
+ return (usize);
+}
+
+JEMALLOC_ALWAYS_INLINE_C size_t
+ixallocx_prof(void *ptr, size_t old_usize, size_t size, size_t extra,
+ size_t alignment, size_t max_usize, bool zero, arena_t *arena,
+ prof_thr_cnt_t *cnt)
+{
+ size_t usize;
+ prof_ctx_t *old_ctx;
+
+ old_ctx = prof_ctx_get(ptr);
+ if ((uintptr_t)cnt != (uintptr_t)1U) {
+		usize = ixallocx_prof_sample(ptr, old_usize, size, extra,
+		    alignment, max_usize, zero, arena, cnt);
+ } else {
+ usize = ixallocx_helper(ptr, old_usize, size, extra, alignment,
+ zero, arena);
+ }
+ if (usize == old_usize)
+ return (usize);
+ prof_realloc(ptr, usize, cnt, old_usize, old_ctx);
+
+ return (usize);
+}
+
+size_t
+je_xallocx(void *ptr, size_t size, size_t extra, int flags)
+{
+ size_t usize, old_usize;
+ UNUSED size_t old_rzsize JEMALLOC_CC_SILENCE_INIT(0);
+ size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)
& (SIZE_T_MAX-1));
- bool zero = flags & ALLOCM_ZERO;
- bool no_move = flags & ALLOCM_NO_MOVE;
+ bool zero = flags & MALLOCX_ZERO;
unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
- bool try_tcache_alloc, try_tcache_dalloc;
arena_t *arena;
assert(ptr != NULL);
- assert(*ptr != NULL);
assert(size != 0);
assert(SIZE_T_MAX - size >= extra);
assert(malloc_initialized || IS_INITIALIZER);
malloc_thread_init();
- if (arena_ind != UINT_MAX) {
- arena_chunk_t *chunk;
- try_tcache_alloc = true;
- chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(*ptr);
- try_tcache_dalloc = (chunk == *ptr || chunk->arena !=
- arenas[arena_ind]);
+ if (arena_ind != UINT_MAX)
arena = arenas[arena_ind];
- } else {
- try_tcache_alloc = true;
- try_tcache_dalloc = true;
+ else
arena = NULL;
- }
- p = *ptr;
+ old_usize = isalloc(ptr, config_prof);
+ if (config_valgrind && opt_valgrind)
+ old_rzsize = u2rz(old_usize);
+
if (config_prof && opt_prof) {
prof_thr_cnt_t *cnt;
-
/*
- * usize isn't knowable before iralloc() returns when extra is
+ * usize isn't knowable before ixalloc() returns when extra is
* non-zero. Therefore, compute its maximum possible value and
* use that in PROF_ALLOC_PREP() to decide whether to capture a
* backtrace. prof_realloc() will use the actual usize to
@@ -1546,112 +1716,51 @@ je_rallocm(void **ptr, size_t *rsize, size_t size, size_t extra, int flags)
*/
size_t max_usize = (alignment == 0) ? s2u(size+extra) :
sa2u(size+extra, alignment);
- prof_ctx_t *old_ctx = prof_ctx_get(p);
- old_size = isalloc(p, true);
- if (config_valgrind && opt_valgrind)
- old_rzsize = p2rz(p);
PROF_ALLOC_PREP(1, max_usize, cnt);
- if (cnt == NULL)
- goto label_oom;
- /*
- * Use minimum usize to determine whether promotion may happen.
- */
- if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U
- && ((alignment == 0) ? s2u(size) : sa2u(size, alignment))
- <= SMALL_MAXCLASS) {
- q = irallocx(p, SMALL_MAXCLASS+1, (SMALL_MAXCLASS+1 >=
- size+extra) ? 0 : size+extra - (SMALL_MAXCLASS+1),
- alignment, zero, no_move, try_tcache_alloc,
- try_tcache_dalloc, arena);
- if (q == NULL)
- goto label_err;
- if (max_usize < PAGE) {
- usize = max_usize;
- arena_prof_promoted(q, usize);
- } else
- usize = isalloc(q, config_prof);
- } else {
- q = irallocx(p, size, extra, alignment, zero, no_move,
- try_tcache_alloc, try_tcache_dalloc, arena);
- if (q == NULL)
- goto label_err;
- usize = isalloc(q, config_prof);
- }
- prof_realloc(q, usize, cnt, old_size, old_ctx);
- if (rsize != NULL)
- *rsize = usize;
+ usize = ixallocx_prof(ptr, old_usize, size, extra, alignment,
+ max_usize, zero, arena, cnt);
} else {
- if (config_stats) {
- old_size = isalloc(p, false);
- if (config_valgrind && opt_valgrind)
- old_rzsize = u2rz(old_size);
- } else if (config_valgrind && opt_valgrind) {
- old_size = isalloc(p, false);
- old_rzsize = u2rz(old_size);
- }
- q = irallocx(p, size, extra, alignment, zero, no_move,
- try_tcache_alloc, try_tcache_dalloc, arena);
- if (q == NULL)
- goto label_err;
- if (config_stats)
- usize = isalloc(q, config_prof);
- if (rsize != NULL) {
- if (config_stats == false)
- usize = isalloc(q, config_prof);
- *rsize = usize;
- }
+ usize = ixallocx_helper(ptr, old_usize, size, extra, alignment,
+ zero, arena);
}
+ if (usize == old_usize)
+ goto label_not_resized;
- *ptr = q;
if (config_stats) {
thread_allocated_t *ta;
ta = thread_allocated_tsd_get();
ta->allocated += usize;
- ta->deallocated += old_size;
+ ta->deallocated += old_usize;
}
- UTRACE(p, size, q);
- JEMALLOC_VALGRIND_REALLOC(q, usize, p, old_size, old_rzsize, zero);
- return (ALLOCM_SUCCESS);
-label_err:
- if (no_move) {
- UTRACE(p, size, q);
- return (ALLOCM_ERR_NOT_MOVED);
- }
-label_oom:
- if (config_xmalloc && opt_xmalloc) {
- malloc_write("<jemalloc>: Error in rallocm(): "
- "out of memory\n");
- abort();
- }
- UTRACE(p, size, 0);
- return (ALLOCM_ERR_OOM);
+ JEMALLOC_VALGRIND_REALLOC(ptr, usize, ptr, old_usize, old_rzsize, zero);
+label_not_resized:
+ UTRACE(ptr, size, ptr);
+ return (usize);
}
-int
-je_sallocm(const void *ptr, size_t *rsize, int flags)
+size_t
+je_sallocx(const void *ptr, int flags)
{
- size_t sz;
+ size_t usize;
assert(malloc_initialized || IS_INITIALIZER);
malloc_thread_init();
if (config_ivsalloc)
- sz = ivsalloc(ptr, config_prof);
+ usize = ivsalloc(ptr, config_prof);
else {
assert(ptr != NULL);
- sz = isalloc(ptr, config_prof);
+ usize = isalloc(ptr, config_prof);
}
- assert(rsize != NULL);
- *rsize = sz;
- return (ALLOCM_SUCCESS);
+ return (usize);
}
-int
-je_dallocm(void *ptr, int flags)
+void
+je_dallocx(void *ptr, int flags)
{
size_t usize;
- size_t rzsize JEMALLOC_CC_SILENCE_INIT(0);
+ UNUSED size_t rzsize JEMALLOC_CC_SILENCE_INIT(0);
unsigned arena_ind = ((unsigned)(flags >> 8)) - 1;
bool try_tcache;
@@ -1677,28 +1786,162 @@ je_dallocm(void *ptr, int flags)
thread_allocated_tsd_get()->deallocated += usize;
if (config_valgrind && opt_valgrind)
rzsize = p2rz(ptr);
- iqallocx(ptr, try_tcache);
+ iqalloct(ptr, try_tcache);
JEMALLOC_VALGRIND_FREE(ptr, rzsize);
-
- return (ALLOCM_SUCCESS);
}
-int
-je_nallocm(size_t *rsize, size_t size, int flags)
+size_t
+je_nallocx(size_t size, int flags)
{
size_t usize;
- size_t alignment = (ZU(1) << (flags & ALLOCM_LG_ALIGN_MASK)
+ size_t alignment = (ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK)
& (SIZE_T_MAX-1));
assert(size != 0);
if (malloc_init())
- return (ALLOCM_ERR_OOM);
+ return (0);
usize = (alignment == 0) ? s2u(size) : sa2u(size, alignment);
- if (usize == 0)
+ assert(usize != 0);
+ return (usize);
+}
+
+int
+je_mallctl(const char *name, void *oldp, size_t *oldlenp, void *newp,
+ size_t newlen)
+{
+
+ if (malloc_init())
+ return (EAGAIN);
+
+ return (ctl_byname(name, oldp, oldlenp, newp, newlen));
+}
+
+int
+je_mallctlnametomib(const char *name, size_t *mibp, size_t *miblenp)
+{
+
+ if (malloc_init())
+ return (EAGAIN);
+
+ return (ctl_nametomib(name, mibp, miblenp));
+}
+
+int
+je_mallctlbymib(const size_t *mib, size_t miblen, void *oldp, size_t *oldlenp,
+ void *newp, size_t newlen)
+{
+
+ if (malloc_init())
+ return (EAGAIN);
+
+ return (ctl_bymib(mib, miblen, oldp, oldlenp, newp, newlen));
+}
+
+void
+je_malloc_stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
+ const char *opts)
+{
+
+ stats_print(write_cb, cbopaque, opts);
+}
+
+size_t
+je_malloc_usable_size(JEMALLOC_USABLE_SIZE_CONST void *ptr)
+{
+ size_t ret;
+
+ assert(malloc_initialized || IS_INITIALIZER);
+ malloc_thread_init();
+
+ if (config_ivsalloc)
+ ret = ivsalloc(ptr, config_prof);
+ else
+ ret = (ptr != NULL) ? isalloc(ptr, config_prof) : 0;
+
+ return (ret);
+}
+
+/*
+ * End non-standard functions.
+ */
+/******************************************************************************/
+/*
+ * Begin experimental functions.
+ */
+#ifdef JEMALLOC_EXPERIMENTAL
+
+int
+je_allocm(void **ptr, size_t *rsize, size_t size, int flags)
+{
+ void *p;
+
+ assert(ptr != NULL);
+
+ p = je_mallocx(size, flags);
+ if (p == NULL)
return (ALLOCM_ERR_OOM);
+ if (rsize != NULL)
+ *rsize = isalloc(p, config_prof);
+ *ptr = p;
+ return (ALLOCM_SUCCESS);
+}
+int
+je_rallocm(void **ptr, size_t *rsize, size_t size, size_t extra, int flags)
+{
+ int ret;
+ bool no_move = flags & ALLOCM_NO_MOVE;
+
+ assert(ptr != NULL);
+ assert(*ptr != NULL);
+ assert(size != 0);
+ assert(SIZE_T_MAX - size >= extra);
+
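+	/* Map the deprecated rallocm() semantics onto xallocx()/rallocx(). */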
+ if (no_move) {
+ size_t usize = je_xallocx(*ptr, size, extra, flags);
+ ret = (usize >= size) ? ALLOCM_SUCCESS : ALLOCM_ERR_NOT_MOVED;
+ if (rsize != NULL)
+ *rsize = usize;
+ } else {
+ void *p = je_rallocx(*ptr, size+extra, flags);
+ if (p != NULL) {
+ *ptr = p;
+ ret = ALLOCM_SUCCESS;
+ } else
+ ret = ALLOCM_ERR_OOM;
+ if (rsize != NULL)
+ *rsize = isalloc(*ptr, config_prof);
+ }
+ return (ret);
+}
+
+int
+je_sallocm(const void *ptr, size_t *rsize, int flags)
+{
+
+ assert(rsize != NULL);
+ *rsize = je_sallocx(ptr, flags);
+ return (ALLOCM_SUCCESS);
+}
+
+int
+je_dallocm(void *ptr, int flags)
+{
+
+ je_dallocx(ptr, flags);
+ return (ALLOCM_SUCCESS);
+}
+
+int
+je_nallocm(size_t *rsize, size_t size, int flags)
+{
+ size_t usize;
+
+ usize = je_nallocx(size, flags);
+ if (usize == 0)
+ return (ALLOCM_ERR_OOM);
if (rsize != NULL)
*rsize = usize;
return (ALLOCM_SUCCESS);
@@ -1833,7 +2076,7 @@ a0alloc(size_t size, bool zero)
if (size <= arena_maxclass)
return (arena_malloc(arenas[0], size, zero, false));
else
- return (huge_malloc(size, zero));
+ return (huge_malloc(size, zero, huge_dss_prec_get(arenas[0])));
}
void *
diff --git a/dep/jemalloc/src/mutex.c b/dep/jemalloc/src/mutex.c
index 55e18c23713..788eca38703 100644
--- a/dep/jemalloc/src/mutex.c
+++ b/dep/jemalloc/src/mutex.c
@@ -6,7 +6,7 @@
#endif
#ifndef _CRT_SPINCOUNT
-#define _CRT_SPINCOUNT 4000
+#define _CRT_SPINCOUNT 4000
#endif
/******************************************************************************/
diff --git a/dep/jemalloc/src/prof.c b/dep/jemalloc/src/prof.c
index c133b95c2c6..7722b7b4373 100644
--- a/dep/jemalloc/src/prof.c
+++ b/dep/jemalloc/src/prof.c
@@ -24,7 +24,12 @@ bool opt_prof_gdump = false;
bool opt_prof_final = true;
bool opt_prof_leak = false;
bool opt_prof_accum = false;
-char opt_prof_prefix[PATH_MAX + 1];
+char opt_prof_prefix[
+ /* Minimize memory bloat for non-prof builds. */
+#ifdef JEMALLOC_PROF
+ PATH_MAX +
+#endif
+ 1];
uint64_t prof_interval = 0;
bool prof_promote;
@@ -54,10 +59,17 @@ static uint64_t prof_dump_useq;
/*
* This buffer is rather large for stack allocation, so use a single buffer for
- * all profile dumps. The buffer is implicitly protected by bt2ctx_mtx, since
- * it must be locked anyway during dumping.
+ * all profile dumps.
*/
-static char prof_dump_buf[PROF_DUMP_BUFSIZE];
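+/*
+ * prof_dump_mtx serializes profile dumps; it protects prof_dump_buf,
+ * prof_dump_buf_end, and prof_dump_fd below.
+ */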
+static malloc_mutex_t prof_dump_mtx;
+static char prof_dump_buf[
+ /* Minimize memory bloat for non-prof builds. */
+#ifdef JEMALLOC_PROF
+ PROF_DUMP_BUFSIZE
+#else
+ 1
+#endif
+];
static unsigned prof_dump_buf_end;
static int prof_dump_fd;
@@ -65,36 +77,6 @@ static int prof_dump_fd;
static bool prof_booted = false;
/******************************************************************************/
-/* Function prototypes for non-inline static functions. */
-
-static prof_bt_t *bt_dup(prof_bt_t *bt);
-static void bt_destroy(prof_bt_t *bt);
-#ifdef JEMALLOC_PROF_LIBGCC
-static _Unwind_Reason_Code prof_unwind_init_callback(
- struct _Unwind_Context *context, void *arg);
-static _Unwind_Reason_Code prof_unwind_callback(
- struct _Unwind_Context *context, void *arg);
-#endif
-static bool prof_flush(bool propagate_err);
-static bool prof_write(bool propagate_err, const char *s);
-static bool prof_printf(bool propagate_err, const char *format, ...)
- JEMALLOC_ATTR(format(printf, 2, 3));
-static void prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all,
- size_t *leak_nctx);
-static void prof_ctx_destroy(prof_ctx_t *ctx);
-static void prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt);
-static bool prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx,
- prof_bt_t *bt);
-static bool prof_dump_maps(bool propagate_err);
-static bool prof_dump(bool propagate_err, const char *filename,
- bool leakcheck);
-static void prof_dump_filename(char *filename, char v, int64_t vseq);
-static void prof_fdump(void);
-static void prof_bt_hash(const void *key, size_t r_hash[2]);
-static bool prof_bt_keycomp(const void *k1, const void *k2);
-static malloc_mutex_t *prof_ctx_mutex_choose(void);
-
-/******************************************************************************/
void
bt_init(prof_bt_t *bt, void **vec)
@@ -423,10 +405,169 @@ prof_backtrace(prof_bt_t *bt, unsigned nignore)
{
cassert(config_prof);
- assert(false);
+ not_reached();
}
#endif
+static malloc_mutex_t *
+prof_ctx_mutex_choose(void)
+{
+ unsigned nctxs = atomic_add_u(&cum_ctxs, 1);
+
+ return (&ctx_locks[(nctxs - 1) % PROF_NCTX_LOCKS]);
+}
+
+static void
+prof_ctx_init(prof_ctx_t *ctx, prof_bt_t *bt)
+{
+
+ ctx->bt = bt;
+ ctx->lock = prof_ctx_mutex_choose();
+ /*
+ * Set nlimbo to 1, in order to avoid a race condition with
+ * prof_ctx_merge()/prof_ctx_destroy().
+ */
+ ctx->nlimbo = 1;
+ ql_elm_new(ctx, dump_link);
+ memset(&ctx->cnt_merged, 0, sizeof(prof_cnt_t));
+ ql_new(&ctx->cnts_ql);
+}
+
+static void
+prof_ctx_destroy(prof_ctx_t *ctx)
+{
+ prof_tdata_t *prof_tdata;
+
+ cassert(config_prof);
+
+ /*
+ * Check that ctx is still unused by any thread cache before destroying
+ * it. prof_lookup() increments ctx->nlimbo in order to avoid a race
+ * condition with this function, as does prof_ctx_merge() in order to
+ * avoid a race between the main body of prof_ctx_merge() and entry
+ * into this function.
+ */
+ prof_tdata = prof_tdata_get(false);
+ assert((uintptr_t)prof_tdata > (uintptr_t)PROF_TDATA_STATE_MAX);
+ prof_enter(prof_tdata);
+ malloc_mutex_lock(ctx->lock);
+ if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0 &&
+ ctx->nlimbo == 1) {
+ assert(ctx->cnt_merged.curbytes == 0);
+ assert(ctx->cnt_merged.accumobjs == 0);
+ assert(ctx->cnt_merged.accumbytes == 0);
+ /* Remove ctx from bt2ctx. */
+ if (ckh_remove(&bt2ctx, ctx->bt, NULL, NULL))
+ not_reached();
+ prof_leave(prof_tdata);
+ /* Destroy ctx. */
+ malloc_mutex_unlock(ctx->lock);
+ bt_destroy(ctx->bt);
+ idalloc(ctx);
+ } else {
+ /*
+ * Compensate for increment in prof_ctx_merge() or
+ * prof_lookup().
+ */
+ ctx->nlimbo--;
+ malloc_mutex_unlock(ctx->lock);
+ prof_leave(prof_tdata);
+ }
+}
+
+static void
+prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt)
+{
+ bool destroy;
+
+ cassert(config_prof);
+
+ /* Merge cnt stats and detach from ctx. */
+ malloc_mutex_lock(ctx->lock);
+ ctx->cnt_merged.curobjs += cnt->cnts.curobjs;
+ ctx->cnt_merged.curbytes += cnt->cnts.curbytes;
+ ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs;
+ ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes;
+ ql_remove(&ctx->cnts_ql, cnt, cnts_link);
+ if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL &&
+ ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 0) {
+ /*
+ * Increment ctx->nlimbo in order to keep another thread from
+ * winning the race to destroy ctx while this one has ctx->lock
+ * dropped. Without this, it would be possible for another
+ * thread to:
+ *
+ * 1) Sample an allocation associated with ctx.
+ * 2) Deallocate the sampled object.
+ * 3) Successfully prof_ctx_destroy(ctx).
+ *
+ * The result would be that ctx no longer exists by the time
+ * this thread accesses it in prof_ctx_destroy().
+ */
+ ctx->nlimbo++;
+ destroy = true;
+ } else
+ destroy = false;
+ malloc_mutex_unlock(ctx->lock);
+ if (destroy)
+ prof_ctx_destroy(ctx);
+}
+
+static bool
+prof_lookup_global(prof_bt_t *bt, prof_tdata_t *prof_tdata, void **p_btkey,
+ prof_ctx_t **p_ctx, bool *p_new_ctx)
+{
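+	/*
+	 * These unions avoid casts between the typed pointers used here and
+	 * the void * keys/values that the ckh hash table traffics in.
+	 */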
+ union {
+ prof_ctx_t *p;
+ void *v;
+ } ctx;
+ union {
+ prof_bt_t *p;
+ void *v;
+ } btkey;
+ bool new_ctx;
+
+ prof_enter(prof_tdata);
+ if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) {
+ /* bt has never been seen before. Insert it. */
+ ctx.v = imalloc(sizeof(prof_ctx_t));
+ if (ctx.v == NULL) {
+ prof_leave(prof_tdata);
+ return (true);
+ }
+ btkey.p = bt_dup(bt);
+ if (btkey.v == NULL) {
+ prof_leave(prof_tdata);
+ idalloc(ctx.v);
+ return (true);
+ }
+ prof_ctx_init(ctx.p, btkey.p);
+ if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) {
+ /* OOM. */
+ prof_leave(prof_tdata);
+ idalloc(btkey.v);
+ idalloc(ctx.v);
+ return (true);
+ }
+ new_ctx = true;
+ } else {
+ /*
+ * Increment nlimbo, in order to avoid a race condition with
+ * prof_ctx_merge()/prof_ctx_destroy().
+ */
+ malloc_mutex_lock(ctx.p->lock);
+ ctx.p->nlimbo++;
+ malloc_mutex_unlock(ctx.p->lock);
+ new_ctx = false;
+ }
+ prof_leave(prof_tdata);
+
+ *p_btkey = btkey.v;
+ *p_ctx = ctx.p;
+ *p_new_ctx = new_ctx;
+ return (false);
+}
+
prof_thr_cnt_t *
prof_lookup(prof_bt_t *bt)
{
@@ -443,62 +584,16 @@ prof_lookup(prof_bt_t *bt)
return (NULL);
if (ckh_search(&prof_tdata->bt2cnt, bt, NULL, &ret.v)) {
- union {
- prof_bt_t *p;
- void *v;
- } btkey;
- union {
- prof_ctx_t *p;
- void *v;
- } ctx;
+ void *btkey;
+ prof_ctx_t *ctx;
bool new_ctx;
/*
* This thread's cache lacks bt. Look for it in the global
* cache.
*/
- prof_enter(prof_tdata);
- if (ckh_search(&bt2ctx, bt, &btkey.v, &ctx.v)) {
- /* bt has never been seen before. Insert it. */
- ctx.v = imalloc(sizeof(prof_ctx_t));
- if (ctx.v == NULL) {
- prof_leave(prof_tdata);
- return (NULL);
- }
- btkey.p = bt_dup(bt);
- if (btkey.v == NULL) {
- prof_leave(prof_tdata);
- idalloc(ctx.v);
- return (NULL);
- }
- ctx.p->bt = btkey.p;
- ctx.p->lock = prof_ctx_mutex_choose();
- /*
- * Set nlimbo to 1, in order to avoid a race condition
- * with prof_ctx_merge()/prof_ctx_destroy().
- */
- ctx.p->nlimbo = 1;
- memset(&ctx.p->cnt_merged, 0, sizeof(prof_cnt_t));
- ql_new(&ctx.p->cnts_ql);
- if (ckh_insert(&bt2ctx, btkey.v, ctx.v)) {
- /* OOM. */
- prof_leave(prof_tdata);
- idalloc(btkey.v);
- idalloc(ctx.v);
- return (NULL);
- }
- new_ctx = true;
- } else {
- /*
- * Increment nlimbo, in order to avoid a race condition
- * with prof_ctx_merge()/prof_ctx_destroy().
- */
- malloc_mutex_lock(ctx.p->lock);
- ctx.p->nlimbo++;
- malloc_mutex_unlock(ctx.p->lock);
- new_ctx = false;
- }
- prof_leave(prof_tdata);
+ if (prof_lookup_global(bt, prof_tdata, &btkey, &ctx, &new_ctx))
+ return (NULL);
 		/* Link a prof_thr_cnt_t into ctx for this thread. */
if (ckh_count(&prof_tdata->bt2cnt) == PROF_TCMAX) {
@@ -511,7 +606,7 @@ prof_lookup(prof_bt_t *bt)
assert(ret.v != NULL);
if (ckh_remove(&prof_tdata->bt2cnt, ret.p->ctx->bt,
NULL, NULL))
- assert(false);
+ not_reached();
ql_remove(&prof_tdata->lru_ql, ret.p, lru_link);
prof_ctx_merge(ret.p->ctx, ret.p);
/* ret can now be re-used. */
@@ -521,27 +616,27 @@ prof_lookup(prof_bt_t *bt)
ret.v = imalloc(sizeof(prof_thr_cnt_t));
if (ret.p == NULL) {
if (new_ctx)
- prof_ctx_destroy(ctx.p);
+ prof_ctx_destroy(ctx);
return (NULL);
}
ql_elm_new(ret.p, cnts_link);
ql_elm_new(ret.p, lru_link);
}
/* Finish initializing ret. */
- ret.p->ctx = ctx.p;
+ ret.p->ctx = ctx;
ret.p->epoch = 0;
memset(&ret.p->cnts, 0, sizeof(prof_cnt_t));
- if (ckh_insert(&prof_tdata->bt2cnt, btkey.v, ret.v)) {
+ if (ckh_insert(&prof_tdata->bt2cnt, btkey, ret.v)) {
if (new_ctx)
- prof_ctx_destroy(ctx.p);
+ prof_ctx_destroy(ctx);
idalloc(ret.v);
return (NULL);
}
ql_head_insert(&prof_tdata->lru_ql, ret.p, lru_link);
- malloc_mutex_lock(ctx.p->lock);
- ql_tail_insert(&ctx.p->cnts_ql, ret.p, cnts_link);
- ctx.p->nlimbo--;
- malloc_mutex_unlock(ctx.p->lock);
+ malloc_mutex_lock(ctx->lock);
+ ql_tail_insert(&ctx->cnts_ql, ret.p, cnts_link);
+ ctx->nlimbo--;
+ malloc_mutex_unlock(ctx->lock);
} else {
/* Move ret to the front of the LRU. */
ql_remove(&prof_tdata->lru_ql, ret.p, lru_link);
@@ -551,8 +646,52 @@ prof_lookup(prof_bt_t *bt)
return (ret.p);
}
+#ifdef JEMALLOC_JET
+size_t
+prof_bt_count(void)
+{
+ size_t bt_count;
+ prof_tdata_t *prof_tdata;
+
+ prof_tdata = prof_tdata_get(false);
+ if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
+ return (0);
+
+ prof_enter(prof_tdata);
+ bt_count = ckh_count(&bt2ctx);
+ prof_leave(prof_tdata);
+
+ return (bt_count);
+}
+#endif
+
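+/*
+ * Under JEMALLOC_JET, prof_dump_open is exposed through a function pointer so
+ * that unit tests can interpose their own implementation.
+ */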
+#ifdef JEMALLOC_JET
+#undef prof_dump_open
+#define prof_dump_open JEMALLOC_N(prof_dump_open_impl)
+#endif
+static int
+prof_dump_open(bool propagate_err, const char *filename)
+{
+ int fd;
+
+ fd = creat(filename, 0644);
+ if (fd == -1 && propagate_err == false) {
+		malloc_printf("<jemalloc>: creat(\"%s\", 0644) failed\n",
+ filename);
+ if (opt_abort)
+ abort();
+ }
+
+ return (fd);
+}
+#ifdef JEMALLOC_JET
+#undef prof_dump_open
+#define prof_dump_open JEMALLOC_N(prof_dump_open)
+prof_dump_open_t *prof_dump_open = JEMALLOC_N(prof_dump_open_impl);
+#endif
+
static bool
-prof_flush(bool propagate_err)
+prof_dump_flush(bool propagate_err)
{
bool ret = false;
ssize_t err;
@@ -575,7 +714,20 @@ prof_flush(bool propagate_err)
}
static bool
-prof_write(bool propagate_err, const char *s)
+prof_dump_close(bool propagate_err)
+{
+ bool ret;
+
+ assert(prof_dump_fd != -1);
+ ret = prof_dump_flush(propagate_err);
+ close(prof_dump_fd);
+ prof_dump_fd = -1;
+
+ return (ret);
+}
+
+static bool
+prof_dump_write(bool propagate_err, const char *s)
{
unsigned i, slen, n;
@@ -586,7 +738,7 @@ prof_write(bool propagate_err, const char *s)
while (i < slen) {
/* Flush the buffer if it is full. */
if (prof_dump_buf_end == PROF_DUMP_BUFSIZE)
- if (prof_flush(propagate_err) && propagate_err)
+ if (prof_dump_flush(propagate_err) && propagate_err)
return (true);
if (prof_dump_buf_end + slen <= PROF_DUMP_BUFSIZE) {
@@ -606,7 +758,7 @@ prof_write(bool propagate_err, const char *s)
JEMALLOC_ATTR(format(printf, 2, 3))
static bool
-prof_printf(bool propagate_err, const char *format, ...)
+prof_dump_printf(bool propagate_err, const char *format, ...)
{
bool ret;
va_list ap;
@@ -615,13 +767,14 @@ prof_printf(bool propagate_err, const char *format, ...)
va_start(ap, format);
malloc_vsnprintf(buf, sizeof(buf), format, ap);
va_end(ap);
- ret = prof_write(propagate_err, buf);
+ ret = prof_dump_write(propagate_err, buf);
return (ret);
}
static void
-prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
+prof_dump_ctx_prep(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx,
+ prof_ctx_list_t *ctx_ql)
{
prof_thr_cnt_t *thr_cnt;
prof_cnt_t tcnt;
@@ -630,6 +783,14 @@ prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
malloc_mutex_lock(ctx->lock);
+ /*
+ * Increment nlimbo so that ctx won't go away before dump.
+ * Additionally, link ctx into the dump list so that it is included in
+ * prof_dump()'s second pass.
+ */
+ ctx->nlimbo++;
+ ql_tail_insert(ctx_ql, ctx, dump_link);
+
memcpy(&ctx->cnt_summed, &ctx->cnt_merged, sizeof(prof_cnt_t));
ql_foreach(thr_cnt, &ctx->cnts_ql, cnts_link) {
volatile unsigned *epoch = &thr_cnt->epoch;
@@ -670,89 +831,52 @@ prof_ctx_sum(prof_ctx_t *ctx, prof_cnt_t *cnt_all, size_t *leak_nctx)
malloc_mutex_unlock(ctx->lock);
}
-static void
-prof_ctx_destroy(prof_ctx_t *ctx)
+static bool
+prof_dump_header(bool propagate_err, const prof_cnt_t *cnt_all)
{
- prof_tdata_t *prof_tdata;
- cassert(config_prof);
-
- /*
- * Check that ctx is still unused by any thread cache before destroying
- * it. prof_lookup() increments ctx->nlimbo in order to avoid a race
- * condition with this function, as does prof_ctx_merge() in order to
- * avoid a race between the main body of prof_ctx_merge() and entry
- * into this function.
- */
- prof_tdata = prof_tdata_get(false);
- assert((uintptr_t)prof_tdata > (uintptr_t)PROF_TDATA_STATE_MAX);
- prof_enter(prof_tdata);
- malloc_mutex_lock(ctx->lock);
- if (ql_first(&ctx->cnts_ql) == NULL && ctx->cnt_merged.curobjs == 0 &&
- ctx->nlimbo == 1) {
- assert(ctx->cnt_merged.curbytes == 0);
- assert(ctx->cnt_merged.accumobjs == 0);
- assert(ctx->cnt_merged.accumbytes == 0);
- /* Remove ctx from bt2ctx. */
- if (ckh_remove(&bt2ctx, ctx->bt, NULL, NULL))
- assert(false);
- prof_leave(prof_tdata);
- /* Destroy ctx. */
- malloc_mutex_unlock(ctx->lock);
- bt_destroy(ctx->bt);
- idalloc(ctx);
+ if (opt_lg_prof_sample == 0) {
+ if (prof_dump_printf(propagate_err,
+ "heap profile: %"PRId64": %"PRId64
+ " [%"PRIu64": %"PRIu64"] @ heapprofile\n",
+ cnt_all->curobjs, cnt_all->curbytes,
+ cnt_all->accumobjs, cnt_all->accumbytes))
+ return (true);
} else {
- /*
- * Compensate for increment in prof_ctx_merge() or
- * prof_lookup().
- */
- ctx->nlimbo--;
- malloc_mutex_unlock(ctx->lock);
- prof_leave(prof_tdata);
+ if (prof_dump_printf(propagate_err,
+ "heap profile: %"PRId64": %"PRId64
+ " [%"PRIu64": %"PRIu64"] @ heap_v2/%"PRIu64"\n",
+ cnt_all->curobjs, cnt_all->curbytes,
+ cnt_all->accumobjs, cnt_all->accumbytes,
+ ((uint64_t)1U << opt_lg_prof_sample)))
+ return (true);
}
+
+ return (false);
}
static void
-prof_ctx_merge(prof_ctx_t *ctx, prof_thr_cnt_t *cnt)
+prof_dump_ctx_cleanup_locked(prof_ctx_t *ctx, prof_ctx_list_t *ctx_ql)
{
- bool destroy;
- cassert(config_prof);
+ ctx->nlimbo--;
+ ql_remove(ctx_ql, ctx, dump_link);
+}
+
+static void
+prof_dump_ctx_cleanup(prof_ctx_t *ctx, prof_ctx_list_t *ctx_ql)
+{
- /* Merge cnt stats and detach from ctx. */
malloc_mutex_lock(ctx->lock);
- ctx->cnt_merged.curobjs += cnt->cnts.curobjs;
- ctx->cnt_merged.curbytes += cnt->cnts.curbytes;
- ctx->cnt_merged.accumobjs += cnt->cnts.accumobjs;
- ctx->cnt_merged.accumbytes += cnt->cnts.accumbytes;
- ql_remove(&ctx->cnts_ql, cnt, cnts_link);
- if (opt_prof_accum == false && ql_first(&ctx->cnts_ql) == NULL &&
- ctx->cnt_merged.curobjs == 0 && ctx->nlimbo == 0) {
- /*
- * Increment ctx->nlimbo in order to keep another thread from
- * winning the race to destroy ctx while this one has ctx->lock
- * dropped. Without this, it would be possible for another
- * thread to:
- *
- * 1) Sample an allocation associated with ctx.
- * 2) Deallocate the sampled object.
- * 3) Successfully prof_ctx_destroy(ctx).
- *
- * The result would be that ctx no longer exists by the time
- * this thread accesses it in prof_ctx_destroy().
- */
- ctx->nlimbo++;
- destroy = true;
- } else
- destroy = false;
+ prof_dump_ctx_cleanup_locked(ctx, ctx_ql);
malloc_mutex_unlock(ctx->lock);
- if (destroy)
- prof_ctx_destroy(ctx);
}
static bool
-prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, prof_bt_t *bt)
+prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, const prof_bt_t *bt,
+ prof_ctx_list_t *ctx_ql)
{
+ bool ret;
unsigned i;
cassert(config_prof);
@@ -764,66 +888,109 @@ prof_dump_ctx(bool propagate_err, prof_ctx_t *ctx, prof_bt_t *bt)
* filled in. Avoid dumping any ctx that is an artifact of either
* implementation detail.
*/
+ malloc_mutex_lock(ctx->lock);
if ((opt_prof_accum == false && ctx->cnt_summed.curobjs == 0) ||
(opt_prof_accum && ctx->cnt_summed.accumobjs == 0)) {
assert(ctx->cnt_summed.curobjs == 0);
assert(ctx->cnt_summed.curbytes == 0);
assert(ctx->cnt_summed.accumobjs == 0);
assert(ctx->cnt_summed.accumbytes == 0);
- return (false);
+ ret = false;
+ goto label_return;
}
- if (prof_printf(propagate_err, "%"PRId64": %"PRId64
+ if (prof_dump_printf(propagate_err, "%"PRId64": %"PRId64
" [%"PRIu64": %"PRIu64"] @",
ctx->cnt_summed.curobjs, ctx->cnt_summed.curbytes,
- ctx->cnt_summed.accumobjs, ctx->cnt_summed.accumbytes))
- return (true);
+ ctx->cnt_summed.accumobjs, ctx->cnt_summed.accumbytes)) {
+ ret = true;
+ goto label_return;
+ }
for (i = 0; i < bt->len; i++) {
- if (prof_printf(propagate_err, " %#"PRIxPTR,
- (uintptr_t)bt->vec[i]))
- return (true);
+ if (prof_dump_printf(propagate_err, " %#"PRIxPTR,
+ (uintptr_t)bt->vec[i])) {
+ ret = true;
+ goto label_return;
+ }
}
- if (prof_write(propagate_err, "\n"))
- return (true);
+ if (prof_dump_write(propagate_err, "\n")) {
+ ret = true;
+ goto label_return;
+ }
- return (false);
+ ret = false;
+label_return:
+ prof_dump_ctx_cleanup_locked(ctx, ctx_ql);
+ malloc_mutex_unlock(ctx->lock);
+ return (ret);
}
static bool
prof_dump_maps(bool propagate_err)
{
+ bool ret;
int mfd;
char filename[PATH_MAX + 1];
cassert(config_prof);
-
+#ifdef __FreeBSD__
+ malloc_snprintf(filename, sizeof(filename), "/proc/curproc/map");
+#else
malloc_snprintf(filename, sizeof(filename), "/proc/%d/maps",
(int)getpid());
+#endif
mfd = open(filename, O_RDONLY);
if (mfd != -1) {
ssize_t nread;
- if (prof_write(propagate_err, "\nMAPPED_LIBRARIES:\n") &&
- propagate_err)
- return (true);
+ if (prof_dump_write(propagate_err, "\nMAPPED_LIBRARIES:\n") &&
+ propagate_err) {
+ ret = true;
+ goto label_return;
+ }
nread = 0;
do {
prof_dump_buf_end += nread;
if (prof_dump_buf_end == PROF_DUMP_BUFSIZE) {
/* Make space in prof_dump_buf before read(). */
- if (prof_flush(propagate_err) && propagate_err)
- return (true);
+ if (prof_dump_flush(propagate_err) &&
+ propagate_err) {
+ ret = true;
+ goto label_return;
+ }
}
nread = read(mfd, &prof_dump_buf[prof_dump_buf_end],
PROF_DUMP_BUFSIZE - prof_dump_buf_end);
} while (nread > 0);
+ } else {
+ ret = true;
+ goto label_return;
+ }
+
+ ret = false;
+label_return:
+ if (mfd != -1)
close(mfd);
- } else
- return (true);
+ return (ret);
+}
- return (false);
+static void
+prof_leakcheck(const prof_cnt_t *cnt_all, size_t leak_nctx,
+ const char *filename)
+{
+
+ if (cnt_all->curbytes != 0) {
+ malloc_printf("<jemalloc>: Leak summary: %"PRId64" byte%s, %"
+ PRId64" object%s, %zu context%s\n",
+ cnt_all->curbytes, (cnt_all->curbytes != 1) ? "s" : "",
+ cnt_all->curobjs, (cnt_all->curobjs != 1) ? "s" : "",
+ leak_nctx, (leak_nctx != 1) ? "s" : "");
+ malloc_printf(
+ "<jemalloc>: Run pprof on \"%s\" for leak detail\n",
+ filename);
+ }
}
static bool
@@ -833,98 +1000,74 @@ prof_dump(bool propagate_err, const char *filename, bool leakcheck)
prof_cnt_t cnt_all;
size_t tabind;
union {
- prof_bt_t *p;
- void *v;
- } bt;
- union {
prof_ctx_t *p;
void *v;
} ctx;
size_t leak_nctx;
+ prof_ctx_list_t ctx_ql;
cassert(config_prof);
prof_tdata = prof_tdata_get(false);
if ((uintptr_t)prof_tdata <= (uintptr_t)PROF_TDATA_STATE_MAX)
return (true);
- prof_enter(prof_tdata);
- prof_dump_fd = creat(filename, 0644);
- if (prof_dump_fd == -1) {
- if (propagate_err == false) {
- malloc_printf(
- "<jemalloc>: creat(\"%s\"), 0644) failed\n",
- filename);
- if (opt_abort)
- abort();
- }
- goto label_error;
- }
+
+ malloc_mutex_lock(&prof_dump_mtx);
/* Merge per thread profile stats, and sum them in cnt_all. */
memset(&cnt_all, 0, sizeof(prof_cnt_t));
leak_nctx = 0;
+ ql_new(&ctx_ql);
+ prof_enter(prof_tdata);
for (tabind = 0; ckh_iter(&bt2ctx, &tabind, NULL, &ctx.v) == false;)
- prof_ctx_sum(ctx.p, &cnt_all, &leak_nctx);
+ prof_dump_ctx_prep(ctx.p, &cnt_all, &leak_nctx, &ctx_ql);
+ prof_leave(prof_tdata);
+
+ /* Create dump file. */
+ if ((prof_dump_fd = prof_dump_open(propagate_err, filename)) == -1)
+ goto label_open_close_error;
/* Dump profile header. */
- if (opt_lg_prof_sample == 0) {
- if (prof_printf(propagate_err,
- "heap profile: %"PRId64": %"PRId64
- " [%"PRIu64": %"PRIu64"] @ heapprofile\n",
- cnt_all.curobjs, cnt_all.curbytes,
- cnt_all.accumobjs, cnt_all.accumbytes))
- goto label_error;
- } else {
- if (prof_printf(propagate_err,
- "heap profile: %"PRId64": %"PRId64
- " [%"PRIu64": %"PRIu64"] @ heap_v2/%"PRIu64"\n",
- cnt_all.curobjs, cnt_all.curbytes,
- cnt_all.accumobjs, cnt_all.accumbytes,
- ((uint64_t)1U << opt_lg_prof_sample)))
- goto label_error;
- }
+ if (prof_dump_header(propagate_err, &cnt_all))
+ goto label_write_error;
- /* Dump per ctx profile stats. */
- for (tabind = 0; ckh_iter(&bt2ctx, &tabind, &bt.v, &ctx.v)
- == false;) {
- if (prof_dump_ctx(propagate_err, ctx.p, bt.p))
- goto label_error;
+ /* Dump per ctx profile stats. */
+ while ((ctx.p = ql_first(&ctx_ql)) != NULL) {
+ if (prof_dump_ctx(propagate_err, ctx.p, ctx.p->bt, &ctx_ql))
+ goto label_write_error;
}
/* Dump /proc/<pid>/maps if possible. */
if (prof_dump_maps(propagate_err))
- goto label_error;
+ goto label_write_error;
- if (prof_flush(propagate_err))
- goto label_error;
- close(prof_dump_fd);
- prof_leave(prof_tdata);
+ if (prof_dump_close(propagate_err))
+ goto label_open_close_error;
- if (leakcheck && cnt_all.curbytes != 0) {
- malloc_printf("<jemalloc>: Leak summary: %"PRId64" byte%s, %"
- PRId64" object%s, %zu context%s\n",
- cnt_all.curbytes, (cnt_all.curbytes != 1) ? "s" : "",
- cnt_all.curobjs, (cnt_all.curobjs != 1) ? "s" : "",
- leak_nctx, (leak_nctx != 1) ? "s" : "");
- malloc_printf(
- "<jemalloc>: Run pprof on \"%s\" for leak detail\n",
- filename);
- }
+ malloc_mutex_unlock(&prof_dump_mtx);
+
+ if (leakcheck)
+ prof_leakcheck(&cnt_all, leak_nctx, filename);
return (false);
-label_error:
- prof_leave(prof_tdata);
+label_write_error:
+ prof_dump_close(propagate_err);
+label_open_close_error:
+ while ((ctx.p = ql_first(&ctx_ql)) != NULL)
+ prof_dump_ctx_cleanup(ctx.p, &ctx_ql);
+ malloc_mutex_unlock(&prof_dump_mtx);
return (true);
}
#define DUMP_FILENAME_BUFSIZE (PATH_MAX + 1)
+#define VSEQ_INVALID UINT64_C(0xffffffffffffffff)
static void
prof_dump_filename(char *filename, char v, int64_t vseq)
{
cassert(config_prof);
- if (vseq != UINT64_C(0xffffffffffffffff)) {
+ if (vseq != VSEQ_INVALID) {
/* "<prefix>.<pid>.<seq>.v<vseq>.heap" */
malloc_snprintf(filename, DUMP_FILENAME_BUFSIZE,
"%s.%d.%"PRIu64".%c%"PRId64".heap",
@@ -950,7 +1093,7 @@ prof_fdump(void)
if (opt_prof_final && opt_prof_prefix[0] != '\0') {
malloc_mutex_lock(&prof_dump_seq_mtx);
- prof_dump_filename(filename, 'f', UINT64_C(0xffffffffffffffff));
+ prof_dump_filename(filename, 'f', VSEQ_INVALID);
malloc_mutex_unlock(&prof_dump_seq_mtx);
prof_dump(false, filename, opt_prof_leak);
}
@@ -1056,14 +1199,6 @@ prof_bt_keycomp(const void *k1, const void *k2)
return (memcmp(bt1->vec, bt2->vec, bt1->len * sizeof(void *)) == 0);
}
-static malloc_mutex_t *
-prof_ctx_mutex_choose(void)
-{
- unsigned nctxs = atomic_add_u(&cum_ctxs, 1);
-
- return (&ctx_locks[(nctxs - 1) % PROF_NCTX_LOCKS]);
-}
-
prof_tdata_t *
prof_tdata_init(void)
{
@@ -1208,6 +1343,8 @@ prof_boot2(void)
if (malloc_mutex_init(&prof_dump_seq_mtx))
return (true);
+ if (malloc_mutex_init(&prof_dump_mtx))
+ return (true);
if (atexit(prof_fdump) != 0) {
malloc_write("<jemalloc>: Error in atexit()\n");
@@ -1245,10 +1382,10 @@ prof_prefork(void)
if (opt_prof) {
unsigned i;
- malloc_mutex_lock(&bt2ctx_mtx);
- malloc_mutex_lock(&prof_dump_seq_mtx);
+ malloc_mutex_prefork(&bt2ctx_mtx);
+ malloc_mutex_prefork(&prof_dump_seq_mtx);
for (i = 0; i < PROF_NCTX_LOCKS; i++)
- malloc_mutex_lock(&ctx_locks[i]);
+ malloc_mutex_prefork(&ctx_locks[i]);
}
}
diff --git a/dep/jemalloc/src/quarantine.c b/dep/jemalloc/src/quarantine.c
index f96a948d5c7..5431511640a 100644
--- a/dep/jemalloc/src/quarantine.c
+++ b/dep/jemalloc/src/quarantine.c
@@ -141,8 +141,17 @@ quarantine(void *ptr)
obj->usize = usize;
quarantine->curbytes += usize;
quarantine->curobjs++;
- if (opt_junk)
- memset(ptr, 0x5a, usize);
+ if (config_fill && opt_junk) {
+ /*
+ * Only do redzone validation if Valgrind isn't in
+ * operation.
+ */
+ if ((config_valgrind == false || opt_valgrind == false)
+ && usize <= SMALL_MAXCLASS)
+ arena_quarantine_junk_small(ptr, usize);
+ else
+ memset(ptr, 0x5a, usize);
+ }
} else {
assert(quarantine->curbytes == 0);
idalloc(ptr);
diff --git a/dep/jemalloc/src/rtree.c b/dep/jemalloc/src/rtree.c
index 90c6935a0ed..205957ac4e1 100644
--- a/dep/jemalloc/src/rtree.c
+++ b/dep/jemalloc/src/rtree.c
@@ -2,42 +2,55 @@
#include "jemalloc/internal/jemalloc_internal.h"
rtree_t *
-rtree_new(unsigned bits)
+rtree_new(unsigned bits, rtree_alloc_t *alloc, rtree_dalloc_t *dalloc)
{
rtree_t *ret;
- unsigned bits_per_level, height, i;
+ unsigned bits_per_level, bits_in_leaf, height, i;
+
+ assert(bits > 0 && bits <= (sizeof(uintptr_t) << 3));
bits_per_level = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(void *)))) - 1;
- height = bits / bits_per_level;
- if (height * bits_per_level != bits)
- height++;
- assert(height * bits_per_level >= bits);
+ bits_in_leaf = ffs(pow2_ceil((RTREE_NODESIZE / sizeof(uint8_t)))) - 1;
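+	/*
+	 * Leaf nodes hold uint8_t values rather than pointers, so the leaf
+	 * level can cover more bits than interior levels.
+	 */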
+ if (bits > bits_in_leaf) {
+ height = 1 + (bits - bits_in_leaf) / bits_per_level;
+ if ((height-1) * bits_per_level + bits_in_leaf != bits)
+ height++;
+ } else {
+ height = 1;
+ }
+ assert((height-1) * bits_per_level + bits_in_leaf >= bits);
- ret = (rtree_t*)base_alloc(offsetof(rtree_t, level2bits) +
+ ret = (rtree_t*)alloc(offsetof(rtree_t, level2bits) +
(sizeof(unsigned) * height));
if (ret == NULL)
return (NULL);
memset(ret, 0, offsetof(rtree_t, level2bits) + (sizeof(unsigned) *
height));
+ ret->alloc = alloc;
+ ret->dalloc = dalloc;
if (malloc_mutex_init(&ret->mutex)) {
- /* Leak the rtree. */
+ if (dalloc != NULL)
+ dalloc(ret);
return (NULL);
}
ret->height = height;
- if (bits_per_level * height > bits)
- ret->level2bits[0] = bits % bits_per_level;
- else
- ret->level2bits[0] = bits_per_level;
- for (i = 1; i < height; i++)
- ret->level2bits[i] = bits_per_level;
-
- ret->root = (void**)base_alloc(sizeof(void *) << ret->level2bits[0]);
+ if (height > 1) {
+ if ((height-1) * bits_per_level + bits_in_leaf > bits) {
+ ret->level2bits[0] = (bits - bits_in_leaf) %
+ bits_per_level;
+ } else
+ ret->level2bits[0] = bits_per_level;
+ for (i = 1; i < height-1; i++)
+ ret->level2bits[i] = bits_per_level;
+ ret->level2bits[height-1] = bits_in_leaf;
+ } else
+ ret->level2bits[0] = bits;
+
+ ret->root = (void**)alloc(sizeof(void *) << ret->level2bits[0]);
if (ret->root == NULL) {
- /*
- * We leak the rtree here, since there's no generic base
- * deallocation.
- */
+ if (dalloc != NULL)
+ dalloc(ret);
return (NULL);
}
memset(ret->root, 0, sizeof(void *) << ret->level2bits[0]);
@@ -45,6 +58,31 @@ rtree_new(unsigned bits)
return (ret);
}
+static void
+rtree_delete_subtree(rtree_t *rtree, void **node, unsigned level)
+{
+
+ if (level < rtree->height - 1) {
+ size_t nchildren, i;
+
+ nchildren = ZU(1) << rtree->level2bits[level];
+ for (i = 0; i < nchildren; i++) {
+ void **child = (void **)node[i];
+ if (child != NULL)
+ rtree_delete_subtree(rtree, child, level + 1);
+ }
+ }
+ rtree->dalloc(node);
+}
+
+void
+rtree_delete(rtree_t *rtree)
+{
+
+ rtree_delete_subtree(rtree, rtree->root, 0);
+ rtree->dalloc(rtree);
+}
+
void
rtree_prefork(rtree_t *rtree)
{
diff --git a/dep/jemalloc/src/stats.c b/dep/jemalloc/src/stats.c
index 43f87af6700..bef2ab33cd4 100644
--- a/dep/jemalloc/src/stats.c
+++ b/dep/jemalloc/src/stats.c
@@ -345,25 +345,25 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque,
malloc_cprintf(write_cb, cbopaque, "Assertions %s\n",
bv ? "enabled" : "disabled");
-#define OPT_WRITE_BOOL(n) \
+#define OPT_WRITE_BOOL(n) \
if ((err = je_mallctl("opt."#n, &bv, &bsz, NULL, 0)) \
== 0) { \
malloc_cprintf(write_cb, cbopaque, \
" opt."#n": %s\n", bv ? "true" : "false"); \
}
-#define OPT_WRITE_SIZE_T(n) \
+#define OPT_WRITE_SIZE_T(n) \
if ((err = je_mallctl("opt."#n, &sv, &ssz, NULL, 0)) \
== 0) { \
malloc_cprintf(write_cb, cbopaque, \
" opt."#n": %zu\n", sv); \
}
-#define OPT_WRITE_SSIZE_T(n) \
+#define OPT_WRITE_SSIZE_T(n) \
if ((err = je_mallctl("opt."#n, &ssv, &sssz, NULL, 0)) \
== 0) { \
malloc_cprintf(write_cb, cbopaque, \
" opt."#n": %zd\n", ssv); \
}
-#define OPT_WRITE_CHAR_P(n) \
+#define OPT_WRITE_CHAR_P(n) \
if ((err = je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0)) \
== 0) { \
malloc_cprintf(write_cb, cbopaque, \
diff --git a/dep/jemalloc/src/tcache.c b/dep/jemalloc/src/tcache.c
index 98ed19edd52..6de92960b2d 100644
--- a/dep/jemalloc/src/tcache.c
+++ b/dep/jemalloc/src/tcache.c
@@ -260,8 +260,8 @@ tcache_arena_dissociate(tcache_t *tcache)
/* Unlink from list of extant tcaches. */
malloc_mutex_lock(&tcache->arena->lock);
ql_remove(&tcache->arena->tcache_ql, tcache, link);
- malloc_mutex_unlock(&tcache->arena->lock);
tcache_stats_merge(tcache, tcache->arena);
+ malloc_mutex_unlock(&tcache->arena->lock);
}
}
@@ -292,7 +292,7 @@ tcache_create(arena_t *arena)
else if (size <= tcache_maxclass)
tcache = (tcache_t *)arena_malloc_large(arena, size, true);
else
- tcache = (tcache_t *)icallocx(size, false, arena);
+ tcache = (tcache_t *)icalloct(size, false, arena);
if (tcache == NULL)
return (NULL);
@@ -366,7 +366,7 @@ tcache_destroy(tcache_t *tcache)
arena_dalloc_large(arena, chunk, tcache);
} else
- idallocx(tcache, false);
+ idalloct(tcache, false);
}
void
@@ -399,11 +399,14 @@ tcache_thread_cleanup(void *arg)
}
}
+/* Caller must own arena->lock. */
void
tcache_stats_merge(tcache_t *tcache, arena_t *arena)
{
unsigned i;
+ cassert(config_stats);
+
/* Merge and reset tcache stats. */
for (i = 0; i < NBINS; i++) {
arena_bin_t *bin = &arena->bins[i];
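
The tcache_arena_dissociate() change is a lock-scope fix: stats were previously merged after arena->lock was dropped, racing with other threads touching the same arena counters. A hedged sketch of the corrected pattern, using std::mutex and hypothetical stats types:

#include <mutex>

struct Stats { long nrequests = 0; };

struct Arena {
    std::mutex lock;
    Stats stats;
};

// Caller-owns-lock convention, as the new comment on tcache_stats_merge()
// documents: merge while still inside the critical section so no other
// thread can race with the partially merged counters.
void dissociate(Arena &arena, Stats &tcacheStats) {
    std::lock_guard<std::mutex> g(arena.lock);      // take arena lock once
    // ... unlink tcache from the arena's list ...
    arena.stats.nrequests += tcacheStats.nrequests; // merge under the lock
    tcacheStats.nrequests = 0;                      // reset tcache counters
}
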
diff --git a/dep/jemalloc/src/tsd.c b/dep/jemalloc/src/tsd.c
index 961a546329c..700caabfe47 100644
--- a/dep/jemalloc/src/tsd.c
+++ b/dep/jemalloc/src/tsd.c
@@ -21,7 +21,7 @@ void
malloc_tsd_dalloc(void *wrapper)
{
- idalloc(wrapper);
+ idalloct(wrapper, false);
}
void
@@ -105,3 +105,37 @@ JEMALLOC_SECTION(".CRT$XLY") JEMALLOC_ATTR(used)
static const BOOL (WINAPI *tls_callback)(HINSTANCE hinstDLL,
DWORD fdwReason, LPVOID lpvReserved) = _tls_callback;
#endif
+
+#if (!defined(JEMALLOC_MALLOC_THREAD_CLEANUP) && !defined(JEMALLOC_TLS) && \
+ !defined(_WIN32))
+void *
+tsd_init_check_recursion(tsd_init_head_t *head, tsd_init_block_t *block)
+{
+ pthread_t self = pthread_self();
+ tsd_init_block_t *iter;
+
+ /* Check whether this thread has already inserted into the list. */
+ malloc_mutex_lock(&head->lock);
+ ql_foreach(iter, &head->blocks, link) {
+ if (iter->thread == self) {
+ malloc_mutex_unlock(&head->lock);
+ return (iter->data);
+ }
+ }
+ /* Insert block into list. */
+ ql_elm_new(block, link);
+ block->thread = self;
+ ql_tail_insert(&head->blocks, block, link);
+ malloc_mutex_unlock(&head->lock);
+ return (NULL);
+}
+
+void
+tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block)
+{
+
+ malloc_mutex_lock(&head->lock);
+ ql_remove(&head->blocks, block, link);
+ malloc_mutex_unlock(&head->lock);
+}
+#endif
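
tsd_init_check_recursion()/tsd_init_finish() guard against a thread re-entering TSD initialization (for example via an allocating pthread_setspecific implementation). A simplified C++ sketch of the same guard, with hypothetical names:

#include <list>
#include <mutex>
#include <thread>

struct InitBlock {
    std::thread::id owner;
    void *data;
};

static std::mutex g_lock;
static std::list<InitBlock *> g_blocks;

void *init_check(InitBlock *block) {
    std::lock_guard<std::mutex> g(g_lock);
    for (InitBlock *b : g_blocks)
        if (b->owner == std::this_thread::get_id())
            return b->data;          // re-entry: hand back in-progress data
    block->owner = std::this_thread::get_id();
    g_blocks.push_back(block);       // first entry for this thread
    return nullptr;                  // caller proceeds with initialization
}

void init_finish(InitBlock *block) {
    std::lock_guard<std::mutex> g(g_lock);
    g_blocks.remove(block);          // initialization done; drop the marker
}
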
diff --git a/dep/jemalloc/src/util.c b/dep/jemalloc/src/util.c
index b3a01143698..93a19fd16f7 100644
--- a/dep/jemalloc/src/util.c
+++ b/dep/jemalloc/src/util.c
@@ -77,7 +77,7 @@ malloc_write(const char *s)
* provide a wrapper.
*/
int
-buferror(char *buf, size_t buflen)
+buferror(int err, char *buf, size_t buflen)
{
#ifdef _WIN32
@@ -85,34 +85,36 @@ buferror(char *buf, size_t buflen)
(LPSTR)buf, buflen, NULL);
return (0);
#elif defined(_GNU_SOURCE)
- char *b = strerror_r(errno, buf, buflen);
+ char *b = strerror_r(err, buf, buflen);
if (b != buf) {
strncpy(buf, b, buflen);
buf[buflen-1] = '\0';
}
return (0);
#else
- return (strerror_r(errno, buf, buflen));
+ return (strerror_r(err, buf, buflen));
#endif
}
uintmax_t
-malloc_strtoumax(const char *nptr, char **endptr, int base)
+malloc_strtoumax(const char *restrict nptr, char **restrict endptr, int base)
{
uintmax_t ret, digit;
int b;
bool neg;
const char *p, *ns;
+ p = nptr;
if (base < 0 || base == 1 || base > 36) {
+ ns = p;
set_errno(EINVAL);
- return (UINTMAX_MAX);
+ ret = UINTMAX_MAX;
+ goto label_return;
}
b = base;
/* Swallow leading whitespace and get sign, if any. */
neg = false;
- p = nptr;
while (true) {
switch (*p) {
case '\t': case '\n': case '\v': case '\f': case '\r': case ' ':
@@ -146,7 +148,7 @@ malloc_strtoumax(const char *nptr, char **endptr, int base)
if (b == 8)
p++;
break;
- case 'x':
+ case 'X': case 'x':
switch (p[2]) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
@@ -164,7 +166,9 @@ malloc_strtoumax(const char *nptr, char **endptr, int base)
}
break;
default:
- break;
+ p++;
+ ret = 0;
+ goto label_return;
}
}
if (b == 0)
@@ -181,13 +185,22 @@ malloc_strtoumax(const char *nptr, char **endptr, int base)
if (ret < pret) {
/* Overflow. */
set_errno(ERANGE);
- return (UINTMAX_MAX);
+ ret = UINTMAX_MAX;
+ goto label_return;
}
p++;
}
if (neg)
ret = -ret;
+ if (p == ns) {
+ /* No conversion performed. */
+ set_errno(EINVAL);
+ ret = UINTMAX_MAX;
+ goto label_return;
+ }
+
+label_return:
if (endptr != NULL) {
if (p == ns) {
/* No characters were converted. */
@@ -195,7 +208,6 @@ malloc_strtoumax(const char *nptr, char **endptr, int base)
} else
*endptr = (char *)p;
}
-
return (ret);
}
@@ -331,7 +343,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
APPEND_C(' '); \
} \
} while (0)
-#define GET_ARG_NUMERIC(val, len) do { \
+#define GET_ARG_NUMERIC(val, len) do { \
switch (len) { \
case '?': \
val = va_arg(ap, int); \
@@ -354,6 +366,9 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
case 'j': \
val = va_arg(ap, intmax_t); \
break; \
+ case 'j' | 0x80: \
+ val = va_arg(ap, uintmax_t); \
+ break; \
case 't': \
val = va_arg(ap, ptrdiff_t); \
break; \
@@ -385,11 +400,6 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
unsigned char len = '?';
f++;
- if (*f == '%') {
- /* %% */
- APPEND_C(*f);
- break;
- }
/* Flags. */
while (true) {
switch (*f) {
@@ -419,6 +429,10 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
case '*':
width = va_arg(ap, int);
f++;
+ if (width < 0) {
+ left_justify = true;
+ width = -width;
+ }
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9': {
@@ -428,19 +442,16 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
assert(uwidth != UINTMAX_MAX || get_errno() !=
ERANGE);
width = (int)uwidth;
- if (*f == '.') {
- f++;
- goto label_precision;
- } else
- goto label_length;
break;
- } case '.':
- f++;
- goto label_precision;
- default: goto label_length;
+ } default:
+ break;
}
+ /* Width/precision separator. */
+ if (*f == '.')
+ f++;
+ else
+ goto label_length;
/* Precision. */
- label_precision:
switch (*f) {
case '*':
prec = va_arg(ap, int);
@@ -469,16 +480,8 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
} else
len = 'l';
break;
- case 'j':
- len = 'j';
- f++;
- break;
- case 't':
- len = 't';
- f++;
- break;
- case 'z':
- len = 'z';
+ case 'q': case 'j': case 't': case 'z':
+ len = *f;
f++;
break;
default: break;
@@ -487,6 +490,11 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
switch (*f) {
char *s;
size_t slen;
+ case '%':
+ /* %% */
+ APPEND_C(*f);
+ f++;
+ break;
case 'd': case 'i': {
intmax_t val JEMALLOC_CC_SILENCE_INIT(0);
char buf[D2S_BUFSIZE];
@@ -540,7 +548,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
assert(len == '?' || len == 'l');
assert_not_implemented(len != 'l');
s = va_arg(ap, char *);
- slen = (prec == -1) ? strlen(s) : prec;
+ slen = (prec < 0) ? strlen(s) : prec;
APPEND_PADDED_S(s, slen, width, left_justify);
f++;
break;
@@ -553,8 +561,7 @@ malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap)
APPEND_PADDED_S(s, slen, width, left_justify);
f++;
break;
- }
- default: not_implemented();
+ } default: not_reached();
}
break;
} default: {
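
Net effect of the malloc_strtoumax() changes above: the 'X' hex prefix is accepted alongside 'x', and the no-conversion case funnels through label_return so endptr is still set. A usage sketch against the libc strtoumax(3) contract these changes track:

#include <cinttypes>
#include <cstdio>

int main() {
    char *end;
    // 'X' prefix now accepted, matching strtoumax(3).
    uintmax_t v = std::strtoumax("0Xff tail", &end, 0);
    std::printf("v=%ju rest=\"%s\"\n", v, end); // v=255 rest=" tail"

    const char *s = "zebra";
    v = std::strtoumax(s, &end, 0); // no digits convert
    // ISO C leaves v == 0 with end == s here; the patched malloc_strtoumax
    // additionally returns UINTMAX_MAX and sets EINVAL for this case.
    std::printf("v=%ju no-conversion=%d\n", v, (int)(end == s));
    return 0;
}
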
diff --git a/dep/jemalloc/src/zone.c b/dep/jemalloc/src/zone.c
index c62c183f65e..e0302ef4edc 100644
--- a/dep/jemalloc/src/zone.c
+++ b/dep/jemalloc/src/zone.c
@@ -137,7 +137,7 @@ zone_destroy(malloc_zone_t *zone)
{
/* This function should never be called. */
- assert(false);
+ not_reached();
return (NULL);
}
diff --git a/dep/recastnavigation/Detour/CMakeLists.txt b/dep/recastnavigation/Detour/CMakeLists.txt
index b7c0853efc4..5f3542e96b9 100644
--- a/dep/recastnavigation/Detour/CMakeLists.txt
+++ b/dep/recastnavigation/Detour/CMakeLists.txt
@@ -9,13 +9,14 @@
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
set(Detour_STAT_SRCS
- DetourAlloc.cpp
- DetourCommon.cpp
- DetourNavMesh.cpp
- DetourNavMeshBuilder.cpp
- DetourNavMeshQuery.cpp
- DetourNode.cpp
+ Source/DetourAlloc.cpp
+ Source/DetourCommon.cpp
+ Source/DetourNavMesh.cpp
+ Source/DetourNavMeshBuilder.cpp
+ Source/DetourNavMeshQuery.cpp
+ Source/DetourNode.cpp
)
+include_directories(Include)
if(WIN32)
include_directories(
diff --git a/dep/recastnavigation/Detour/DetourObstacleAvoidance.cpp b/dep/recastnavigation/Detour/DetourObstacleAvoidance.cpp
index a255c9b3fd1..d3f90b7ab17 100644
--- a/dep/recastnavigation/Detour/DetourObstacleAvoidance.cpp
+++ b/dep/recastnavigation/Detour/DetourObstacleAvoidance.cpp
@@ -25,6 +25,7 @@
#include <float.h>
#include <new>
+static const float DT_PI = 3.14159265f;
static int sweepCircleCircle(const float* c0, const float r0, const float* v,
const float* c1, const float r1,
@@ -206,12 +207,6 @@ void dtFreeObstacleAvoidanceQuery(dtObstacleAvoidanceQuery* ptr)
dtObstacleAvoidanceQuery::dtObstacleAvoidanceQuery() :
- m_velBias(0.0f),
- m_weightDesVel(0.0f),
- m_weightCurVel(0.0f),
- m_weightSide(0.0f),
- m_weightToi(0.0f),
- m_horizTime(0.0f),
m_maxCircles(0),
m_circles(0),
m_ncircles(0),
@@ -318,11 +313,11 @@ void dtObstacleAvoidanceQuery::prepare(const float* pos, const float* dvel)
float dtObstacleAvoidanceQuery::processSample(const float* vcand, const float cs,
const float* pos, const float rad,
- const float vmax, const float* vel, const float* dvel,
+ const float* vel, const float* dvel,
dtObstacleAvoidanceDebugData* debug)
{
// Find min time of impact and exit amongst all obstacles.
- float tmin = m_horizTime;
+ float tmin = m_params.horizTime;
float side = 0;
int nside = 0;
@@ -395,11 +390,10 @@ float dtObstacleAvoidanceQuery::processSample(const float* vcand, const float cs
if (nside)
side /= nside;
- const float ivmax = 1.0f / vmax;
- const float vpen = m_weightDesVel * (dtVdist2D(vcand, dvel) * ivmax);
- const float vcpen = m_weightCurVel * (dtVdist2D(vcand, vel) * ivmax);
- const float spen = m_weightSide * side;
- const float tpen = m_weightToi * (1.0f/(0.1f+tmin / m_horizTime));
+ const float vpen = m_params.weightDesVel * (dtVdist2D(vcand, dvel) * m_invVmax);
+ const float vcpen = m_params.weightCurVel * (dtVdist2D(vcand, vel) * m_invVmax);
+ const float spen = m_params.weightSide * side;
+ const float tpen = m_params.weightToi * (1.0f/(0.1f+tmin*m_invHorizTime));
const float penalty = vpen + vcpen + spen + tpen;
@@ -410,28 +404,34 @@ float dtObstacleAvoidanceQuery::processSample(const float* vcand, const float cs
return penalty;
}
-void dtObstacleAvoidanceQuery::sampleVelocityGrid(const float* pos, const float rad, const float vmax,
- const float* vel, const float* dvel,
- float* nvel, const int gsize,
- dtObstacleAvoidanceDebugData* debug)
+int dtObstacleAvoidanceQuery::sampleVelocityGrid(const float* pos, const float rad, const float vmax,
+ const float* vel, const float* dvel, float* nvel,
+ const dtObstacleAvoidanceParams* params,
+ dtObstacleAvoidanceDebugData* debug)
{
prepare(pos, dvel);
+ memcpy(&m_params, params, sizeof(dtObstacleAvoidanceParams));
+ m_invHorizTime = 1.0f / m_params.horizTime;
+ m_vmax = vmax;
+ m_invVmax = 1.0f / vmax;
+
dtVset(nvel, 0,0,0);
if (debug)
debug->reset();
- const float cvx = dvel[0] * m_velBias;
- const float cvz = dvel[2] * m_velBias;
- const float cs = vmax * 2 * (1 - m_velBias) / (float)(gsize-1);
- const float half = (gsize-1)*cs*0.5f;
+ const float cvx = dvel[0] * m_params.velBias;
+ const float cvz = dvel[2] * m_params.velBias;
+ const float cs = vmax * 2 * (1 - m_params.velBias) / (float)(m_params.gridSize-1);
+ const float half = (m_params.gridSize-1)*cs*0.5f;
float minPenalty = FLT_MAX;
+ int ns = 0;
- for (int y = 0; y < gsize; ++y)
+ for (int y = 0; y < m_params.gridSize; ++y)
{
- for (int x = 0; x < gsize; ++x)
+ for (int x = 0; x < m_params.gridSize; ++x)
{
float vcand[3];
vcand[0] = cvx + x*cs - half;
@@ -440,7 +440,8 @@ void dtObstacleAvoidanceQuery::sampleVelocityGrid(const float* pos, const float
if (dtSqr(vcand[0])+dtSqr(vcand[2]) > dtSqr(vmax+cs/2)) continue;
- const float penalty = processSample(vcand, cs, pos,rad,vmax,vel,dvel, debug);
+ const float penalty = processSample(vcand, cs, pos,rad,vel,dvel, debug);
+ ns++;
if (penalty < minPenalty)
{
minPenalty = penalty;
@@ -448,31 +449,38 @@ void dtObstacleAvoidanceQuery::sampleVelocityGrid(const float* pos, const float
}
}
}
+
+ return ns;
}
-static const float DT_PI = 3.14159265f;
-
-void dtObstacleAvoidanceQuery::sampleVelocityAdaptive(const float* pos, const float rad, const float vmax,
- const float* vel, const float* dvel, float* nvel,
- const int ndivs, const int nrings, const int depth,
- dtObstacleAvoidanceDebugData* debug)
+int dtObstacleAvoidanceQuery::sampleVelocityAdaptive(const float* pos, const float rad, const float vmax,
+ const float* vel, const float* dvel, float* nvel,
+ const dtObstacleAvoidanceParams* params,
+ dtObstacleAvoidanceDebugData* debug)
{
prepare(pos, dvel);
+ memcpy(&m_params, params, sizeof(dtObstacleAvoidanceParams));
+ m_invHorizTime = 1.0f / m_params.horizTime;
+ m_vmax = vmax;
+ m_invVmax = 1.0f / vmax;
+
dtVset(nvel, 0,0,0);
if (debug)
debug->reset();
-
+
// Build sampling pattern aligned to desired velocity.
- static const int MAX_PATTERN_DIVS = 32;
- static const int MAX_PATTERN_RINGS = 4;
- float pat[(MAX_PATTERN_DIVS*MAX_PATTERN_RINGS+1)*2];
+ float pat[(DT_MAX_PATTERN_DIVS*DT_MAX_PATTERN_RINGS+1)*2];
int npat = 0;
- const int nd = dtClamp(ndivs, 1, MAX_PATTERN_DIVS);
- const int nr = dtClamp(nrings, 1, MAX_PATTERN_RINGS);
+ const int ndivs = (int)m_params.adaptiveDivs;
+ const int nrings= (int)m_params.adaptiveRings;
+ const int depth = (int)m_params.adaptiveDepth;
+
+ const int nd = dtClamp(ndivs, 1, DT_MAX_PATTERN_DIVS);
+ const int nr = dtClamp(nrings, 1, DT_MAX_PATTERN_RINGS);
const float da = (1.0f/nd) * DT_PI*2;
const float dang = atan2f(dvel[2], dvel[0]);
@@ -483,21 +491,22 @@ void dtObstacleAvoidanceQuery::sampleVelocityAdaptive(const float* pos, const fl
for (int j = 0; j < nr; ++j)
{
- const float rad = (float)(nr-j)/(float)nr;
+ const float r = (float)(nr-j)/(float)nr;
float a = dang + (j&1)*0.5f*da;
for (int i = 0; i < nd; ++i)
{
- pat[npat*2+0] = cosf(a)*rad;
- pat[npat*2+1] = sinf(a)*rad;
+ pat[npat*2+0] = cosf(a)*r;
+ pat[npat*2+1] = sinf(a)*r;
npat++;
a += da;
}
}
// Start sampling.
- float cr = vmax * (1.0f-m_velBias);
+ float cr = vmax * (1.0f - m_params.velBias);
float res[3];
- dtVset(res, dvel[0] * m_velBias, 0, dvel[2] * m_velBias);
+ dtVset(res, dvel[0] * m_params.velBias, 0, dvel[2] * m_params.velBias);
+ int ns = 0;
for (int k = 0; k < depth; ++k)
{
@@ -514,7 +523,8 @@ void dtObstacleAvoidanceQuery::sampleVelocityAdaptive(const float* pos, const fl
if (dtSqr(vcand[0])+dtSqr(vcand[2]) > dtSqr(vmax+0.001f)) continue;
- const float penalty = processSample(vcand,cr/10, pos,rad,vmax,vel,dvel, debug);
+ const float penalty = processSample(vcand,cr/10, pos,rad,vel,dvel, debug);
+ ns++;
if (penalty < minPenalty)
{
minPenalty = penalty;
@@ -528,5 +538,7 @@ void dtObstacleAvoidanceQuery::sampleVelocityAdaptive(const float* pos, const fl
}
dtVcopy(nvel, res);
+
+ return ns;
}
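
processSample() now reads its weights from m_params and multiplies by cached inverses (m_invVmax, m_invHorizTime) rather than dividing per sample. A standalone sketch of the penalty term; the weight values are illustrative, not Detour defaults:

float samplePenalty(float distToDesired, float distToCurrent, float side,
                    float tmin, float invVmax, float invHorizTime) {
    const float wDes = 2.0f, wCur = 0.75f, wSide = 0.75f, wToi = 2.5f;
    float vpen  = wDes  * (distToDesired * invVmax);   // stray from desired vel
    float vcpen = wCur  * (distToCurrent * invVmax);   // stray from current vel
    float spen  = wSide * side;                        // preferred-side bias
    float tpen  = wToi  * (1.0f / (0.1f + tmin * invHorizTime)); // time to impact
    return vpen + vcpen + spen + tpen;                 // lower is better
}
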
diff --git a/dep/recastnavigation/Detour/DetourObstacleAvoidance.h b/dep/recastnavigation/Detour/DetourObstacleAvoidance.h
index 4a7187a7998..8ff6211e867 100644
--- a/dep/recastnavigation/Detour/DetourObstacleAvoidance.h
+++ b/dep/recastnavigation/Detour/DetourObstacleAvoidance.h
@@ -21,21 +21,19 @@
struct dtObstacleCircle
{
- float p[3]; // Position of the obstacle
- float vel[3]; // Velocity of the obstacle
- float dvel[3]; // Velocity of the obstacle
- float rad; // Radius of the obstacle
- float dp[3], np[3]; // Use for side selection during sampling.
+ float p[3]; ///< Position of the obstacle
+ float vel[3]; ///< Velocity of the obstacle
+	float dvel[3];		///< Desired velocity of the obstacle
+ float rad; ///< Radius of the obstacle
+	float dp[3], np[3];	///< Used for side selection during sampling.
};
struct dtObstacleSegment
{
- float p[3], q[3]; // End points of the obstacle segment
+ float p[3], q[3]; ///< End points of the obstacle segment
bool touch;
};
-static const int RVO_SAMPLE_RAD = 15;
-static const int MAX_RVO_SAMPLES = (RVO_SAMPLE_RAD*2+1)*(RVO_SAMPLE_RAD*2+1) + 100;
class dtObstacleAvoidanceDebugData
{
@@ -75,6 +73,23 @@ dtObstacleAvoidanceDebugData* dtAllocObstacleAvoidanceDebugData();
void dtFreeObstacleAvoidanceDebugData(dtObstacleAvoidanceDebugData* ptr);
+static const int DT_MAX_PATTERN_DIVS = 32;	///< Max number of adaptive divs.
+static const int DT_MAX_PATTERN_RINGS = 4; ///< Max number of adaptive rings.
+
+struct dtObstacleAvoidanceParams
+{
+ float velBias;
+ float weightDesVel;
+ float weightCurVel;
+ float weightSide;
+ float weightToi;
+ float horizTime;
+	unsigned char gridSize;	///< grid sampling size (used by sampleVelocityGrid)
+	unsigned char adaptiveDivs;	///< adaptive sampling divisions (used by sampleVelocityAdaptive)
+	unsigned char adaptiveRings;	///< adaptive sampling rings (used by sampleVelocityAdaptive)
+	unsigned char adaptiveDepth;	///< adaptive sampling depth (used by sampleVelocityAdaptive)
+};
+
class dtObstacleAvoidanceQuery
{
public:
@@ -90,22 +105,15 @@ public:
void addSegment(const float* p, const float* q);
- inline void setVelocitySelectionBias(float v) { m_velBias = v; }
- inline void setDesiredVelocityWeight(float w) { m_weightDesVel = w; }
- inline void setCurrentVelocityWeight(float w) { m_weightCurVel = w; }
- inline void setPreferredSideWeight(float w) { m_weightSide = w; }
- inline void setCollisionTimeWeight(float w) { m_weightToi = w; }
- inline void setTimeHorizon(float t) { m_horizTime = t; }
-
- void sampleVelocityGrid(const float* pos, const float rad, const float vmax,
- const float* vel, const float* dvel, float* nvel,
- const int gsize,
- dtObstacleAvoidanceDebugData* debug = 0);
-
- void sampleVelocityAdaptive(const float* pos, const float rad, const float vmax,
- const float* vel, const float* dvel, float* nvel,
- const int ndivs, const int nrings, const int depth,
- dtObstacleAvoidanceDebugData* debug = 0);
+ int sampleVelocityGrid(const float* pos, const float rad, const float vmax,
+ const float* vel, const float* dvel, float* nvel,
+ const dtObstacleAvoidanceParams* params,
+ dtObstacleAvoidanceDebugData* debug = 0);
+
+ int sampleVelocityAdaptive(const float* pos, const float rad, const float vmax,
+ const float* vel, const float* dvel, float* nvel,
+ const dtObstacleAvoidanceParams* params,
+ dtObstacleAvoidanceDebugData* debug = 0);
inline int getObstacleCircleCount() const { return m_ncircles; }
const dtObstacleCircle* getObstacleCircle(const int i) { return &m_circles[i]; }
@@ -119,19 +127,17 @@ private:
float processSample(const float* vcand, const float cs,
const float* pos, const float rad,
- const float vmax, const float* vel, const float* dvel,
+ const float* vel, const float* dvel,
dtObstacleAvoidanceDebugData* debug);
dtObstacleCircle* insertCircle(const float dist);
dtObstacleSegment* insertSegment(const float dist);
- float m_velBias;
- float m_weightDesVel;
- float m_weightCurVel;
- float m_weightSide;
- float m_weightToi;
- float m_horizTime;
-
+ dtObstacleAvoidanceParams m_params;
+ float m_invHorizTime;
+ float m_vmax;
+ float m_invVmax;
+
int m_maxCircles;
dtObstacleCircle* m_circles;
int m_ncircles;
@@ -145,4 +151,4 @@ dtObstacleAvoidanceQuery* dtAllocObstacleAvoidanceQuery();
void dtFreeObstacleAvoidanceQuery(dtObstacleAvoidanceQuery* ptr);
-#endif // DETOUROBSTACLEAVOIDANCE_H \ No newline at end of file
+#endif // DETOUROBSTACLEAVOIDANCE_H
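
Callers migrate from the removed per-weight setters to a dtObstacleAvoidanceParams filled per query, and both samplers now return the number of samples evaluated. A hedged usage sketch of the new entry points (the numeric values are illustrative only):

#include "DetourObstacleAvoidance.h"

void pickVelocity(dtObstacleAvoidanceQuery *q,
                  const float *pos, float rad, float vmax,
                  const float *vel, const float *dvel, float *nvel) {
    dtObstacleAvoidanceParams params;
    params.velBias       = 0.4f;
    params.weightDesVel  = 2.0f;
    params.weightCurVel  = 0.75f;
    params.weightSide    = 0.75f;
    params.weightToi     = 2.5f;
    params.horizTime     = 2.5f;
    params.gridSize      = 33;  // used by sampleVelocityGrid
    params.adaptiveDivs  = 7;   // clamped to DT_MAX_PATTERN_DIVS
    params.adaptiveRings = 2;   // clamped to DT_MAX_PATTERN_RINGS
    params.adaptiveDepth = 5;

    int ns = q->sampleVelocityAdaptive(pos, rad, vmax, vel, dvel, nvel,
                                       &params, /*debug=*/0);
    (void)ns; // number of candidate velocities evaluated
}
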
diff --git a/dep/recastnavigation/Detour/DetourAlloc.h b/dep/recastnavigation/Detour/Include/DetourAlloc.h
index e814b62a716..e814b62a716 100644
--- a/dep/recastnavigation/Detour/DetourAlloc.h
+++ b/dep/recastnavigation/Detour/Include/DetourAlloc.h
diff --git a/dep/recastnavigation/Detour/DetourAssert.h b/dep/recastnavigation/Detour/Include/DetourAssert.h
index 3cf652288fa..3cf652288fa 100644
--- a/dep/recastnavigation/Detour/DetourAssert.h
+++ b/dep/recastnavigation/Detour/Include/DetourAssert.h
diff --git a/dep/recastnavigation/Detour/DetourCommon.h b/dep/recastnavigation/Detour/Include/DetourCommon.h
index ed7c5149db9..0888614ea9b 100644
--- a/dep/recastnavigation/Detour/DetourCommon.h
+++ b/dep/recastnavigation/Detour/Include/DetourCommon.h
@@ -32,6 +32,11 @@ feature to find minor members.
/// @name General helper functions
/// @{
+/// Used to ignore a function parameter. VS complains about unused parameters
+/// and this silences the warning.
+/// @param [in] _ Unused parameter
+template<class T> void dtIgnoreUnused(const T&) { }
+
/// Swaps the values of the two parameters.
/// @param[in,out] a Value A
/// @param[in,out] b Value B
diff --git a/dep/recastnavigation/Detour/DetourNavMesh.h b/dep/recastnavigation/Detour/Include/DetourNavMesh.h
index c094e4134d5..cdd473f1aff 100644
--- a/dep/recastnavigation/Detour/DetourNavMesh.h
+++ b/dep/recastnavigation/Detour/Include/DetourNavMesh.h
@@ -25,7 +25,8 @@
// Edited by TC
#if defined(WIN32) && !defined(__MINGW32__)
-typedef unsigned __int64 uint64;
+/// Do not rename back to uint64. Otherwise mac complains about typedef redefinition
+typedef unsigned __int64 uint64_d;
#else
#include <stdint.h>
#ifndef uint64_t
@@ -33,7 +34,8 @@ typedef unsigned __int64 uint64;
#include <linux/types.h>
#endif
#endif
-typedef uint64_t uint64;
+/// Do not rename back to uint64. Otherwise mac complains about typedef redefinition
+typedef uint64_t uint64_d;
#endif
// Note: If you want to use 64-bit refs, change the types of both dtPolyRef & dtTileRef.
@@ -42,17 +44,13 @@ typedef uint64_t uint64;
// Edited by TC
// We cannot have over 31 bits for either tile nor poly
// without changing polyCount to use 64bits too.
-static const int STATIC_SALT_BITS = 12;
-static const int STATIC_TILE_BITS = 21;
-static const int STATIC_POLY_BITS = 31;
-
/// A handle to a polygon within a navigation mesh tile.
/// @ingroup detour
-typedef uint64 dtPolyRef; // Edited by TC
+typedef uint64_d dtPolyRef; // Edited by TC
/// A handle to a tile within a navigation mesh.
/// @ingroup detour
-typedef uint64 dtTileRef; // Edited by TC
+typedef uint64_d dtTileRef; // Edited by TC
/// The maximum number of vertices per navigation polygon.
/// @ingroup detour
@@ -92,6 +90,12 @@ static const unsigned int DT_OFFMESH_CON_BIDIR = 1;
/// @ingroup detour
static const int DT_MAX_AREAS = 64;
+static const int STATIC_SALT_BITS = 12;
+static const int STATIC_TILE_BITS = 21;
+static const int STATIC_POLY_BITS = 31;
+// We cannot have over 31 bits for either tile or poly
+// without changing polyCount to use 64 bits too.
+
/// Tile flags used for various functions and fields.
/// For an example, see dtNavMesh::addTile().
enum dtTileFlags
@@ -583,8 +587,7 @@ private:
dtPolyRef findNearestPolyInTile(const dtMeshTile* tile, const float* center,
const float* extents, float* nearestPt) const;
/// Returns closest point on polygon.
- void closestPointOnPolyInTile(const dtMeshTile* tile, unsigned int ip,
- const float* pos, float* closest) const;
+ void closestPointOnPoly(dtPolyRef ref, const float* pos, float* closest, bool* posOverPoly) const;
dtNavMeshParams m_params; ///< Current initialization params. TODO: do not store this info twice.
float m_orig[3]; ///< Origin of the tile (0,0)
diff --git a/dep/recastnavigation/Detour/DetourNavMeshBuilder.h b/dep/recastnavigation/Detour/Include/DetourNavMeshBuilder.h
index c80d1717630..c80d1717630 100644
--- a/dep/recastnavigation/Detour/DetourNavMeshBuilder.h
+++ b/dep/recastnavigation/Detour/Include/DetourNavMeshBuilder.h
diff --git a/dep/recastnavigation/Detour/DetourNavMeshQuery.h b/dep/recastnavigation/Detour/Include/DetourNavMeshQuery.h
index d431bf177bd..4a5112c9eb9 100644
--- a/dep/recastnavigation/Detour/DetourNavMeshQuery.h
+++ b/dep/recastnavigation/Detour/Include/DetourNavMeshQuery.h
@@ -200,8 +200,8 @@ public:
/// Finalizes and returns the results of an incomplete sliced path query, returning the path to the furthest
/// polygon on the existing path that was visited during the search.
- /// @param[out] existing An array of polygon references for the existing path.
- /// @param[out] existingSize The number of polygon in the @p existing array.
+ /// @param[in] existing An array of polygon references for the existing path.
+	/// @param[in]	existingSize	The number of polygons in the @p existing array.
/// @param[out] path An ordered list of polygon references representing the path. (Start to end.)
/// [(polyRef) * @p pathCount]
/// @param[out] pathCount The number of polygons returned in the @p path array.
@@ -378,8 +378,9 @@ public:
/// @param[in] ref The reference id of the polygon.
/// @param[in] pos The position to check. [(x, y, z)]
/// @param[out] closest The closest point on the polygon. [(x, y, z)]
+	/// @param[out]	posOverPoly	True if the position is over the polygon.
/// @returns The status flags for the query.
- dtStatus closestPointOnPoly(dtPolyRef ref, const float* pos, float* closest) const;
+ dtStatus closestPointOnPoly(dtPolyRef ref, const float* pos, float* closest, bool* posOverPoly) const;
/// Returns a point on the boundary closest to the source point if the source point is outside the
/// polygon's xz-bounds.
@@ -428,12 +429,7 @@ private:
/// Queries polygons within a tile.
int queryPolygonsInTile(const dtMeshTile* tile, const float* qmin, const float* qmax, const dtQueryFilter* filter,
dtPolyRef* polys, const int maxPolys) const;
- /// Find nearest polygon within a tile.
- dtPolyRef findNearestPolyInTile(const dtMeshTile* tile, const float* center, const float* extents,
- const dtQueryFilter* filter, float* nearestPt) const;
- /// Returns closest point on polygon.
- void closestPointOnPolyInTile(const dtMeshTile* tile, const dtPoly* poly, const float* pos, float* closest) const;
-
+
/// Returns portal points between two polygons.
dtStatus getPortalPoints(dtPolyRef from, dtPolyRef to, float* left, float* right,
unsigned char& fromType, unsigned char& toType) const;
diff --git a/dep/recastnavigation/Detour/DetourNode.h b/dep/recastnavigation/Detour/Include/DetourNode.h
index b68c922d038..b68c922d038 100644
--- a/dep/recastnavigation/Detour/DetourNode.h
+++ b/dep/recastnavigation/Detour/Include/DetourNode.h
diff --git a/dep/recastnavigation/Detour/DetourStatus.h b/dep/recastnavigation/Detour/Include/DetourStatus.h
index af822c4a92d..af822c4a92d 100644
--- a/dep/recastnavigation/Detour/DetourStatus.h
+++ b/dep/recastnavigation/Detour/Include/DetourStatus.h
diff --git a/dep/recastnavigation/Detour/DetourAlloc.cpp b/dep/recastnavigation/Detour/Source/DetourAlloc.cpp
index 5f671df5bdb..5f671df5bdb 100644
--- a/dep/recastnavigation/Detour/DetourAlloc.cpp
+++ b/dep/recastnavigation/Detour/Source/DetourAlloc.cpp
diff --git a/dep/recastnavigation/Detour/DetourCommon.cpp b/dep/recastnavigation/Detour/Source/DetourCommon.cpp
index b5700f5930b..b5700f5930b 100644
--- a/dep/recastnavigation/Detour/DetourCommon.cpp
+++ b/dep/recastnavigation/Detour/Source/DetourCommon.cpp
diff --git a/dep/recastnavigation/Detour/DetourNavMesh.cpp b/dep/recastnavigation/Detour/Source/DetourNavMesh.cpp
index 6b8e2d9d649..51740509950 100644
--- a/dep/recastnavigation/Detour/DetourNavMesh.cpp
+++ b/dep/recastnavigation/Detour/Source/DetourNavMesh.cpp
@@ -609,10 +609,12 @@ void dtNavMesh::baseOffMeshLinks(dtMeshTile* tile)
}
}
-void dtNavMesh::closestPointOnPolyInTile(const dtMeshTile* tile, unsigned int ip,
- const float* pos, float* closest) const
+void dtNavMesh::closestPointOnPoly(dtPolyRef ref, const float* pos, float* closest, bool* posOverPoly) const
{
- const dtPoly* poly = &tile->polys[ip];
+ const dtMeshTile* tile = 0;
+ const dtPoly* poly = 0;
+ getTileAndPolyByRefUnsafe(ref, &tile, &poly);
+
// Off-mesh connections don't have detail polygons.
if (poly->getType() == DT_POLYTYPE_OFFMESH_CONNECTION)
{
@@ -622,11 +624,14 @@ void dtNavMesh::closestPointOnPolyInTile(const dtMeshTile* tile, unsigned int ip
const float d1 = dtVdist(pos, v1);
const float u = d0 / (d0+d1);
dtVlerp(closest, v0, v1, u);
+ if (posOverPoly)
+ *posOverPoly = false;
return;
}
+ const unsigned int ip = (unsigned int)(poly - tile->polys);
const dtPolyDetail* pd = &tile->detailMeshes[ip];
-
+
// Clamp point to be inside the polygon.
float verts[DT_VERTS_PER_POLYGON*3];
float edged[DT_VERTS_PER_POLYGON];
@@ -652,6 +657,14 @@ void dtNavMesh::closestPointOnPolyInTile(const dtMeshTile* tile, unsigned int ip
const float* va = &verts[imin*3];
const float* vb = &verts[((imin+1)%nv)*3];
dtVlerp(closest, va, vb, edget[imin]);
+
+ if (posOverPoly)
+ *posOverPoly = false;
+ }
+ else
+ {
+ if (posOverPoly)
+ *posOverPoly = true;
}
// Find height at the location.
@@ -694,12 +707,27 @@ dtPolyRef dtNavMesh::findNearestPolyInTile(const dtMeshTile* tile,
{
dtPolyRef ref = polys[i];
float closestPtPoly[3];
- closestPointOnPolyInTile(tile, decodePolyIdPoly(ref), center, closestPtPoly);
- float d = dtVdistSqr(center, closestPtPoly);
+ float diff[3];
+ bool posOverPoly = false;
+ float d = 0;
+ closestPointOnPoly(ref, center, closestPtPoly, &posOverPoly);
+
+ // If a point is directly over a polygon and closer than
+ // climb height, favor that instead of straight line nearest point.
+ dtVsub(diff, center, closestPtPoly);
+ if (posOverPoly)
+ {
+ d = dtAbs(diff[1]) - tile->header->walkableClimb;
+ d = d > 0 ? d*d : 0;
+ }
+ else
+ {
+ d = dtVlenSqr(diff);
+ }
+
if (d < nearestDistanceSqr)
{
- if (nearestPt)
- dtVcopy(nearestPt, closestPtPoly);
+ dtVcopy(nearestPt, closestPtPoly);
nearestDistanceSqr = d;
nearest = ref;
}
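
The scoring rule added to findNearestPolyInTile() prefers polygons directly underfoot: vertical offset within the tile's walkableClimb costs nothing, and only the excess is squared. A minimal sketch of that rule:

#include <cmath>

// diff = center - closestPtPoly, as computed in the hunk above.
float nearestScore(const float diff[3], bool posOverPoly, float walkableClimb) {
    if (posOverPoly) {
        float d = std::fabs(diff[1]) - walkableClimb; // vertical slack first
        return d > 0 ? d * d : 0;                     // within climb: free
    }
    // Not over the polygon: plain squared straight-line distance.
    return diff[0]*diff[0] + diff[1]*diff[1] + diff[2]*diff[2];
}
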
diff --git a/dep/recastnavigation/Detour/DetourNavMeshBuilder.cpp b/dep/recastnavigation/Detour/Source/DetourNavMeshBuilder.cpp
index 9d8471b96a1..9d8471b96a1 100644
--- a/dep/recastnavigation/Detour/DetourNavMeshBuilder.cpp
+++ b/dep/recastnavigation/Detour/Source/DetourNavMeshBuilder.cpp
diff --git a/dep/recastnavigation/Detour/DetourNavMeshQuery.cpp b/dep/recastnavigation/Detour/Source/DetourNavMeshQuery.cpp
index e6557cf707e..f1709dfd4cf 100644
--- a/dep/recastnavigation/Detour/DetourNavMeshQuery.cpp
+++ b/dep/recastnavigation/Detour/Source/DetourNavMeshQuery.cpp
@@ -502,7 +502,7 @@ dtStatus dtNavMeshQuery::findRandomPointAroundCircle(dtPolyRef startRef, const f
///
/// See closestPointOnPolyBoundary() for a limited but faster option.
///
-dtStatus dtNavMeshQuery::closestPointOnPoly(dtPolyRef ref, const float* pos, float* closest) const
+dtStatus dtNavMeshQuery::closestPointOnPoly(dtPolyRef ref, const float* pos, float* closest, bool* posOverPoly) const
{
dtAssert(m_nav);
const dtMeshTile* tile = 0;
@@ -511,19 +511,7 @@ dtStatus dtNavMeshQuery::closestPointOnPoly(dtPolyRef ref, const float* pos, flo
return DT_FAILURE | DT_INVALID_PARAM;
if (!tile)
return DT_FAILURE | DT_INVALID_PARAM;
-
- // Edited by TC
- if (poly->getType() == DT_POLYTYPE_OFFMESH_CONNECTION)
- return DT_FAILURE;
-
- closestPointOnPolyInTile(tile, poly, pos, closest);
- return DT_SUCCESS;
-}
-
-void dtNavMeshQuery::closestPointOnPolyInTile(const dtMeshTile* tile, const dtPoly* poly,
- const float* pos, float* closest) const
-{
// Off-mesh connections don't have detail polygons.
if (poly->getType() == DT_POLYTYPE_OFFMESH_CONNECTION)
{
@@ -533,7 +521,9 @@ void dtNavMeshQuery::closestPointOnPolyInTile(const dtMeshTile* tile, const dtPo
const float d1 = dtVdist(pos, v1);
const float u = d0 / (d0+d1);
dtVlerp(closest, v0, v1, u);
- return;
+ if (posOverPoly)
+ *posOverPoly = false;
+ return DT_SUCCESS;
}
const unsigned int ip = (unsigned int)(poly - tile->polys);
@@ -564,6 +554,14 @@ void dtNavMeshQuery::closestPointOnPolyInTile(const dtMeshTile* tile, const dtPo
const float* va = &verts[imin*3];
const float* vb = &verts[((imin+1)%nv)*3];
dtVlerp(closest, va, vb, edget[imin]);
+
+ if (posOverPoly)
+ *posOverPoly = false;
+ }
+ else
+ {
+ if (posOverPoly)
+ *posOverPoly = true;
}
// Find height at the location.
@@ -585,30 +583,8 @@ void dtNavMeshQuery::closestPointOnPolyInTile(const dtMeshTile* tile, const dtPo
break;
}
}
-
-/* float closestDistSqr = FLT_MAX;
- for (int j = 0; j < pd->triCount; ++j)
- {
- const unsigned char* t = &tile->detailTris[(pd->triBase+j)*4];
- const float* v[3];
- for (int k = 0; k < 3; ++k)
- {
- if (t[k] < poly->vertCount)
- v[k] = &tile->verts[poly->verts[t[k]]*3];
- else
- v[k] = &tile->detailVerts[(pd->vertBase+(t[k]-poly->vertCount))*3];
- }
-
- float pt[3];
- dtClosestPtPointTriangle(pt, pos, v[0], v[1], v[2]);
- float d = dtVdistSqr(pos, pt);
-
- if (d < closestDistSqr)
- {
- dtVcopy(closest, pt);
- closestDistSqr = d;
- }
- }*/
+
+ return DT_SUCCESS;
}
/// @par
@@ -687,8 +663,8 @@ dtStatus dtNavMeshQuery::getPolyHeight(dtPolyRef ref, const float* pos, float* h
{
const float* v0 = &tile->verts[poly->verts[0]*3];
const float* v1 = &tile->verts[poly->verts[1]*3];
- const float d0 = dtVdist(pos, v0);
- const float d1 = dtVdist(pos, v1);
+ const float d0 = dtVdist2D(pos, v0);
+ const float d1 = dtVdist2D(pos, v1);
const float u = d0 / (d0+d1);
if (height)
*height = v0[1] + (v1[1] - v0[1]) * u;
@@ -752,8 +728,27 @@ dtStatus dtNavMeshQuery::findNearestPoly(const float* center, const float* exten
{
dtPolyRef ref = polys[i];
float closestPtPoly[3];
- closestPointOnPoly(ref, center, closestPtPoly);
- float d = dtVdistSqr(center, closestPtPoly);
+ float diff[3];
+ bool posOverPoly = false;
+ float d = 0;
+ closestPointOnPoly(ref, center, closestPtPoly, &posOverPoly);
+
+ // If a point is directly over a polygon and closer than
+ // climb height, favor that instead of straight line nearest point.
+ dtVsub(diff, center, closestPtPoly);
+ if (posOverPoly)
+ {
+ const dtMeshTile* tile = 0;
+ const dtPoly* poly = 0;
+ m_nav->getTileAndPolyByRefUnsafe(polys[i], &tile, &poly);
+ d = dtAbs(diff[1]) - tile->header->walkableClimb;
+ d = d > 0 ? d*d : 0;
+ }
+ else
+ {
+ d = dtVlenSqr(diff);
+ }
+
if (d < nearestDistanceSqr)
{
if (nearestPt)
@@ -769,42 +764,6 @@ dtStatus dtNavMeshQuery::findNearestPoly(const float* center, const float* exten
return DT_SUCCESS;
}
-dtPolyRef dtNavMeshQuery::findNearestPolyInTile(const dtMeshTile* tile, const float* center, const float* extents,
- const dtQueryFilter* filter, float* nearestPt) const
-{
- dtAssert(m_nav);
-
- float bmin[3], bmax[3];
- dtVsub(bmin, center, extents);
- dtVadd(bmax, center, extents);
-
- // Get nearby polygons from proximity grid.
- dtPolyRef polys[128];
- int polyCount = queryPolygonsInTile(tile, bmin, bmax, filter, polys, 128);
-
- // Find nearest polygon amongst the nearby polygons.
- dtPolyRef nearest = 0;
- float nearestDistanceSqr = FLT_MAX;
- for (int i = 0; i < polyCount; ++i)
- {
- dtPolyRef ref = polys[i];
- const dtPoly* poly = &tile->polys[m_nav->decodePolyIdPoly(ref)];
- float closestPtPoly[3];
- closestPointOnPolyInTile(tile, poly, center, closestPtPoly);
-
- float d = dtVdistSqr(center, closestPtPoly);
- if (d < nearestDistanceSqr)
- {
- if (nearestPt)
- dtVcopy(nearestPt, closestPtPoly);
- nearestDistanceSqr = d;
- nearest = ref;
- }
- }
-
- return nearest;
-}
-
int dtNavMeshQuery::queryPolygonsInTile(const dtMeshTile* tile, const float* qmin, const float* qmax,
const dtQueryFilter* filter,
dtPolyRef* polys, const int maxPolys) const
@@ -3347,7 +3306,7 @@ dtStatus dtNavMeshQuery::findDistanceToWall(dtPolyRef startRef, const float* cen
dtVsub(hitNormal, centerPos, hitPos);
dtVnormalize(hitNormal);
- *hitDist = dtSqrt(radiusSqr);
+ *hitDist = sqrtf(radiusSqr);
return status;
}
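
Call sites of closestPointOnPoly() gain the posOverPoly out-flag; the implementation null-checks it before writing, so passing 0 stays safe when the flag is not needed. A small call-site sketch (helper name hypothetical):

#include "DetourNavMeshQuery.h"

// Returns true only when the query point sits vertically over the polygon.
bool snapToPoly(const dtNavMeshQuery &query, dtPolyRef ref,
                const float pos[3], float closest[3]) {
    bool posOverPoly = false;
    dtStatus st = query.closestPointOnPoly(ref, pos, closest, &posOverPoly);
    return dtStatusSucceed(st) && posOverPoly;
}
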
diff --git a/dep/recastnavigation/Detour/DetourNode.cpp b/dep/recastnavigation/Detour/Source/DetourNode.cpp
index 4c8215e20d0..4c8215e20d0 100644
--- a/dep/recastnavigation/Detour/DetourNode.cpp
+++ b/dep/recastnavigation/Detour/Source/DetourNode.cpp
diff --git a/dep/recastnavigation/Readme.txt b/dep/recastnavigation/Readme.txt
index 0c2f7b1675f..1383b01d582 100644
--- a/dep/recastnavigation/Readme.txt
+++ b/dep/recastnavigation/Readme.txt
@@ -32,9 +32,6 @@ the regions as simple polygons.
The toolset code is located in the Recast folder and demo application using the Recast
toolset is located in the RecastDemo folder.
-The project files with this distribution can be compiled with Microsoft Visual C++ 2008
-(you can download it for free) and XCode 3.1.
-
Detour
@@ -43,78 +40,7 @@ Recast is accompanied with Detour, path-finding and spatial reasoning toolkit. Y
Detour offers simple static navigation mesh which is suitable for many simple cases, as well as tiled navigation mesh which allows you to plug in and out pieces of the mesh. The tiled mesh allows to create systems where you stream new navigation data in and out as the player progresses the level, or you may regenerate tiles as the world changes.
-Latest code available at http://code.google.com/p/recastnavigation/
-
-
---
-
-Release Notes
-
-----------------
-* Recast 1.4
- Released August 24th, 2009
-
-- Added detail height mesh generation (RecastDetailMesh.cpp) for single,
- tiled statmeshes as well as tilemesh.
-- Added feature to contour tracing which detects extra vertices along
- tile edges which should be removed later.
-- Changed the tiled stat mesh preprocess, so that it first generated
- polymeshes per tile and finally combines them.
-- Fixed bug in the GUI code where invisible buttons could be pressed.
-
-----------------
-* Recast 1.31
- Released July 24th, 2009
-
-- Better cost and heuristic functions.
-- Fixed tile navmesh raycast on tile borders.
-
-----------------
-* Recast 1.3
- Released July 14th, 2009
-
-- Added dtTileNavMesh which allows to dynamically add and remove navmesh pieces at runtime.
-- Renamed stat navmesh types to dtStat* (i.e. dtPoly is now dtStatPoly).
-- Moved common code used by tile and stat navmesh to DetourNode.h/cpp and DetourCommon.h/cpp.
-- Refactores the demo code.
-
-----------------
-* Recast 1.2
- Released June 17th, 2009
-
-- Added tiled mesh generation. The tiled generation allows to generate navigation for
- much larger worlds, it removes some of the artifacts that comes from distance fields
- in open areas, and allows later streaming and dynamic runtime generation
-- Improved and added some debug draw modes
-- API change: The helper function rcBuildNavMesh does not exists anymore,
- had to change few internal things to cope with the tiled processing,
- similar API functionality will be added later once the tiled process matures
-- The demo is getting way too complicated, need to split demos
-- Fixed several filtering functions so that the mesh is tighter to the geometry,
- sometimes there could be up error up to tow voxel units close to walls,
- now it should be just one.
-
-----------------
-* Recast 1.1
- Released April 11th, 2009
-
-This is the first release of Detour.
-
-----------------
-* Recast 1.0
- Released March 29th, 2009
-
-This is the first release of Recast.
-
-The process is not always as robust as I would wish. The watershed phase sometimes swallows tiny islands
-which are close to edges. These droppings are handled in rcBuildContours, but the code is not
-particularly robust either.
-
-Another non-robust case is when portal contours (contours shared between two regions) are always
-assumed to be straight. That can lead to overlapping contours specially when the level has
-large open areas.
-
-
+Latest code available at https://github.com/memononen/recastnavigation
Mikko Mononen
memon@inside.org
diff --git a/dep/recastnavigation/Recast/CMakeLists.txt b/dep/recastnavigation/Recast/CMakeLists.txt
index 09f20b4ed2f..f4869bf8773 100644
--- a/dep/recastnavigation/Recast/CMakeLists.txt
+++ b/dep/recastnavigation/Recast/CMakeLists.txt
@@ -9,18 +9,20 @@
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
set(Recast_STAT_SRCS
- Recast.cpp
- RecastAlloc.cpp
- RecastArea.cpp
- RecastContour.cpp
- RecastFilter.cpp
- RecastLayers.cpp
- RecastMesh.cpp
- RecastMeshDetail.cpp
- RecastRasterization.cpp
- RecastRegion.cpp
+ Source/Recast.cpp
+ Source/RecastAlloc.cpp
+ Source/RecastArea.cpp
+ Source/RecastContour.cpp
+ Source/RecastFilter.cpp
+ Source/RecastLayers.cpp
+ Source/RecastMesh.cpp
+ Source/RecastMeshDetail.cpp
+ Source/RecastRasterization.cpp
+ Source/RecastRegion.cpp
)
+include_directories(Include)
+
if(WIN32)
include_directories(
${CMAKE_SOURCE_DIR}/dep/zlib
diff --git a/dep/recastnavigation/Recast/Recast.h b/dep/recastnavigation/Recast/Include/Recast.h
index fb36aa4c5cf..3f4ae96d1c9 100644
--- a/dep/recastnavigation/Recast/Recast.h
+++ b/dep/recastnavigation/Recast/Include/Recast.h
@@ -219,7 +219,7 @@ struct rcConfig
int maxEdgeLen;
/// The maximum distance a simplfied contour's border edges should deviate
- /// the original raw contour. [Limit: >=0] [Units: wu]
+ /// the original raw contour. [Limit: >=0] [Units: vx]
float maxSimplificationError;
/// The minimum number of cells allowed to form isolated island areas. [Limit: >=0] [Units: vx]
@@ -549,6 +549,11 @@ static const int RC_NOT_CONNECTED = 0x3f;
/// @name General helper functions
/// @{
+/// Used to ignore a function parameter. VS complains about unused parameters
+/// and this silences the warning.
+/// @param [in] _ Unused parameter
+template<class T> void rcIgnoreUnused(const T&) { }
+
/// Swaps the values of the two parameters.
/// @param[in,out] a Value A
/// @param[in,out] b Value B
diff --git a/dep/recastnavigation/Recast/RecastAlloc.h b/dep/recastnavigation/Recast/Include/RecastAlloc.h
index 438be9ea56b..438be9ea56b 100644
--- a/dep/recastnavigation/Recast/RecastAlloc.h
+++ b/dep/recastnavigation/Recast/Include/RecastAlloc.h
diff --git a/dep/recastnavigation/Recast/RecastAssert.h b/dep/recastnavigation/Recast/Include/RecastAssert.h
index 2aca0d9a14f..2aca0d9a14f 100644
--- a/dep/recastnavigation/Recast/RecastAssert.h
+++ b/dep/recastnavigation/Recast/Include/RecastAssert.h
diff --git a/dep/recastnavigation/Recast/Recast.cpp b/dep/recastnavigation/Recast/Source/Recast.cpp
index 803daac3bcf..b9d86036c3f 100644
--- a/dep/recastnavigation/Recast/Recast.cpp
+++ b/dep/recastnavigation/Recast/Source/Recast.cpp
@@ -208,12 +208,11 @@ void rcCalcGridSize(const float* bmin, const float* bmax, float cs, int* w, int*
/// See the #rcConfig documentation for more information on the configuration parameters.
///
/// @see rcAllocHeightfield, rcHeightfield
-bool rcCreateHeightfield(rcContext* /*ctx*/, rcHeightfield& hf, int width, int height,
+bool rcCreateHeightfield(rcContext* ctx, rcHeightfield& hf, int width, int height,
const float* bmin, const float* bmax,
float cs, float ch)
{
- // TODO: VC complains about unref formal variable, figure out a way to handle this better.
-// rcAssert(ctx);
+ rcIgnoreUnused(ctx);
hf.width = width;
hf.height = height;
@@ -245,13 +244,12 @@ static void calcTriNormal(const float* v0, const float* v1, const float* v2, flo
/// See the #rcConfig documentation for more information on the configuration parameters.
///
/// @see rcHeightfield, rcClearUnwalkableTriangles, rcRasterizeTriangles
-void rcMarkWalkableTriangles(rcContext* /*ctx*/, const float walkableSlopeAngle,
+void rcMarkWalkableTriangles(rcContext* ctx, const float walkableSlopeAngle,
const float* verts, int /*nv*/,
const int* tris, int nt,
unsigned char* areas)
{
- // TODO: VC complains about unref formal variable, figure out a way to handle this better.
-// rcAssert(ctx);
+ rcIgnoreUnused(ctx);
const float walkableThr = cosf(walkableSlopeAngle/180.0f*RC_PI);
@@ -275,13 +273,12 @@ void rcMarkWalkableTriangles(rcContext* /*ctx*/, const float walkableSlopeAngle,
/// See the #rcConfig documentation for more information on the configuration parameters.
///
/// @see rcHeightfield, rcClearUnwalkableTriangles, rcRasterizeTriangles
-void rcClearUnwalkableTriangles(rcContext* /*ctx*/, const float walkableSlopeAngle,
+void rcClearUnwalkableTriangles(rcContext* ctx, const float walkableSlopeAngle,
const float* verts, int /*nv*/,
const int* tris, int nt,
unsigned char* areas)
{
- // TODO: VC complains about unref formal variable, figure out a way to handle this better.
-// rcAssert(ctx);
+ rcIgnoreUnused(ctx);
const float walkableThr = cosf(walkableSlopeAngle/180.0f*RC_PI);
@@ -297,10 +294,9 @@ void rcClearUnwalkableTriangles(rcContext* /*ctx*/, const float walkableSlopeAng
}
}
-int rcGetHeightFieldSpanCount(rcContext* /*ctx*/, rcHeightfield& hf)
+int rcGetHeightFieldSpanCount(rcContext* ctx, rcHeightfield& hf)
{
- // TODO: VC complains about unref formal variable, figure out a way to handle this better.
-// rcAssert(ctx);
+ rcIgnoreUnused(ctx);
const int w = hf.width;
const int h = hf.height;
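
The rcIgnoreUnused() helper replaces commented-out parameter names and disabled asserts as the way to silence MSVC's unreferenced-parameter warning while keeping the name visible to documentation. A sketch of the idiom outside Recast:

// Consuming the parameter through an empty template function generates no
// code but counts as a use, so the warning disappears.
template <class T> void ignoreUnused(const T &) {}

int spanCount(int *ctx, int width, int height) {
    ignoreUnused(ctx); // ctx reserved for logging/timers; intentionally unused
    return width * height;
}
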
diff --git a/dep/recastnavigation/Recast/RecastAlloc.cpp b/dep/recastnavigation/Recast/Source/RecastAlloc.cpp
index b5ec1516146..b5ec1516146 100644
--- a/dep/recastnavigation/Recast/RecastAlloc.cpp
+++ b/dep/recastnavigation/Recast/Source/RecastAlloc.cpp
diff --git a/dep/recastnavigation/Recast/RecastArea.cpp b/dep/recastnavigation/Recast/Source/RecastArea.cpp
index 1a338cd9b8c..1a338cd9b8c 100644
--- a/dep/recastnavigation/Recast/RecastArea.cpp
+++ b/dep/recastnavigation/Recast/Source/RecastArea.cpp
diff --git a/dep/recastnavigation/Recast/RecastContour.cpp b/dep/recastnavigation/Recast/Source/RecastContour.cpp
index 5c324bcedfe..5c324bcedfe 100644
--- a/dep/recastnavigation/Recast/RecastContour.cpp
+++ b/dep/recastnavigation/Recast/Source/RecastContour.cpp
diff --git a/dep/recastnavigation/Recast/RecastFilter.cpp b/dep/recastnavigation/Recast/Source/RecastFilter.cpp
index bf985c362c9..bf985c362c9 100644
--- a/dep/recastnavigation/Recast/RecastFilter.cpp
+++ b/dep/recastnavigation/Recast/Source/RecastFilter.cpp
diff --git a/dep/recastnavigation/Recast/RecastLayers.cpp b/dep/recastnavigation/Recast/Source/RecastLayers.cpp
index 5ea6cb79d16..204f72e8cb2 100644
--- a/dep/recastnavigation/Recast/RecastLayers.cpp
+++ b/dep/recastnavigation/Recast/Source/RecastLayers.cpp
@@ -325,7 +325,7 @@ bool rcBuildHeightfieldLayers(rcContext* ctx, rcCompactHeightfield& chf,
continue;
// Skip if the height range would become too large.
const int ymin = rcMin(root.ymin, regn.ymin);
- const int ymax = rcMax(root.ymax, regn.ymax); // Edited by TC
+ const int ymax = rcMax(root.ymax, regn.ymax);
if ((ymax - ymin) >= 255)
continue;
@@ -373,7 +373,7 @@ bool rcBuildHeightfieldLayers(rcContext* ctx, rcCompactHeightfield& chf,
continue;
// Skip if the height range would become too large.
const int ymin = rcMin(ri.ymin, rj.ymin);
- const int ymax = rcMax(ri.ymax, rj.ymax); // Edited by TC
+ const int ymax = rcMax(ri.ymax, rj.ymax);
if ((ymax - ymin) >= 255)
continue;
diff --git a/dep/recastnavigation/Recast/RecastMesh.cpp b/dep/recastnavigation/Recast/Source/RecastMesh.cpp
index 13aad2af01c..8af609b79fb 100644
--- a/dep/recastnavigation/Recast/RecastMesh.cpp
+++ b/dep/recastnavigation/Recast/Source/RecastMesh.cpp
@@ -661,7 +661,8 @@ static bool removeVertex(rcContext* ctx, rcPolyMesh& mesh, const unsigned short
}
// Remove the polygon.
unsigned short* p2 = &mesh.polys[(mesh.npolys-1)*nvp*2];
- memcpy(p,p2,sizeof(unsigned short)*nvp);
+ if (p != p2)
+ memcpy(p,p2,sizeof(unsigned short)*nvp);
memset(p+nvp,0xff,sizeof(unsigned short)*nvp);
mesh.regs[i] = mesh.regs[mesh.npolys-1];
mesh.areas[i] = mesh.areas[mesh.npolys-1];
@@ -861,7 +862,9 @@ static bool removeVertex(rcContext* ctx, rcPolyMesh& mesh, const unsigned short
unsigned short* pa = &polys[bestPa*nvp];
unsigned short* pb = &polys[bestPb*nvp];
mergePolys(pa, pb, bestEa, bestEb, tmpPoly, nvp);
- memcpy(pb, &polys[(npolys-1)*nvp], sizeof(unsigned short)*nvp);
+ unsigned short* last = &polys[(npolys-1)*nvp];
+ if (pb != last)
+ memcpy(pb, last, sizeof(unsigned short)*nvp);
pregs[bestPb] = pregs[npolys-1];
pareas[bestPb] = pareas[npolys-1];
npolys--;
@@ -1105,7 +1108,9 @@ bool rcBuildPolyMesh(rcContext* ctx, rcContourSet& cset, const int nvp, rcPolyMe
unsigned short* pa = &polys[bestPa*nvp];
unsigned short* pb = &polys[bestPb*nvp];
mergePolys(pa, pb, bestEa, bestEb, tmpPoly, nvp);
- memcpy(pb, &polys[(npolys-1)*nvp], sizeof(unsigned short)*nvp);
+ unsigned short* lastPoly = &polys[(npolys-1)*nvp];
+ if (pb != lastPoly)
+ memcpy(pb, lastPoly, sizeof(unsigned short)*nvp);
npolys--;
}
else
@@ -1319,6 +1324,12 @@ bool rcMergePolyMeshes(rcContext* ctx, rcPolyMesh** meshes, const int nmeshes, r
const unsigned short ox = (unsigned short)floorf((pmesh->bmin[0]-mesh.bmin[0])/mesh.cs+0.5f);
const unsigned short oz = (unsigned short)floorf((pmesh->bmin[2]-mesh.bmin[2])/mesh.cs+0.5f);
+ bool isMinX = (ox == 0);
+ bool isMinZ = (oz == 0);
+ bool isMaxX = ((unsigned short)floorf((mesh.bmax[0] - pmesh->bmax[0]) / mesh.cs + 0.5f)) == 0;
+ bool isMaxZ = ((unsigned short)floorf((mesh.bmax[2] - pmesh->bmax[2]) / mesh.cs + 0.5f)) == 0;
+ bool isOnBorder = (isMinX || isMinZ || isMaxX || isMaxZ);
+
for (int j = 0; j < pmesh->nverts; ++j)
{
unsigned short* v = &pmesh->verts[j*3];
@@ -1339,6 +1350,36 @@ bool rcMergePolyMeshes(rcContext* ctx, rcPolyMesh** meshes, const int nmeshes, r
if (src[k] == RC_MESH_NULL_IDX) break;
tgt[k] = vremap[src[k]];
}
+
+ if (isOnBorder)
+ {
+ for (int k = mesh.nvp; k < mesh.nvp * 2; ++k)
+ {
+ if (src[k] & 0x8000 && src[k] != 0xffff)
+ {
+ unsigned short dir = src[k] & 0xf;
+ switch (dir)
+ {
+ case 0: // Portal x-
+ if (isMinX)
+ tgt[k] = src[k];
+ break;
+ case 1: // Portal z+
+ if (isMaxZ)
+ tgt[k] = src[k];
+ break;
+ case 2: // Portal x+
+ if (isMaxX)
+ tgt[k] = src[k];
+ break;
+ case 3: // Portal z-
+ if (isMinZ)
+ tgt[k] = src[k];
+ break;
+ }
+ }
+ }
+ }
}
}
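
The new p != p2 guards matter because memcpy() on overlapping (here, identical) regions is undefined behavior, and source equals destination whenever the polygon being removed is already the last one in the array. A stripped-down sketch (the real code also fixes up regs/areas and the second half of the entry):

#include <cstring>

void removeLastSwap(unsigned short *polys, int &npolys, int i, int nvp) {
    unsigned short *dst = &polys[i * nvp];            // slot being vacated
    unsigned short *src = &polys[(npolys - 1) * nvp]; // last polygon
    if (dst != src)                                   // skip the self-copy
        std::memcpy(dst, src, sizeof(unsigned short) * nvp);
    npolys--;
}
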
diff --git a/dep/recastnavigation/Recast/RecastMeshDetail.cpp b/dep/recastnavigation/Recast/Source/RecastMeshDetail.cpp
index f49d67400c2..8325b883707 100644
--- a/dep/recastnavigation/Recast/RecastMeshDetail.cpp
+++ b/dep/recastnavigation/Recast/Source/RecastMeshDetail.cpp
@@ -200,8 +200,8 @@ static unsigned short getHeight(const float fx, const float fy, const float fz,
{
int ix = (int)floorf(fx*ics + 0.01f);
int iz = (int)floorf(fz*ics + 0.01f);
- ix = rcClamp(ix-hp.xmin, 0, hp.width);
- iz = rcClamp(iz-hp.ymin, 0, hp.height);
+ ix = rcClamp(ix-hp.xmin, 0, hp.width - 1);
+ iz = rcClamp(iz-hp.ymin, 0, hp.height - 1);
unsigned short h = hp.data[ix+iz*hp.width];
if (h == RC_UNSET_HEIGHT)
{
@@ -554,7 +554,7 @@ static bool buildPolyDetail(rcContext* ctx, const float* in, const int nin,
float dx = vi[0] - vj[0];
float dy = vi[1] - vj[1];
float dz = vi[2] - vj[2];
- float d = rcSqrt(dx*dx + dz*dz);
+ float d = sqrtf(dx*dx + dz*dz);
int nn = 1 + (int)floorf(d/sampleDist);
if (nn >= MAX_VERTS_PER_EDGE) nn = MAX_VERTS_PER_EDGE-1;
if (nverts+nn >= MAX_VERTS)
@@ -741,7 +741,8 @@ static bool buildPolyDetail(rcContext* ctx, const float* in, const int nin,
return true;
}
-static void getHeightData(const rcCompactHeightfield& chf,
+
+static void getHeightDataSeedsFromVertices(const rcCompactHeightfield& chf,
const unsigned short* poly, const int npoly,
const unsigned short* verts, const int bs,
rcHeightPatch& hp, rcIntArray& stack)
@@ -869,8 +870,83 @@ static void getHeightData(const rcCompactHeightfield& chf,
int idx = cx-hp.xmin+(cy-hp.ymin)*hp.width;
const rcCompactSpan& cs = chf.spans[ci];
hp.data[idx] = cs.y;
+
+ // getHeightData seeds are given in coordinates with borders
+ stack[i+0] += bs;
+ stack[i+1] += bs;
}
+}
+
+
+
+static void getHeightData(const rcCompactHeightfield& chf,
+ const unsigned short* poly, const int npoly,
+ const unsigned short* verts, const int bs,
+ rcHeightPatch& hp, rcIntArray& stack,
+ int region)
+{
+	// Note: Reads from the compact heightfield are offset by border size (bs)
+ // since border size offset is already removed from the polymesh vertices.
+
+ stack.resize(0);
+ memset(hp.data, 0xff, sizeof(unsigned short)*hp.width*hp.height);
+
+ bool empty = true;
+
+ // Copy the height from the same region, and mark region borders
+ // as seed points to fill the rest.
+ for (int hy = 0; hy < hp.height; hy++)
+ {
+ int y = hp.ymin + hy + bs;
+ for (int hx = 0; hx < hp.width; hx++)
+ {
+ int x = hp.xmin + hx + bs;
+ const rcCompactCell& c = chf.cells[x+y*chf.width];
+ for (int i = (int)c.index, ni = (int)(c.index+c.count); i < ni; ++i)
+ {
+ const rcCompactSpan& s = chf.spans[i];
+ if (s.reg == region)
+ {
+ // Store height
+ hp.data[hx + hy*hp.width] = s.y;
+ empty = false;
+
+ // If any of the neighbours is not in same region,
+ // add the current location as flood fill start
+ bool border = false;
+ for (int dir = 0; dir < 4; ++dir)
+ {
+ if (rcGetCon(s, dir) != RC_NOT_CONNECTED)
+ {
+ const int ax = x + rcGetDirOffsetX(dir);
+ const int ay = y + rcGetDirOffsetY(dir);
+ const int ai = (int)chf.cells[ax+ay*chf.width].index + rcGetCon(s, dir);
+ const rcCompactSpan& as = chf.spans[ai];
+ if (as.reg != region)
+ {
+ border = true;
+ break;
+ }
+ }
+ }
+ if (border)
+ {
+ stack.push(x);
+ stack.push(y);
+ stack.push(i);
+ }
+ break;
+ }
+ }
+ }
+ }
+
+	// if the polygon does not contain any points from the current region (rare, but happens)
+ // then use the cells closest to the polygon vertices as seeds to fill the height field
+ if (empty)
+ getHeightDataSeedsFromVertices(chf, poly, npoly, verts, bs, hp, stack);
+
static const int RETRACT_SIZE = 256;
int head = 0;
@@ -895,26 +971,25 @@ static void getHeightData(const rcCompactHeightfield& chf,
const int ax = cx + rcGetDirOffsetX(dir);
const int ay = cy + rcGetDirOffsetY(dir);
+ const int hx = ax - hp.xmin - bs;
+ const int hy = ay - hp.ymin - bs;
- if (ax < hp.xmin || ax >= (hp.xmin+hp.width) ||
- ay < hp.ymin || ay >= (hp.ymin+hp.height))
+ if (hx < 0 || hx >= hp.width || hy < 0 || hy >= hp.height)
continue;
- if (hp.data[ax-hp.xmin+(ay-hp.ymin)*hp.width] != RC_UNSET_HEIGHT)
+ if (hp.data[hx + hy*hp.width] != RC_UNSET_HEIGHT)
continue;
- const int ai = (int)chf.cells[(ax+bs)+(ay+bs)*chf.width].index + rcGetCon(cs, dir);
-
+ const int ai = (int)chf.cells[ax + ay*chf.width].index + rcGetCon(cs, dir);
const rcCompactSpan& as = chf.spans[ai];
- int idx = ax-hp.xmin+(ay-hp.ymin)*hp.width;
- hp.data[idx] = as.y;
+
+ hp.data[hx + hy*hp.width] = as.y;
stack.push(ax);
stack.push(ay);
stack.push(ai);
}
}
-
}
static unsigned char getEdgeFlags(const float* va, const float* vb,
@@ -1072,7 +1147,7 @@ bool rcBuildPolyMeshDetail(rcContext* ctx, const rcPolyMesh& mesh, const rcCompa
hp.ymin = bounds[i*4+2];
hp.width = bounds[i*4+1]-bounds[i*4+0];
hp.height = bounds[i*4+3]-bounds[i*4+2];
- getHeightData(chf, p, npoly, mesh.verts, borderSize, hp, stack);
+ getHeightData(chf, p, npoly, mesh.verts, borderSize, hp, stack, mesh.regs[i]);
// Build detail mesh.
int nverts = 0;
@@ -1242,4 +1317,3 @@ bool rcMergePolyMeshDetails(rcContext* ctx, rcPolyMeshDetail** meshes, const int
return true;
}
-
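
getHeightData() now seeds the height patch from spans whose region matches the polygon, queuing only region-border cells for the flood fill, and falls back to the old vertex-based seeding when the polygon holds no such span. A condensed sketch with simplified grid types (caller sizes height to w*h):

#include <vector>

struct Cell { int y; int region; };

void seedHeights(const std::vector<Cell> &grid, int w, int h, int polyRegion,
                 std::vector<int> &height, std::vector<int> &seeds) {
    const int dx[4] = {-1, 1, 0, 0}, dy[4] = {0, 0, -1, 1};
    for (int y = 0; y < h; ++y)
        for (int x = 0; x < w; ++x) {
            int i = x + y * w;
            if (grid[i].region != polyRegion)
                continue;                          // other region: stays unset
            height[i] = grid[i].y;                 // copy height directly
            for (int d = 0; d < 4; ++d) {          // border test: any foreign
                int ax = x + dx[d], ay = y + dy[d]; // 4-neighbour?
                if (ax < 0 || ay < 0 || ax >= w || ay >= h ||
                    grid[ax + ay * w].region != polyRegion) {
                    seeds.push_back(i);            // flood fill grows from here
                    break;
                }
            }
        }
}
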
diff --git a/dep/recastnavigation/Recast/RecastRasterization.cpp b/dep/recastnavigation/Recast/Source/RecastRasterization.cpp
index d2bb7c98f18..45a7d35bf3e 100644
--- a/dep/recastnavigation/Recast/RecastRasterization.cpp
+++ b/dep/recastnavigation/Recast/Source/RecastRasterization.cpp
@@ -95,7 +95,7 @@ static void addSpan(rcHeightfield& hf, const int x, const int y,
s->area = area;
s->next = 0;
- // Empty cell, add he first span.
+ // Empty cell, add the first span.
if (!hf.spans[idx])
{
hf.spans[idx] = s;
@@ -169,36 +169,64 @@ void rcAddSpan(rcContext* /*ctx*/, rcHeightfield& hf, const int x, const int y,
addSpan(hf, x,y, smin, smax, area, flagMergeThr);
}
-static int clipPoly(const float* in, int n, float* out, float pnx, float pnz, float pd)
+// divides a convex polygon into two convex polygons on both sides of a line
+static void dividePoly(const float* in, int nin,
+ float* out1, int* nout1,
+ float* out2, int* nout2,
+ float x, int axis)
{
float d[12];
- for (int i = 0; i < n; ++i)
- d[i] = pnx*in[i*3+0] + pnz*in[i*3+2] + pd;
-
- int m = 0;
- for (int i = 0, j = n-1; i < n; j=i, ++i)
+ for (int i = 0; i < nin; ++i)
+ d[i] = x - in[i*3+axis];
+
+ int m = 0, n = 0;
+ for (int i = 0, j = nin-1; i < nin; j=i, ++i)
{
bool ina = d[j] >= 0;
bool inb = d[i] >= 0;
if (ina != inb)
{
float s = d[j] / (d[j] - d[i]);
- out[m*3+0] = in[j*3+0] + (in[i*3+0] - in[j*3+0])*s;
- out[m*3+1] = in[j*3+1] + (in[i*3+1] - in[j*3+1])*s;
- out[m*3+2] = in[j*3+2] + (in[i*3+2] - in[j*3+2])*s;
+ out1[m*3+0] = in[j*3+0] + (in[i*3+0] - in[j*3+0])*s;
+ out1[m*3+1] = in[j*3+1] + (in[i*3+1] - in[j*3+1])*s;
+ out1[m*3+2] = in[j*3+2] + (in[i*3+2] - in[j*3+2])*s;
+ rcVcopy(out2 + n*3, out1 + m*3);
m++;
+ n++;
+ // add the i'th point to the polygon on its side of the line. Do NOT add points that lie on the dividing line
+ // since the intersection point emitted above already covers them
+ if (d[i] > 0)
+ {
+ rcVcopy(out1 + m*3, in + i*3);
+ m++;
+ }
+ else if (d[i] < 0)
+ {
+ rcVcopy(out2 + n*3, in + i*3);
+ n++;
+ }
}
- if (inb)
+ else // same side
{
- out[m*3+0] = in[i*3+0];
- out[m*3+1] = in[i*3+1];
- out[m*3+2] = in[i*3+2];
- m++;
+ // add the i'th point to the polygon on its side of the line. Points on the dividing line are added to both sides
+ if (d[i] >= 0)
+ {
+ rcVcopy(out1 + m*3, in + i*3);
+ m++;
+ if (d[i] != 0)
+ continue;
+ }
+ rcVcopy(out2 + n*3, in + i*3);
+ n++;
}
}
- return m;
+
+ *nout1 = m;
+ *nout2 = n;
}
+
+
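A short usage sketch of dividePoly (not from the patch): splitting a triangle at z = 1.0 along axis 2. Points exactly on the dividing line are emitted into both outputs, so both halves stay closed polygons.

    float tri[3*3] = { 0,0,0,  2,0,0,  0,0,2 };
    float below[7*3], above[7*3];
    int nBelow, nAvove;
    dividePoly(tri, 3, below, &nBelow, above, &nAvove, 1.0f, 2); // axis 2 = z
    // below: the quad with z <= 1 (nBelow == 4); above: the triangle
    // with z >= 1 (nAvove == 3); the two cut points appear in both.
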
static void rasterizeTri(const float* v0, const float* v1, const float* v2,
const unsigned char area, rcHeightfield& hf,
const float* bmin, const float* bmax,
@@ -222,48 +250,57 @@ static void rasterizeTri(const float* v0, const float* v1, const float* v2,
if (!overlapBounds(bmin, bmax, tmin, tmax))
return;
- // Calculate the footpring of the triangle on the grid.
- int x0 = (int)((tmin[0] - bmin[0])*ics);
+ // Calculate the footprint of the triangle on the grid's y-axis
int y0 = (int)((tmin[2] - bmin[2])*ics);
- int x1 = (int)((tmax[0] - bmin[0])*ics);
int y1 = (int)((tmax[2] - bmin[2])*ics);
- x0 = rcClamp(x0, 0, w-1);
y0 = rcClamp(y0, 0, h-1);
- x1 = rcClamp(x1, 0, w-1);
y1 = rcClamp(y1, 0, h-1);
// Clip the triangle into all grid cells it touches.
- float in[7*3], out[7*3], inrow[7*3];
+ float buf[7*3*4];
+ float *in = buf, *inrow = buf+7*3, *p1 = inrow+7*3, *p2 = p1+7*3;
+
+ rcVcopy(&in[0], v0);
+ rcVcopy(&in[1*3], v1);
+ rcVcopy(&in[2*3], v2);
+ int nvrow, nvIn = 3;
for (int y = y0; y <= y1; ++y)
{
- // Clip polygon to row.
- rcVcopy(&in[0], v0);
- rcVcopy(&in[1*3], v1);
- rcVcopy(&in[2*3], v2);
- int nvrow = 3;
+ // Clip polygon to row. Store the remaining polygon as well
const float cz = bmin[2] + y*cs;
- nvrow = clipPoly(in, nvrow, out, 0, 1, -cz);
- if (nvrow < 3) continue;
- nvrow = clipPoly(out, nvrow, inrow, 0, -1, cz+cs);
+ dividePoly(in, nvIn, inrow, &nvrow, p1, &nvIn, cz+cs, 2);
+ rcSwap(in, p1);
if (nvrow < 3) continue;
+ // find the horizontal bounds in the row
+ float minX = inrow[0], maxX = inrow[0];
+ for (int i=1; i<nvrow; ++i)
+ {
+ if (minX > inrow[i*3]) minX = inrow[i*3];
+ if (maxX < inrow[i*3]) maxX = inrow[i*3];
+ }
+ int x0 = (int)((minX - bmin[0])*ics);
+ int x1 = (int)((maxX - bmin[0])*ics);
+ x0 = rcClamp(x0, 0, w-1);
+ x1 = rcClamp(x1, 0, w-1);
+
+ int nv, nv2 = nvrow;
+
for (int x = x0; x <= x1; ++x)
{
- // Clip polygon to column.
- int nv = nvrow;
+ // Clip polygon to column. Store the remaining polygon as well
const float cx = bmin[0] + x*cs;
- nv = clipPoly(inrow, nv, out, 1, 0, -cx);
- if (nv < 3) continue;
- nv = clipPoly(out, nv, in, -1, 0, cx+cs);
+ dividePoly(inrow, nv2, p1, &nv, p2, &nv2, cx+cs, 0);
+ rcSwap(inrow, p2);
if (nv < 3) continue;
// Calculate min and max of the span.
- float smin = in[1], smax = in[1];
+ float smin = p1[1], smax = p1[1];
for (int i = 1; i < nv; ++i)
{
- smin = rcMin(smin, in[i*3+1]);
- smax = rcMax(smax, in[i*3+1]);
+ smin = rcMin(smin, p1[i*3+1]);
+ smax = rcMax(smax, p1[i*3+1]);
}
smin -= bmin[1];
smax -= bmin[1];
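
The net effect of this rewrite: the old code re-clipped the whole triangle against four half-planes for every cell, while the new loop performs one dividePoly per row and one per column, each time keeping the unconsumed remainder (the rcSwap calls) for the next iteration, and derives the per-row x-range from the clipped row polygon instead of the triangle's full bounding box. Condensed control flow, for illustration only:

    for (int y = y0; y <= y1; ++y) {
        dividePoly(in, nvIn, inrow, &nvrow, p1, &nvIn, cz + cs, 2); // row / rest
        rcSwap(in, p1);              // the rest feeds the next row
        for (int x = x0; x <= x1; ++x) {
            dividePoly(inrow, nv2, p1, &nv, p2, &nv2, cx + cs, 0);  // cell / rest
            rcSwap(inrow, p2);       // the rest feeds the next column
        }
    }
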
diff --git a/dep/recastnavigation/Recast/RecastRegion.cpp b/dep/recastnavigation/Recast/Source/RecastRegion.cpp
index 76e631cc5fb..589fac29203 100644
--- a/dep/recastnavigation/Recast/RecastRegion.cpp
+++ b/dep/recastnavigation/Recast/Source/RecastRegion.cpp
@@ -286,7 +286,10 @@ static bool floodRegion(int x, int y, int i,
if (nr & RC_BORDER_REG) // Do not take borders into account.
continue;
if (nr != 0 && nr != r)
+ {
ar = nr;
+ break;
+ }
const rcCompactSpan& as = chf.spans[ai];
@@ -300,7 +303,10 @@ static bool floodRegion(int x, int y, int i,
continue;
unsigned short nr2 = srcReg[ai2];
if (nr2 != 0 && nr2 != r)
+ {
ar = nr2;
+ break;
+ }
}
}
}
@@ -340,30 +346,44 @@ static unsigned short* expandRegions(int maxIter, unsigned short level,
rcCompactHeightfield& chf,
unsigned short* srcReg, unsigned short* srcDist,
unsigned short* dstReg, unsigned short* dstDist,
- rcIntArray& stack)
+ rcIntArray& stack,
+ bool fillStack)
{
const int w = chf.width;
const int h = chf.height;
- // Find cells revealed by the raised level.
- stack.resize(0);
- for (int y = 0; y < h; ++y)
+ if (fillStack)
{
- for (int x = 0; x < w; ++x)
+ // Find cells revealed by the raised level.
+ stack.resize(0);
+ for (int y = 0; y < h; ++y)
{
- const rcCompactCell& c = chf.cells[x+y*w];
- for (int i = (int)c.index, ni = (int)(c.index+c.count); i < ni; ++i)
+ for (int x = 0; x < w; ++x)
{
- if (chf.dist[i] >= level && srcReg[i] == 0 && chf.areas[i] != RC_NULL_AREA)
+ const rcCompactCell& c = chf.cells[x+y*w];
+ for (int i = (int)c.index, ni = (int)(c.index+c.count); i < ni; ++i)
{
- stack.push(x);
- stack.push(y);
- stack.push(i);
+ if (chf.dist[i] >= level && srcReg[i] == 0 && chf.areas[i] != RC_NULL_AREA)
+ {
+ stack.push(x);
+ stack.push(y);
+ stack.push(i);
+ }
}
}
}
}
-
+ else // use cells in the input stack
+ {
+ // mark all cells which already have a region
+ for (int j=0; j<stack.size(); j+=3)
+ {
+ int i = stack[j+2];
+ if (srcReg[i] != 0)
+ stack[j+2] = -1;
+ }
+ }
+
int iter = 0;
while (stack.size() > 0)
{
@@ -434,6 +454,61 @@ static unsigned short* expandRegions(int maxIter, unsigned short level,
}
+
+static void sortCellsByLevel(unsigned short startLevel,
+ rcCompactHeightfield& chf,
+ unsigned short* srcReg,
+ unsigned int nbStacks, rcIntArray* stacks,
+ unsigned short loglevelsPerStack) // the levels per stack (2 in our case) as a bit shift
+{
+ const int w = chf.width;
+ const int h = chf.height;
+ startLevel = startLevel >> loglevelsPerStack;
+
+ for (unsigned int j=0; j<nbStacks; ++j)
+ stacks[j].resize(0);
+
+ // put all cells in the level range into the appropriate stacks
+ for (int y = 0; y < h; ++y)
+ {
+ for (int x = 0; x < w; ++x)
+ {
+ const rcCompactCell& c = chf.cells[x+y*w];
+ for (int i = (int)c.index, ni = (int)(c.index+c.count); i < ni; ++i)
+ {
+ if (chf.areas[i] == RC_NULL_AREA || srcReg[i] != 0)
+ continue;
+
+ int level = chf.dist[i] >> loglevelsPerStack;
+ int sId = startLevel - level;
+ if (sId >= (int)nbStacks)
+ continue;
+ if (sId < 0)
+ sId = 0;
+
+ stacks[sId].push(x);
+ stacks[sId].push(y);
+ stacks[sId].push(i);
+ }
+ }
+ }
+}
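
With loglevelsPerStack = 1, as rcBuildRegions passes below, each stack buckets two distance levels. A worked example of the index math under that assumption:

    // startLevel = 14 is shifted to 7 on entry.
    int level = 12 >> 1;   // 6: bucket of a cell with chf.dist[i] == 12
    int sId   = 7 - 6;     // 1: one bucket behind the current front
    // dist at or above startLevel clamps to stack 0; cells more than
    // nbStacks buckets away are skipped and picked up by a later sort.
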
+
+
+static void appendStacks(rcIntArray& srcStack, rcIntArray& dstStack,
+ unsigned short* srcReg)
+{
+ for (int j=0; j<srcStack.size(); j+=3)
+ {
+ int i = srcStack[j+2];
+ if ((i < 0) || (srcReg[i] != 0))
+ continue;
+ dstStack.push(srcStack[j]);
+ dstStack.push(srcStack[j+1]);
+ dstStack.push(srcStack[j+2]);
+ }
+}
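
Both helpers treat a stack as a flat array of (x, y, i) triplets, with a span index of -1 acting as a tombstone for cells that gained a region after being queued, as the marking loop in expandRegions shows. The traversal pattern, sketched:

    // Layout: [x0, y0, i0, x1, y1, i1, ...], stride 3.
    for (int j = 0; j < stack.size(); j += 3) {
        int i = stack[j + 2];
        if (i < 0)
            continue;   // tombstoned: already owned by a region
        // process cell (stack[j], stack[j+1]) with span index i ...
    }
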
+
struct rcRegion
{
inline rcRegion(unsigned short i) :
@@ -1236,7 +1311,13 @@ bool rcBuildRegions(rcContext* ctx, rcCompactHeightfield& chf,
}
ctx->startTimer(RC_TIMER_BUILD_REGIONS_WATERSHED);
-
+
+ const int LOG_NB_STACKS = 3;
+ const int NB_STACKS = 1 << LOG_NB_STACKS;
+ rcIntArray lvlStacks[NB_STACKS];
+ for (int i=0; i<NB_STACKS; ++i)
+ lvlStacks[i].resize(1024);
+
rcIntArray stack(1024);
rcIntArray visited(1024);
@@ -1271,14 +1352,25 @@ bool rcBuildRegions(rcContext* ctx, rcCompactHeightfield& chf,
chf.borderSize = borderSize;
}
+ int sId = -1;
while (level > 0)
{
level = level >= 2 ? level-2 : 0;
-
+ sId = (sId+1) & (NB_STACKS-1);
+
+// ctx->startTimer(RC_TIMER_DIVIDE_TO_LEVELS);
+
+ if (sId == 0)
+ sortCellsByLevel(level, chf, srcReg, NB_STACKS, lvlStacks, 1);
+ else
+ appendStacks(lvlStacks[sId-1], lvlStacks[sId], srcReg); // copy leftovers from the previous level
+
+// ctx->stopTimer(RC_TIMER_DIVIDE_TO_LEVELS);
+
ctx->startTimer(RC_TIMER_BUILD_REGIONS_EXPAND);
// Expand current regions until no empty connected cells found.
- if (expandRegions(expandIters, level, chf, srcReg, srcDist, dstReg, dstDist, stack) != srcReg)
+ if (expandRegions(expandIters, level, chf, srcReg, srcDist, dstReg, dstDist, lvlStacks[sId], false) != srcReg)
{
rcSwap(srcReg, dstReg);
rcSwap(srcDist, dstDist);
@@ -1289,18 +1381,15 @@ bool rcBuildRegions(rcContext* ctx, rcCompactHeightfield& chf,
ctx->startTimer(RC_TIMER_BUILD_REGIONS_FLOOD);
// Mark new regions with IDs.
- for (int y = 0; y < h; ++y)
+ for (int j=0; j<lvlStacks[sId].size(); j+=3)
{
- for (int x = 0; x < w; ++x)
+ int x = lvlStacks[sId][j];
+ int y = lvlStacks[sId][j+1];
+ int i = lvlStacks[sId][j+2];
+ if (i >= 0 && srcReg[i] == 0)
{
- const rcCompactCell& c = chf.cells[x+y*w];
- for (int i = (int)c.index, ni = (int)(c.index+c.count); i < ni; ++i)
- {
- if (chf.dist[i] < level || srcReg[i] != 0 || chf.areas[i] == RC_NULL_AREA)
- continue;
- if (floodRegion(x, y, i, level, regionId, chf, srcReg, srcDist, stack))
- regionId++;
- }
+ if (floodRegion(x, y, i, level, regionId, chf, srcReg, srcDist, stack))
+ regionId++;
}
}
@@ -1308,7 +1397,7 @@ bool rcBuildRegions(rcContext* ctx, rcCompactHeightfield& chf,
}
// Expand current regions until no empty connected cells found.
- if (expandRegions(expandIters*8, 0, chf, srcReg, srcDist, dstReg, dstDist, stack) != srcReg)
+ if (expandRegions(expandIters*8, 0, chf, srcReg, srcDist, dstReg, dstDist, stack, true) != srcReg)
{
rcSwap(srcReg, dstReg);
rcSwap(srcDist, dstDist);
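
Putting the pieces together: level drops by 2 per iteration while sId cycles through the 8 stacks, so one sortCellsByLevel call (2 levels per bucket x 8 buckets) feeds exactly 8 iterations before the next re-sort, with appendStacks carrying unassigned leftovers forward in between. Traced for a hypothetical starting level of 16:

    // iter 1: level 14, sId 0 -> sortCellsByLevel fills stacks 0..7
    //         with dist buckets 14-15, 12-13, ..., 0-1
    // iters 2..8: level 12, 10, ..., 0, sId 1..7 -> expand + flood the
    //         matching stack; appendStacks passes leftovers to the next one
    // after the loop, the final expandRegions(..., stack, true) call
    //         sweeps the whole grid for anything still unassigned
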
diff --git a/dep/recastnavigation/TODO.txt b/dep/recastnavigation/TODO.txt
deleted file mode 100644
index b911c0e4720..00000000000
--- a/dep/recastnavigation/TODO.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-TODO/Roadmap
-
-Summer/Autumn 2009
-
-- Off mesh links (jump links)
-- Area annotations
-- Embed extra data per polygon
-- Height conforming navmesh
-
-
-Autumn/Winter 2009/2010
-
-- Detour path following
-- More dynamic example with tile navmesh
-- Faster small tile process
-
-
-More info at http://digestingduck.blogspot.com/2009/07/recast-and-detour-roadmap.html
-
--
diff --git a/dep/recastnavigation/recast_hotfix1.diff b/dep/recastnavigation/recast_hotfix1.diff
deleted file mode 100644
index d370b0a68c8..00000000000
--- a/dep/recastnavigation/recast_hotfix1.diff
+++ /dev/null
@@ -1,13 +0,0 @@
-diff --git a/dep/recastnavigation/Detour/DetourNavMesh.h b/dep/recastnavigation/Detour/DetourNavMesh.h
-index 52d2c50..99e30c7 100644
---- a/dep/recastnavigation/Detour/DetourNavMesh.h
-+++ b/dep/recastnavigation/Detour/DetourNavMesh.h
-@@ -21,7 +21,7 @@
-
- #include "DetourAlloc.h"
-
--#ifdef WIN32
-+#if defined(WIN32) && !defined(__MINGW32__)
- typedef unsigned __int64 uint64;
- #else
- #include <stdint.h>
diff --git a/dep/recastnavigation/recastnavigation.diff b/dep/recastnavigation/recastnavigation.diff
new file mode 100644
index 00000000000..68e976c955e
--- /dev/null
+++ b/dep/recastnavigation/recastnavigation.diff
@@ -0,0 +1,4066 @@
+ Detour/Include/DetourNavMesh.h | 76 +-
+ Detour/Include/DetourNavMeshQuery.h | 0
+ Detour/Include/DetourNode.h | 0
+ Detour/Source/DetourCommon.cpp | 4 +-
+ Detour/Source/DetourNavMesh.cpp | 32 +-
+ Detour/Source/DetourNavMeshBuilder.cpp | 6 +-
+ Detour/Source/DetourNavMeshQuery.cpp | 9 +-
+ Detour/Source/DetourNode.cpp | 29 +-
+ DetourCrowd/Include/DetourObstacleAvoidance.h | 0
+ DetourCrowd/Source/DetourObstacleAvoidance.cpp | 6 +-
+ Recast/Include/Recast.h | 8 +-
+ Recast/Source/RecastContour.cpp | 0
+ Recast/Source/RecastLayers.cpp | 0
+ Recast/Source/RecastMesh.cpp | 0
+ Recast/Source/RecastMeshDetail.cpp | 0
+ Recast/Source/RecastRegion.cpp | 0
+ RecastDemo/Contrib/stb_truetype.h | 3612 ++++++++++++------------
+ RecastDemo/Include/NavMeshPruneTool.h | 0
+ 18 files changed, 1865 insertions(+), 1917 deletions(-)
+
+diff --git a/Detour/Include/DetourNavMesh.h b/Detour/Include/DetourNavMesh.h
+index 95a63e4..cdd473f 100644
+--- a/Detour/Include/DetourNavMesh.h
++++ b/Detour/Include/DetourNavMesh.h
+@@ -22,39 +22,35 @@
+ #include "DetourAlloc.h"
+ #include "DetourStatus.h"
+
+-// Undefine (or define in a build cofnig) the following line to use 64bit polyref.
+-// Generally not needed, useful for very large worlds.
+-// Note: tiles build using 32bit refs are not compatible with 64bit refs!
+-//#define DT_POLYREF64 1
+-
+-#ifdef DT_POLYREF64
+-// TODO: figure out a multiplatform version of uint64_t
+-// - maybe: https://code.google.com/p/msinttypes/
+-// - or: http://www.azillionmonkeys.com/qed/pstdint.h
++
++// Edited by TC
++#if defined(WIN32) && !defined(__MINGW32__)
++/// Do not rename back to uint64. Otherwise mac complains about typedef redefinition
++typedef unsigned __int64 uint64_d;
++#else
+ #include <stdint.h>
++#ifndef uint64_t
++#ifdef __linux__
++#include <linux/types.h>
++#endif
+ #endif
++/// Do not rename back to uint64. Otherwise mac complains about typedef redefinition
++typedef uint64_t uint64_d;
++#endif
+
+ // Note: If you want to use 64-bit refs, change the types of both dtPolyRef & dtTileRef.
+ // It is also recommended that you change dtHashRef() to a proper 64-bit hash.
+
++// Edited by TC
++// We cannot have over 31 bits for either tile or poly
++// without changing polyCount to use 64 bits too.
+ /// A handle to a polygon within a navigation mesh tile.
+ /// @ingroup detour
+-#ifdef DT_POLYREF64
+-static const unsigned int DT_SALT_BITS = 16;
+-static const unsigned int DT_TILE_BITS = 28;
+-static const unsigned int DT_POLY_BITS = 20;
+-typedef uint64_t dtPolyRef;
+-#else
+-typedef unsigned int dtPolyRef;
+-#endif
++typedef uint64_d dtPolyRef; // Edited by TC
+
+ /// A handle to a tile within a navigation mesh.
+ /// @ingroup detour
+-#ifdef DT_POLYREF64
+-typedef uint64_t dtTileRef;
+-#else
+-typedef unsigned int dtTileRef;
+-#endif
++typedef uint64_d dtTileRef; // Edited by TC
+
+ /// The maximum number of vertices per navigation polygon.
+ /// @ingroup detour
+@@ -94,6 +90,12 @@ static const unsigned int DT_OFFMESH_CON_BIDIR = 1;
+ /// @ingroup detour
+ static const int DT_MAX_AREAS = 64;
+
++static const int STATIC_SALT_BITS = 12;
++static const int STATIC_TILE_BITS = 21;
++static const int STATIC_POLY_BITS = 31;
++// we cannot have over 31 bits for either tile or poly
++// without changing polyCount to use 64 bits too.
++
+ /// Tile flags used for various functions and fields.
+ /// For an example, see dtNavMesh::addTile().
+ enum dtTileFlags
+@@ -492,11 +494,7 @@ public:
+ /// @param[in] ip The index of the polygon within the tile.
+ inline dtPolyRef encodePolyId(unsigned int salt, unsigned int it, unsigned int ip) const
+ {
+-#ifdef DT_POLYREF64
+- return ((dtPolyRef)salt << (DT_POLY_BITS+DT_TILE_BITS)) | ((dtPolyRef)it << DT_POLY_BITS) | (dtPolyRef)ip;
+-#else
+ return ((dtPolyRef)salt << (m_polyBits+m_tileBits)) | ((dtPolyRef)it << m_polyBits) | (dtPolyRef)ip;
+-#endif
+ }
+
+ /// Decodes a standard polygon reference.
+@@ -508,21 +506,12 @@ public:
+ /// @see #encodePolyId
+ inline void decodePolyId(dtPolyRef ref, unsigned int& salt, unsigned int& it, unsigned int& ip) const
+ {
+-#ifdef DT_POLYREF64
+- const dtPolyRef saltMask = ((dtPolyRef)1<<DT_SALT_BITS)-1;
+- const dtPolyRef tileMask = ((dtPolyRef)1<<DT_TILE_BITS)-1;
+- const dtPolyRef polyMask = ((dtPolyRef)1<<DT_POLY_BITS)-1;
+- salt = (unsigned int)((ref >> (DT_POLY_BITS+DT_TILE_BITS)) & saltMask);
+- it = (unsigned int)((ref >> DT_POLY_BITS) & tileMask);
+- ip = (unsigned int)(ref & polyMask);
+-#else
+ const dtPolyRef saltMask = ((dtPolyRef)1<<m_saltBits)-1;
+ const dtPolyRef tileMask = ((dtPolyRef)1<<m_tileBits)-1;
+ const dtPolyRef polyMask = ((dtPolyRef)1<<m_polyBits)-1;
+ salt = (unsigned int)((ref >> (m_polyBits+m_tileBits)) & saltMask);
+ it = (unsigned int)((ref >> m_polyBits) & tileMask);
+ ip = (unsigned int)(ref & polyMask);
+-#endif
+ }
+
+ /// Extracts a tile's salt value from the specified polygon reference.
+@@ -531,13 +520,8 @@ public:
+ /// @see #encodePolyId
+ inline unsigned int decodePolyIdSalt(dtPolyRef ref) const
+ {
+-#ifdef DT_POLYREF64
+- const dtPolyRef saltMask = ((dtPolyRef)1<<DT_SALT_BITS)-1;
+- return (unsigned int)((ref >> (DT_POLY_BITS+DT_TILE_BITS)) & saltMask);
+-#else
+ const dtPolyRef saltMask = ((dtPolyRef)1<<m_saltBits)-1;
+ return (unsigned int)((ref >> (m_polyBits+m_tileBits)) & saltMask);
+-#endif
+ }
+
+ /// Extracts the tile's index from the specified polygon reference.
+@@ -546,13 +530,8 @@ public:
+ /// @see #encodePolyId
+ inline unsigned int decodePolyIdTile(dtPolyRef ref) const
+ {
+-#ifdef DT_POLYREF64
+- const dtPolyRef tileMask = ((dtPolyRef)1<<DT_TILE_BITS)-1;
+- return (unsigned int)((ref >> DT_POLY_BITS) & tileMask);
+-#else
+ const dtPolyRef tileMask = ((dtPolyRef)1<<m_tileBits)-1;
+ return (unsigned int)((ref >> m_polyBits) & tileMask);
+-#endif
+ }
+
+ /// Extracts the polygon's index (within its tile) from the specified polygon reference.
+@@ -561,13 +540,8 @@ public:
+ /// @see #encodePolyId
+ inline unsigned int decodePolyIdPoly(dtPolyRef ref) const
+ {
+-#ifdef DT_POLYREF64
+- const dtPolyRef polyMask = ((dtPolyRef)1<<DT_POLY_BITS)-1;
+- return (unsigned int)(ref & polyMask);
+-#else
+ const dtPolyRef polyMask = ((dtPolyRef)1<<m_polyBits)-1;
+ return (unsigned int)(ref & polyMask);
+-#endif
+ }
+
+ /// @}
+@@ -626,11 +600,9 @@ private:
+ dtMeshTile* m_nextFree; ///< Freelist of tiles.
+ dtMeshTile* m_tiles; ///< List of tiles.
+
+-#ifndef DT_POLYREF64
+ unsigned int m_saltBits; ///< Number of salt bits in the tile ID.
+ unsigned int m_tileBits; ///< Number of tile bits in the tile ID.
+ unsigned int m_polyBits; ///< Number of poly bits in the tile ID.
+-#endif
+ };
+
+ /// Allocates a navigation mesh object using the Detour allocator.
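
With the fixed layout above, a reference always packs salt | tile | poly into 12 + 21 + 31 = 64 bits, so encodePolyId reduces to fixed shifts. Worked out (values hypothetical):

    // salt << (31 + 21) | tile << 31 | poly
    uint64_d ref = ((uint64_d)salt << 52) | ((uint64_d)it << 31) | (uint64_d)ip;
    // capacity: 2^21 (~2.1M) tiles, 2^31 polys per tile, and 2^12 salt
    // generations before a recycled tile slot can alias an old ref
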
+diff --git a/Detour/Source/DetourCommon.cpp b/Detour/Source/DetourCommon.cpp
+index a98d8c8..b5700f5 100644
+--- a/Detour/Source/DetourCommon.cpp
++++ b/Detour/Source/DetourCommon.cpp
+@@ -16,14 +16,14 @@
+ // 3. This notice may not be removed or altered from any source distribution.
+ //
+
++#include <math.h>
+ #include "DetourCommon.h"
+-#include "DetourMath.h"
+
+ //////////////////////////////////////////////////////////////////////////////////////////
+
+ float dtSqrt(float x)
+ {
+- return dtMathSqrtf(x);
++ return sqrtf(x);
+ }
+
+ void dtClosestPtPointTriangle(float* closest, const float* p,
+diff --git a/Detour/Source/DetourNavMesh.cpp b/Detour/Source/DetourNavMesh.cpp
+index 9d627be..5174050 100644
+--- a/Detour/Source/DetourNavMesh.cpp
++++ b/Detour/Source/DetourNavMesh.cpp
+@@ -16,13 +16,13 @@
+ // 3. This notice may not be removed or altered from any source distribution.
+ //
+
++#include <math.h>
+ #include <float.h>
+ #include <string.h>
+ #include <stdio.h>
+ #include "DetourNavMesh.h"
+ #include "DetourNode.h"
+ #include "DetourCommon.h"
+-#include "DetourMath.h"
+ #include "DetourAlloc.h"
+ #include "DetourAssert.h"
+ #include <new>
+@@ -193,13 +193,11 @@ dtNavMesh::dtNavMesh() :
+ m_tileLutMask(0),
+ m_posLookup(0),
+ m_nextFree(0),
+- m_tiles(0)
++ m_tiles(0),
++ m_saltBits(0),
++ m_tileBits(0),
++ m_polyBits(0)
+ {
+-#ifndef DT_POLYREF64
+- m_saltBits = 0;
+- m_tileBits = 0;
+- m_polyBits = 0;
+-#endif
+ memset(&m_params, 0, sizeof(dtNavMeshParams));
+ m_orig[0] = 0;
+ m_orig[1] = 0;
+@@ -250,17 +248,11 @@ dtStatus dtNavMesh::init(const dtNavMeshParams* params)
+ m_nextFree = &m_tiles[i];
+ }
+
+- // Init ID generator values.
+-#ifndef DT_POLYREF64
+- m_tileBits = dtIlog2(dtNextPow2((unsigned int)params->maxTiles));
+- m_polyBits = dtIlog2(dtNextPow2((unsigned int)params->maxPolys));
+- // Only allow 31 salt bits, since the salt mask is calculated using 32bit uint and it will overflow.
+- m_saltBits = dtMin((unsigned int)31, 32 - m_tileBits - m_polyBits);
+-
+- if (m_saltBits < 10)
+- return DT_FAILURE | DT_INVALID_PARAM;
+-#endif
+-
++ // Edited by TC
++ m_tileBits = STATIC_TILE_BITS;
++ m_polyBits = STATIC_POLY_BITS;
++ m_saltBits = STATIC_SALT_BITS;
++
+ return DT_SUCCESS;
+ }
+
+@@ -1242,11 +1234,7 @@ dtStatus dtNavMesh::removeTile(dtTileRef ref, unsigned char** data, int* dataSiz
+ tile->offMeshCons = 0;
+
+ // Update salt, salt should never be zero.
+-#ifdef DT_POLYREF64
+- tile->salt = (tile->salt+1) & ((1<<DT_SALT_BITS)-1);
+-#else
+ tile->salt = (tile->salt+1) & ((1<<m_saltBits)-1);
+-#endif
+ if (tile->salt == 0)
+ tile->salt++;
+
+diff --git a/Detour/Source/DetourNavMeshBuilder.cpp b/Detour/Source/DetourNavMeshBuilder.cpp
+index 1bf271b..9d8471b 100644
+--- a/Detour/Source/DetourNavMeshBuilder.cpp
++++ b/Detour/Source/DetourNavMeshBuilder.cpp
+@@ -16,13 +16,13 @@
+ // 3. This notice may not be removed or altered from any source distribution.
+ //
+
++#include <math.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <string.h>
+ #include <float.h>
+ #include "DetourNavMesh.h"
+ #include "DetourCommon.h"
+-#include "DetourMath.h"
+ #include "DetourNavMeshBuilder.h"
+ #include "DetourAlloc.h"
+ #include "DetourAssert.h"
+@@ -202,8 +202,8 @@ static int createBVTree(const unsigned short* verts, const int /*nverts*/,
+ if (z > it.bmax[2]) it.bmax[2] = z;
+ }
+ // Remap y
+- it.bmin[1] = (unsigned short)dtMathFloorf((float)it.bmin[1]*ch/cs);
+- it.bmax[1] = (unsigned short)dtMathCeilf((float)it.bmax[1]*ch/cs);
++ it.bmin[1] = (unsigned short)floorf((float)it.bmin[1]*ch/cs);
++ it.bmax[1] = (unsigned short)ceilf((float)it.bmax[1]*ch/cs);
+ }
+
+ int curNode = 0;
+diff --git a/Detour/Source/DetourNavMeshQuery.cpp b/Detour/Source/DetourNavMeshQuery.cpp
+index 2e30464..f1709df 100644
+--- a/Detour/Source/DetourNavMeshQuery.cpp
++++ b/Detour/Source/DetourNavMeshQuery.cpp
+@@ -16,13 +16,13 @@
+ // 3. This notice may not be removed or altered from any source distribution.
+ //
+
++#include <math.h>
+ #include <float.h>
+ #include <string.h>
+ #include "DetourNavMeshQuery.h"
+ #include "DetourNavMesh.h"
+ #include "DetourNode.h"
+ #include "DetourCommon.h"
+-#include "DetourMath.h"
+ #include "DetourAlloc.h"
+ #include "DetourAssert.h"
+ #include <new>
+@@ -99,8 +99,9 @@ inline float dtQueryFilter::getCost(const float* pa, const float* pb,
+ return dtVdist(pa, pb) * m_areaCost[curPoly->getArea()];
+ }
+ #endif
+-
+-static const float H_SCALE = 0.999f; // Search heuristic scale.
++
++// Edited by TC
++static const float H_SCALE = 2.0f; // Search heuristic scale.
+
+
+ dtNavMeshQuery* dtAllocNavMeshQuery()
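
Raising H_SCALE from 0.999 to 2.0 turns the A* inside the query into weighted A*: the heuristic now overestimates the remaining cost, which prunes the search aggressively at the price of possibly non-optimal paths. Schematically (illustration, not patch code):

    // f(n) = g(n) + H_SCALE * h(n); H_SCALE > 1 makes h inadmissible,
    // trading path optimality for fewer expanded nodes
    const float heuristic = dtVdist(nodePos, endPos) * H_SCALE;
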
+@@ -3305,7 +3306,7 @@ dtStatus dtNavMeshQuery::findDistanceToWall(dtPolyRef startRef, const float* cen
+ dtVsub(hitNormal, centerPos, hitPos);
+ dtVnormalize(hitNormal);
+
+- *hitDist = dtMathSqrtf(radiusSqr);
++ *hitDist = sqrtf(radiusSqr);
+
+ return status;
+ }
+diff --git a/Detour/Source/DetourNode.cpp b/Detour/Source/DetourNode.cpp
+index 57cb206..4c8215e 100644
+--- a/Detour/Source/DetourNode.cpp
++++ b/Detour/Source/DetourNode.cpp
+@@ -22,30 +22,17 @@
+ #include "DetourCommon.h"
+ #include <string.h>
+
+-#ifdef DT_POLYREF64
+-// From Thomas Wang, https://gist.github.com/badboy/6267743
+ inline unsigned int dtHashRef(dtPolyRef a)
+ {
+- a = (~a) + (a << 18); // a = (a << 18) - a - 1;
+- a = a ^ (a >> 31);
+- a = a * 21; // a = (a + (a << 2)) + (a << 4);
+- a = a ^ (a >> 11);
+- a = a + (a << 6);
+- a = a ^ (a >> 22);
+- return (unsigned int)a;
++ // Edited by TC
++ a = (~a) + (a << 18);
++ a = a ^ (a >> 31);
++ a = a * 21;
++ a = a ^ (a >> 11);
++ a = a + (a << 6);
++ a = a ^ (a >> 22);
++ return (unsigned int)a;
+ }
+-#else
+-inline unsigned int dtHashRef(dtPolyRef a)
+-{
+- a += ~(a<<15);
+- a ^= (a>>10);
+- a += (a<<3);
+- a ^= (a>>6);
+- a += ~(a<<11);
+- a ^= (a>>16);
+- return (unsigned int)a;
+-}
+-#endif
+
+ //////////////////////////////////////////////////////////////////////////////////////////
+ dtNodePool::dtNodePool(int maxNodes, int hashSize) :
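
The retained Wang-style mixer folds the (now 64-bit) ref down to 32 well-distributed bits; the node pool presumably masks that against its power-of-two table size to pick a bucket, along the lines of:

    unsigned int bucket = dtHashRef(ref) & (m_hashSize - 1);  // assumed call site
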
+diff --git a/DetourCrowd/Source/DetourObstacleAvoidance.cpp b/DetourCrowd/Source/DetourObstacleAvoidance.cpp
+index 0fad9ef..d3f90b7 100644
+--- a/DetourCrowd/Source/DetourObstacleAvoidance.cpp
++++ b/DetourCrowd/Source/DetourObstacleAvoidance.cpp
+@@ -18,10 +18,10 @@
+
+ #include "DetourObstacleAvoidance.h"
+ #include "DetourCommon.h"
+-#include "DetourMath.h"
+ #include "DetourAlloc.h"
+ #include "DetourAssert.h"
+ #include <string.h>
++#include <math.h>
+ #include <float.h>
+ #include <new>
+
+@@ -58,7 +58,7 @@ static int isectRaySeg(const float* ap, const float* u,
+ dtVsub(v,bq,bp);
+ dtVsub(w,ap,bp);
+ float d = dtVperp2D(u,v);
+- if (dtMathFabs(d) < 1e-6f) return 0;
++ if (fabsf(d) < 1e-6f) return 0;
+ d = 1.0f/d;
+ t = dtVperp2D(v,w) * d;
+ if (t < 0 || t > 1) return 0;
+@@ -482,7 +482,7 @@ int dtObstacleAvoidanceQuery::sampleVelocityAdaptive(const float* pos, const flo
+ const int nd = dtClamp(ndivs, 1, DT_MAX_PATTERN_DIVS);
+ const int nr = dtClamp(nrings, 1, DT_MAX_PATTERN_RINGS);
+ const float da = (1.0f/nd) * DT_PI*2;
+- const float dang = dtMathAtan2f(dvel[2], dvel[0]);
++ const float dang = atan2f(dvel[2], dvel[0]);
+
+ // Always add sample at zero
+ pat[npat*2+0] = 0;
+diff --git a/Recast/Include/Recast.h b/Recast/Include/Recast.h
+index 336837e..3f4ae96 100644
+--- a/Recast/Include/Recast.h
++++ b/Recast/Include/Recast.h
+@@ -243,7 +243,7 @@ struct rcConfig
+ };
+
+ /// Defines the number of bits allocated to rcSpan::smin and rcSpan::smax.
+-static const int RC_SPAN_HEIGHT_BITS = 13;
++static const int RC_SPAN_HEIGHT_BITS = 16; // EDITED BY TC
+ /// Defines the maximum value for rcSpan::smin and rcSpan::smax.
+ static const int RC_SPAN_MAX_HEIGHT = (1<<RC_SPAN_HEIGHT_BITS)-1;
+
+@@ -255,9 +255,9 @@ static const int RC_SPANS_PER_POOL = 2048;
+ /// @see rcHeightfield
+ struct rcSpan
+ {
+- unsigned int smin : 13; ///< The lower limit of the span. [Limit: < #smax]
+- unsigned int smax : 13; ///< The upper limit of the span. [Limit: <= #RC_SPAN_MAX_HEIGHT]
+- unsigned int area : 6; ///< The area id assigned to the span.
++ unsigned int smin : 16; ///< The lower limit of the span. [Limit: < #smax]
++ unsigned int smax : 16; ///< The upper limit of the span. [Limit: <= #RC_SPAN_MAX_HEIGHT]
++ unsigned char area; ///< The area id assigned to the span.
+ rcSpan* next; ///< The next span higher up in column.
+ };
+
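Widening the span bit-fields changes both the height ceiling and the struct packing; under the edited header the numbers work out as follows (illustration only):

    // RC_SPAN_MAX_HEIGHT = (1 << 16) - 1 = 65535 (was (1 << 13) - 1 = 8191)
    // smin + smax now fill a 32-bit word (16 + 16), so area can no longer
    // share it (13 + 13 + 6 packed exactly before) and moves out of the
    // bit-field into a plain unsigned char
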
+diff --git a/RecastDemo/Contrib/stb_truetype.h b/RecastDemo/Contrib/stb_truetype.h
+index fd72578..92dc8c2 100644
+--- a/RecastDemo/Contrib/stb_truetype.h
++++ b/RecastDemo/Contrib/stb_truetype.h
+@@ -1,1806 +1,1806 @@
+-// stb_truetype.h - v0.3 - public domain - 2009 Sean Barrett / RAD Game Tools
+-//
+-// This library processes TrueType files:
+-// parse files
+-// extract glyph metrics
+-// extract glyph shapes
+-// render glyphs to one-channel bitmaps with antialiasing (box filter)
+-//
+-// Todo:
+-// non-MS cmaps
+-// crashproof on bad data
+-// hinting
+-// subpixel positioning when rendering bitmap
+-// cleartype-style AA
+-//
+-// ADDITIONAL CONTRIBUTORS
+-//
+-// Mikko Mononen: compound shape support, more cmap formats
+-//
+-// VERSIONS
+-//
+-// 0.3 (2009-06-24) cmap fmt=12, compound shapes (MM)
+-// userdata, malloc-from-userdata, non-zero fill (STB)
+-// 0.2 (2009-03-11) Fix unsigned/signed char warnings
+-// 0.1 (2009-03-09) First public release
+-//
+-// USAGE
+-//
+-// Include this file in whatever places neeed to refer to it. In ONE C/C++
+-// file, write:
+-// #define STB_TRUETYPE_IMPLEMENTATION
+-// before the #include of this file. This expands out the actual
+-// implementation into that C/C++ file.
+-//
+-// Look at the header-file sections below for the API, but here's a quick skim:
+-//
+-// Simple 3D API (don't ship this, but it's fine for tools and quick start,
+-// and you can cut and paste from it to move to more advanced)
+-// stbtt_BakeFontBitmap() -- bake a font to a bitmap for use as texture
+-// stbtt_GetBakedQuad() -- compute quad to draw for a given char
+-//
+-// "Load" a font file from a memory buffer (you have to keep the buffer loaded)
+-// stbtt_InitFont()
+-// stbtt_GetFontOffsetForIndex() -- use for TTC font collections
+-//
+-// Render a unicode codepoint to a bitmap
+-// stbtt_GetCodepointBitmap() -- allocates and returns a bitmap
+-// stbtt_MakeCodepointBitmap() -- renders into bitmap you provide
+-// stbtt_GetCodepointBitmapBox() -- how big the bitmap must be
+-//
+-// Character advance/positioning
+-// stbtt_GetCodepointHMetrics()
+-// stbtt_GetFontVMetrics()
+-//
+-// NOTES
+-//
+-// The system uses the raw data found in the .ttf file without changing it
+-// and without building auxiliary data structures. This is a bit inefficient
+-// on little-endian systems (the data is big-endian), but assuming you're
+-// caching the bitmaps or glyph shapes this shouldn't be a big deal.
+-//
+-// It appears to be very hard to programmatically determine what font a
+-// given file is in a general way. I provide an API for this, but I don't
+-// recommend it.
+-//
+-//
+-// SOURCE STATISTICS (based on v0.3, 1800 LOC)
+-//
+-// Documentation & header file 350 LOC \___ 500 LOC documentation
+-// Sample code 140 LOC /
+-// Truetype parsing 580 LOC ---- 600 LOC TrueType
+-// Software rasterization 240 LOC \ .
+-// Curve tesselation 120 LOC \__ 500 LOC Bitmap creation
+-// Bitmap management 70 LOC /
+-// Baked bitmap interface 70 LOC /
+-// Font name matching & access 150 LOC ---- 150
+-// C runtime library abstraction 60 LOC ---- 60
+-
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//////////////////////////////////////////////////////////////////////////////
+-////
+-//// SAMPLE PROGRAMS
+-////
+-//
+-// Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless
+-//
+-#if 0
+-#define STB_TRUETYPE_IMPLEMENTATION // force following include to generate implementation
+-#include "stb_truetype.h"
+-
+-char ttf_buffer[1<<20];
+-unsigned char temp_bitmap[512*512];
+-
+-stbtt_chardata cdata[96]; // ASCII 32..126 is 95 glyphs
+-GLstbtt_uint ftex;
+-
+-void my_stbtt_initfont(void)
+-{
+- fread(ttf_buffer, 1, 1<<20, fopen("c:/windows/fonts/times.ttf", "rb"));
+- stbtt_BakeFontBitmap(data,0, 32.0, temp_bitmap,512,512, 32,96, cdata); // no guarantee this fits!
+- // can free ttf_buffer at this point
+- glGenTextures(1, &ftex);
+- glBindTexture(GL_TEXTURE_2D, ftex);
+- glTexImage2D(GL_TEXTURE_2D, 0, GL_ALPHA, 512,512, 0, GL_ALPHA, GL_UNSIGNED_BYTE, temp_bitmap);
+- // can free temp_bitmap at this point
+- glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
+-}
+-
+-void my_stbtt_print(float x, float y, char *text)
+-{
+- // assume orthographic projection with units = screen pixels, origin at top left
+- glBindTexture(GL_TEXTURE_2D, ftex);
+- glBegin(GL_QUADS);
+- while (*text) {
+- if (*text >= 32 && *text < 128) {
+- stbtt_aligned_quad q;
+- stbtt_GetBakedQuad(cdata, 512,512, *text-32, &x,&y,&q,1);//1=opengl,0=old d3d
+- glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y0);
+- glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y0);
+- glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y1);
+- glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y1);
+- }
+- ++text;
+- }
+- glEnd();
+-}
+-#endif
+-//
+-//
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// Complete program (this compiles): get a single bitmap, print as ASCII art
+-//
+-#if 0
+-#include <stdio.h>
+-#define STB_TRUETYPE_IMPLEMENTATION // force following include to generate implementation
+-#include "stb_truetype.h"
+-
+-char ttf_buffer[1<<25];
+-
+-int main(int argc, char **argv)
+-{
+- stbtt_fontinfo font;
+- unsigned char *bitmap;
+- int w,h,i,j,c = (argc > 1 ? atoi(argv[1]) : 'a'), s = (argc > 2 ? atoi(argv[2]) : 20);
+-
+- fread(ttf_buffer, 1, 1<<25, fopen(argc > 3 ? argv[3] : "c:/windows/fonts/arialbd.ttf", "rb"));
+-
+- stbtt_InitFont(&font, ttf_buffer, stbtt_GetFontOffsetForIndex(ttf_buffer,0));
+- bitmap = stbtt_GetCodepointBitmap(&font, 0,stbtt_ScaleForPixelHeight(&font, s), c, &w, &h, 0,0);
+-
+- for (j=0; j < h; ++j) {
+- for (i=0; i < w; ++i)
+- putchar(" .:ioVM@"[bitmap[j*w+i]>>5]);
+- putchar('\n');
+- }
+- return 0;
+-}
+-#endif
+-//
+-// Output:
+-//
+-// .ii.
+-// @@@@@@.
+-// V@Mio@@o
+-// :i. V@V
+-// :oM@@M
+-// :@@@MM@M
+-// @@o o@M
+-// :@@. M@M
+-// @@@o@@@@
+-// :M@@V:@@.
+-//
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// Complete program: print "Hello World!" banner, with bugs
+-//
+-#if 0
+-int main(int arg, char **argv)
+-{
+- unsigned char screen[20][79];
+- int i,j, pos=0;
+- float scale;
+- char *text = "Heljo World!";
+-
+- fread(buffer, 1, 1000000, fopen("c:/windows/fonts/arialbd.ttf", "rb"));
+- stbtt_InitFont(&font, buffer, 0);
+-
+- scale = stbtt_ScaleForPixelHeight(&font, 16);
+- memset(screen, 0, sizeof(screen));
+-
+- while (*text) {
+- int advance,lsb,x0,y0,x1,y1, newpos, baseline=13;
+- stbtt_GetCodepointHMetrics(&font, *text, &advance, &lsb);
+- stbtt_GetCodepointBitmapBox(&font, *text, scale,scale, &x0,&y0,&x1,&y1);
+- newpos = pos + (int) (lsb * scale) + x0;
+- stbtt_MakeCodepointBitmap(&font, &screen[baseline + y0][newpos], x1-x0,y1-y0, 79, scale,scale, *text);
+- // note that this stomps the old data, so where character boxes overlap (e.g. 'lj') it's wrong
+- // because this API is really for baking character bitmaps into textures
+- pos += (int) (advance * scale);
+- ++text;
+- }
+-
+- for (j=0; j < 20; ++j) {
+- for (i=0; i < 79; ++i)
+- putchar(" .:ioVM@"[screen[j][i]>>5]);
+- putchar('\n');
+- }
+-
+- return 0;
+-}
+-#endif
+-
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//////////////////////////////////////////////////////////////////////////////
+-////
+-//// INTEGRATION WITH RUNTIME LIBRARIES
+-////
+-
+-#ifdef STB_TRUETYPE_IMPLEMENTATION
+- // #define your own (u)stbtt_int8/16/32 before including to override this
+- #ifndef stbtt_uint8
+- typedef unsigned char stbtt_uint8;
+- typedef signed char stbtt_int8;
+- typedef unsigned short stbtt_uint16;
+- typedef signed short stbtt_int16;
+- typedef unsigned int stbtt_uint32;
+- typedef signed int stbtt_int32;
+- #endif
+-
+- typedef char stbtt__check_size32[sizeof(stbtt_int32)==4 ? 1 : -1];
+- typedef char stbtt__check_size16[sizeof(stbtt_int16)==2 ? 1 : -1];
+-
+- // #define your own STBTT_sort() to override this to avoid qsort
+- #ifndef STBTT_sort
+- #include <stdlib.h>
+- #define STBTT_sort(data,num_items,item_size,compare_func) qsort(data,num_items,item_size,compare_func)
+- #endif
+-
+- // #define your own STBTT_ifloor/STBTT_iceil() to avoid math.h
+- #ifndef STBTT_ifloor
+- #include <math.h>
+- #define STBTT_ifloor(x) ((int) floor(x))
+- #define STBTT_iceil(x) ((int) ceil(x))
+- #endif
+-
+- // #define your own functions "STBTT_malloc" / "STBTT_free" to avoid malloc.h
+- #ifndef STBTT_malloc
+- #include <malloc.h>
+- #define STBTT_malloc(x,u) malloc(x)
+- #define STBTT_free(x,u) free(x)
+- #endif
+-
+- #ifndef STBTT_assert
+- #include <assert.h>
+- #define STBTT_assert(x) assert(x)
+- #endif
+-
+- #ifndef STBTT_strlen
+- #include <string.h>
+- #define STBTT_strlen(x) strlen(x)
+- #endif
+-
+- #ifndef STBTT_memcpy
+- #include <memory.h>
+- #define STBTT_memcpy memcpy
+- #define STBTT_memset memset
+- #endif
+-#endif
+-
+-///////////////////////////////////////////////////////////////////////////////
+-///////////////////////////////////////////////////////////////////////////////
+-////
+-//// INTERFACE
+-////
+-////
+-
+-#ifndef __STB_INCLUDE_STB_TRUETYPE_H__
+-#define __STB_INCLUDE_STB_TRUETYPE_H__
+-
+-#ifdef __cplusplus
+-extern "C" {
+-#endif
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// TEXTURE BAKING API
+-//
+-// If you use this API, you only have to call two functions ever.
+-//
+-
+-typedef struct
+-{
+- unsigned short x0,y0,x1,y1; // coordinates of bbox in bitmap
+- float xoff,yoff,xadvance;
+-} stbtt_bakedchar;
+-
+-extern int stbtt_BakeFontBitmap(const unsigned char *data, int offset, // font location (use offset=0 for plain .ttf)
+- float pixel_height, // height of font in pixels
+- unsigned char *pixels, int pw, int ph, // bitmap to be filled in
+- int first_char, int num_chars, // characters to bake
+- stbtt_bakedchar *chardata); // you allocate this, it's num_chars long
+-// if return is positive, the first unused row of the bitmap
+-// if return is negative, returns the negative of the number of characters that fit
+-// if return is 0, no characters fit and no rows were used
+-// This uses a very crappy packing.
+-
+-typedef struct
+-{
+- float x0,y0,s0,t0; // top-left
+- float x1,y1,s1,t1; // bottom-right
+-} stbtt_aligned_quad;
+-
+-extern void stbtt_GetBakedQuad(stbtt_bakedchar *chardata, int pw, int ph, // same data as above
+- int char_index, // character to display
+- float *xpos, float *ypos, // pointers to current position in screen pixel space
+- stbtt_aligned_quad *q, // output: quad to draw
+- int opengl_fillrule); // true if opengl fill rule; false if DX9 or earlier
+-// Call GetBakedQuad with char_index = 'character - first_char', and it
+-// creates the quad you need to draw and advances the current position.
+-// It's inefficient; you might want to c&p it and optimize it.
+-
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// FONT LOADING
+-//
+-//
+-
+-extern int stbtt_GetFontOffsetForIndex(const unsigned char *data, int index);
+-// Each .ttf file may have more than one font. Each has a sequential index
+-// number starting from 0. Call this function to get the font offset for a
+-// given index; it returns -1 if the index is out of range. A regular .ttf
+-// file will only define one font and it always be at offset 0, so it will
+-// return '0' for index 0, and -1 for all other indices. You can just skip
+-// this step if you know it's that kind of font.
+-
+-
+-// The following structure is defined publically so you can declare one on
+-// the stack or as a global or etc.
+-typedef struct
+-{
+- void *userdata;
+- unsigned char *data; // pointer to .ttf file
+- int fontstart; // offset of start of font
+-
+- int numGlyphs; // number of glyphs, needed for range checking
+-
+- int loca,head,glyf,hhea,hmtx; // table locations as offset from start of .ttf
+- int index_map; // a cmap mapping for our chosen character encoding
+- int indexToLocFormat; // format needed to map from glyph index to glyph
+-} stbtt_fontinfo;
+-
+-extern int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data, int offset);
+-// Given an offset into the file that defines a font, this function builds
+-// the necessary cached info for the rest of the system. You must allocate
+-// the stbtt_fontinfo yourself, and stbtt_InitFont will fill it out. You don't
+-// need to do anything special to free it, because the contents are a pure
+-// cache with no additional data structures. Returns 0 on failure.
+-
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// CHARACTER TO GLYPH-INDEX CONVERSIOn
+-
+-int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codepoint);
+-// If you're going to perform multiple operations on the same character
+-// and you want a speed-up, call this function with the character you're
+-// going to process, then use glyph-based functions instead of the
+-// codepoint-based functions.
+-
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// CHARACTER PROPERTIES
+-//
+-
+-extern float stbtt_ScaleForPixelHeight(const stbtt_fontinfo *info, float pixels);
+-// computes a scale factor to produce a font whose "height" is 'pixels' tall.
+-// Height is measured as the distance from the highest ascender to the lowest
+-// descender; in other words, it's equivalent to calling stbtt_GetFontVMetrics
+-// and computing:
+-// scale = pixels / (ascent - descent)
+-// so if you prefer to measure height by the ascent only, use a similar calculation.
+-
+-extern void stbtt_GetFontVMetrics(const stbtt_fontinfo *info, int *ascent, int *descent, int *lineGap);
+-// ascent is the coordinate above the baseline the font extends; descent
+-// is the coordinate below the baseline the font extends (i.e. it is typically negative)
+-// lineGap is the spacing between one row's descent and the next row's ascent...
+-// so you should advance the vertical position by "*ascent - *descent + *lineGap"
+-// these are expressed in unscaled coordinates
+-
+-extern void stbtt_GetCodepointHMetrics(const stbtt_fontinfo *info, int codepoint, int *advanceWidth, int *leftSideBearing);
+-// leftSideBearing is the offset from the current horizontal position to the left edge of the character
+-// advanceWidth is the offset from the current horizontal position to the next horizontal position
+-// these are expressed in unscaled coordinates
+-
+-extern int stbtt_GetCodepointKernAdvance(const stbtt_fontinfo *info, int ch1, int ch2);
+-// an additional amount to add to the 'advance' value between ch1 and ch2
+-// @TODO; for now always returns 0!
+-
+-extern int stbtt_GetCodepointBox(const stbtt_fontinfo *info, int codepoint, int *x0, int *y0, int *x1, int *y1);
+-// Gets the bounding box of the visible part of the glyph, in unscaled coordinates
+-
+-extern void stbtt_GetGlyphHMetrics(const stbtt_fontinfo *info, int glyph_index, int *advanceWidth, int *leftSideBearing);
+-extern int stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2);
+-extern int stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1);
+-// as above, but takes one or more glyph indices for greater efficiency
+-
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// GLYPH SHAPES (you probably don't need these, but they have to go before
+-// the bitmaps for C declaration-order reasons)
+-//
+-
+-#ifndef STBTT_vmove // you can predefine these to use different values (but why?)
+- enum {
+- STBTT_vmove=1,
+- STBTT_vline,
+- STBTT_vcurve
+- };
+-#endif
+-
+-#ifndef stbtt_vertex // you can predefine this to use different values
+- // (we share this with other code at RAD)
+- #define stbtt_vertex_type short // can't use stbtt_int16 because that's not visible in the header file
+- typedef struct
+- {
+- stbtt_vertex_type x,y,cx,cy;
+- unsigned char type,padding;
+- } stbtt_vertex;
+-#endif
+-
+-extern int stbtt_GetCodepointShape(const stbtt_fontinfo *info, int unicode_codepoint, stbtt_vertex **vertices);
+-extern int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **vertices);
+-// returns # of vertices and fills *vertices with the pointer to them
+-// these are expressed in "unscaled" coordinates
+-
+-extern void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *vertices);
+-// frees the data allocated above
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// BITMAP RENDERING
+-//
+-
+-extern void stbtt_FreeBitmap(unsigned char *bitmap, void *userdata);
+-// frees the bitmap allocated below
+-
+-extern unsigned char *stbtt_GetCodepointBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int codepoint, int *width, int *height, int *xoff, int *yoff);
+-// allocates a large-enough single-channel 8bpp bitmap and renders the
+-// specified character/glyph at the specified scale into it, with
+-// antialiasing. 0 is no coverage (transparent), 255 is fully covered (opaque).
+-// *width & *height are filled out with the width & height of the bitmap,
+-// which is stored left-to-right, top-to-bottom.
+-//
+-// xoff/yoff are the offset it pixel space from the glyph origin to the top-left of the bitmap
+-
+-extern void stbtt_MakeCodepointBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int codepoint);
+-// the same as above, but you pass in storage for the bitmap in the form
+-// of 'output', with row spacing of 'out_stride' bytes. the bitmap is
+-// clipped to out_w/out_h bytes. call the next function to get the
+-// height and width and positioning info
+-
+-extern void stbtt_GetCodepointBitmapBox(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
+-// get the bbox of the bitmap centered around the glyph origin; so the
+-// bitmap width is ix1-ix0, height is iy1-iy0, and location to place
+-// the bitmap top left is (leftSideBearing*scale,iy0).
+-// (Note that the bitmap uses y-increases-down, but the shape uses
+-// y-increases-up, so CodepointBitmapBox and CodepointBox are inverted.)
+-
+-extern unsigned char *stbtt_GetGlyphBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int glyph, int *width, int *height, int *xoff, int *yoff);
+-extern void stbtt_GetGlyphBitmapBox(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
+-extern void stbtt_MakeGlyphBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int glyph);
+-
+-//extern void stbtt_get_true_bbox(stbtt_vertex *vertices, int num_verts, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
+-
+-// @TODO: don't expose this structure
+-typedef struct
+-{
+- int w,h,stride;
+- unsigned char *pixels;
+-} stbtt__bitmap;
+-
+-extern void stbtt_Rasterize(stbtt__bitmap *result, float flatness_in_pixels, stbtt_vertex *vertices, int num_verts, float scale_x, float scale_y, int x_off, int y_off, int invert, void *userdata);
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// Finding the right font...
+-//
+-// You should really just solve this offline, keep your own tables
+-// of what font is what, and don't try to get it out of the .ttf file.
+-// That's because getting it out of the .ttf file is really hard, because
+-// the names in the file can appear in many possible encodings, in many
+-// possible languages, and e.g. if you need a case-insensitive comparison,
+-// the details of that depend on the encoding & language in a complex way
+-// (actually underspecified in truetype, but also gigantic).
+-//
+-// But you can use the provided functions in two possible ways:
+-// stbtt_FindMatchingFont() will use *case-sensitive* comparisons on
+-// unicode-encoded names to try to find the font you want;
+-// you can run this before calling stbtt_InitFont()
+-//
+-// stbtt_GetFontNameString() lets you get any of the various strings
+-// from the file yourself and do your own comparisons on them.
+-// You have to have called stbtt_InitFont() first.
+-
+-
+-extern int stbtt_FindMatchingFont(const unsigned char *fontdata, const char *name, int flags);
+-// returns the offset (not index) of the font that matches, or -1 if none
+-// if you use STBTT_MACSTYLE_DONTCARE, use a font name like "Arial Bold".
+-// if you use any other flag, use a font name like "Arial"; this checks
+-// the 'macStyle' header field; i don't know if fonts set this consistently
+-#define STBTT_MACSTYLE_DONTCARE 0
+-#define STBTT_MACSTYLE_BOLD 1
+-#define STBTT_MACSTYLE_ITALIC 2
+-#define STBTT_MACSTYLE_UNDERSCORE 4
+-#define STBTT_MACSTYLE_NONE 8 // <= not same as 0, this makes us check the bitfield is 0
+-
+-extern int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const char *s2, int len2);
+-// returns 1/0 whether the first string interpreted as utf8 is identical to
+-// the second string interpreted as big-endian utf16... useful for strings from next func
+-
+-extern char *stbtt_GetFontNameString(const stbtt_fontinfo *font, int *length, int platformID, int encodingID, int languageID, int nameID);
+-// returns the string (which may be big-endian double byte, e.g. for unicode)
+-// and puts the length in bytes in *length.
+-//
+-// some of the values for the IDs are below; for more see the truetype spec:
+-// http://developer.apple.com/textfonts/TTRefMan/RM06/Chap6name.html
+-// http://www.microsoft.com/typography/otspec/name.htm
+-
+-enum { // platformID
+- STBTT_PLATFORM_ID_UNICODE =0,
+- STBTT_PLATFORM_ID_MAC =1,
+- STBTT_PLATFORM_ID_ISO =2,
+- STBTT_PLATFORM_ID_MICROSOFT =3
+-};
+-
+-enum { // encodingID for STBTT_PLATFORM_ID_UNICODE
+- STBTT_UNICODE_EID_UNICODE_1_0 =0,
+- STBTT_UNICODE_EID_UNICODE_1_1 =1,
+- STBTT_UNICODE_EID_ISO_10646 =2,
+- STBTT_UNICODE_EID_UNICODE_2_0_BMP=3,
+- STBTT_UNICODE_EID_UNICODE_2_0_FULL=4,
+-};
+-
+-enum { // encodingID for STBTT_PLATFORM_ID_MICROSOFT
+- STBTT_MS_EID_SYMBOL =0,
+- STBTT_MS_EID_UNICODE_BMP =1,
+- STBTT_MS_EID_SHIFTJIS =2,
+- STBTT_MS_EID_UNICODE_FULL =10,
+-};
+-
+-enum { // encodingID for STBTT_PLATFORM_ID_MAC; same as Script Manager codes
+- STBTT_MAC_EID_ROMAN =0, STBTT_MAC_EID_ARABIC =4,
+- STBTT_MAC_EID_JAPANESE =1, STBTT_MAC_EID_HEBREW =5,
+- STBTT_MAC_EID_CHINESE_TRAD =2, STBTT_MAC_EID_GREEK =6,
+- STBTT_MAC_EID_KOREAN =3, STBTT_MAC_EID_RUSSIAN =7,
+-};
+-
+-enum { // languageID for STBTT_PLATFORM_ID_MICROSOFT; same as LCID...
+- // problematic because there are e.g. 16 english LCIDs and 16 arabic LCIDs
+- STBTT_MS_LANG_ENGLISH =0x0409, STBTT_MS_LANG_ITALIAN =0x0410,
+- STBTT_MS_LANG_CHINESE =0x0804, STBTT_MS_LANG_JAPANESE =0x0411,
+- STBTT_MS_LANG_DUTCH =0x0413, STBTT_MS_LANG_KOREAN =0x0412,
+- STBTT_MS_LANG_FRENCH =0x040c, STBTT_MS_LANG_RUSSIAN =0x0419,
+- STBTT_MS_LANG_GERMAN =0x0407, STBTT_MS_LANG_SPANISH =0x0409,
+- STBTT_MS_LANG_HEBREW =0x040d, STBTT_MS_LANG_SWEDISH =0x041D,
+-};
+-
+-enum { // languageID for STBTT_PLATFORM_ID_MAC
+- STBTT_MAC_LANG_ENGLISH =0 , STBTT_MAC_LANG_JAPANESE =11,
+- STBTT_MAC_LANG_ARABIC =12, STBTT_MAC_LANG_KOREAN =23,
+- STBTT_MAC_LANG_DUTCH =4 , STBTT_MAC_LANG_RUSSIAN =32,
+- STBTT_MAC_LANG_FRENCH =1 , STBTT_MAC_LANG_SPANISH =6 ,
+- STBTT_MAC_LANG_GERMAN =2 , STBTT_MAC_LANG_SWEDISH =5 ,
+- STBTT_MAC_LANG_HEBREW =10, STBTT_MAC_LANG_CHINESE_SIMPLIFIED =33,
+- STBTT_MAC_LANG_ITALIAN =3 , STBTT_MAC_LANG_CHINESE_TRAD =19,
+-};
+-
+-#ifdef __cplusplus
+-}
+-#endif
+-
+-#endif // __STB_INCLUDE_STB_TRUETYPE_H__
+-
+-///////////////////////////////////////////////////////////////////////////////
+-///////////////////////////////////////////////////////////////////////////////
+-////
+-//// IMPLEMENTATION
+-////
+-////
+-
+-#ifdef STB_TRUETYPE_IMPLEMENTATION
+-
+-//////////////////////////////////////////////////////////////////////////
+-//
+-// accessors to parse data from file
+-//
+-
+-// on platforms that don't allow misaligned reads, if we want to allow
+-// truetype fonts that aren't padded to alignment, define ALLOW_UNALIGNED_TRUETYPE
+-
+-#define ttBYTE(p) (* (stbtt_uint8 *) (p))
+-#define ttCHAR(p) (* (stbtt_int8 *) (p))
+-#define ttFixed(p) ttLONG(p)
+-
+-#if defined(STB_TRUETYPE_BIGENDIAN) && !defined(ALLOW_UNALIGNED_TRUETYPE)
+-
+- #define ttUSHORT(p) (* (stbtt_uint16 *) (p))
+- #define ttSHORT(p) (* (stbtt_int16 *) (p))
+- #define ttULONG(p) (* (stbtt_uint32 *) (p))
+- #define ttLONG(p) (* (stbtt_int32 *) (p))
+-
+-#else
+-
+- stbtt_uint16 ttUSHORT(const stbtt_uint8 *p) { return p[0]*256 + p[1]; }
+- stbtt_int16 ttSHORT(const stbtt_uint8 *p) { return p[0]*256 + p[1]; }
+- stbtt_uint32 ttULONG(const stbtt_uint8 *p) { return (p[0]<<24) + (p[1]<<16) + (p[2]<<8) + p[3]; }
+- stbtt_int32 ttLONG(const stbtt_uint8 *p) { return (p[0]<<24) + (p[1]<<16) + (p[2]<<8) + p[3]; }
+-
+-#endif
+-
+-#define stbtt_tag4(p,c0,c1,c2,c3) ((p)[0] == (c0) && (p)[1] == (c1) && (p)[2] == (c2) && (p)[3] == (c3))
+-#define stbtt_tag(p,str) stbtt_tag4(p,str[0],str[1],str[2],str[3])
+-
+-static int stbtt__isfont(const stbtt_uint8 *font)
+-{
+- // check the version number
+- if (stbtt_tag(font, "1")) return 1; // TrueType 1
+- if (stbtt_tag(font, "typ1")) return 1; // TrueType with type 1 font -- we don't support this!
+- if (stbtt_tag(font, "OTTO")) return 1; // OpenType with CFF
+- if (stbtt_tag4(font, 0,1,0,0)) return 1; // OpenType 1.0
+- return 0;
+-}
+-
+-// @OPTIMIZE: binary search
+-static stbtt_uint32 stbtt__find_table(stbtt_uint8 *data, stbtt_uint32 fontstart, const char *tag)
+-{
+- stbtt_int32 num_tables = ttUSHORT(data+fontstart+4);
+- stbtt_uint32 tabledir = fontstart + 12;
+- stbtt_int32 i;
+- for (i=0; i < num_tables; ++i) {
+- stbtt_uint32 loc = tabledir + 16*i;
+- if (stbtt_tag(data+loc+0, tag))
+- return ttULONG(data+loc+8);
+- }
+- return 0;
+-}
+-
+-int stbtt_GetFontOffsetForIndex(const unsigned char *font_collection, int index)
+-{
+- // if it's just a font, there's only one valid index
+- if (stbtt__isfont(font_collection))
+- return index == 0 ? 0 : -1;
+-
+- // check if it's a TTC
+- if (stbtt_tag(font_collection, "ttcf")) {
+- // version 1?
+- if (ttULONG(font_collection+4) == 0x00010000 || ttULONG(font_collection+4) == 0x00020000) {
+- stbtt_int32 n = ttLONG(font_collection+8);
+- if (index >= n)
+- return -1;
+- return ttULONG(font_collection+12+index*14);
+- }
+- }
+- return -1;
+-}
+-
+-int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data2, int fontstart)
+-{
+- stbtt_uint8 *data = (stbtt_uint8 *) data2;
+- stbtt_uint32 cmap, t;
+- stbtt_int32 i,numTables;
+-
+- info->data = data;
+- info->fontstart = fontstart;
+-
+- cmap = stbtt__find_table(data, fontstart, "cmap");
+- info->loca = stbtt__find_table(data, fontstart, "loca");
+- info->head = stbtt__find_table(data, fontstart, "head");
+- info->glyf = stbtt__find_table(data, fontstart, "glyf");
+- info->hhea = stbtt__find_table(data, fontstart, "hhea");
+- info->hmtx = stbtt__find_table(data, fontstart, "hmtx");
+- if (!cmap || !info->loca || !info->head || !info->glyf || !info->hhea || !info->hmtx)
+- return 0;
+-
+- t = stbtt__find_table(data, fontstart, "maxp");
+- if (t)
+- info->numGlyphs = ttUSHORT(data+t+4);
+- else
+- info->numGlyphs = 0xffff;
+-
+- // find a cmap encoding table we understand *now* to avoid searching
+- // later. (todo: could make this installable)
+- // the same regardless of glyph.
+- numTables = ttUSHORT(data + cmap + 2);
+- info->index_map = 0;
+- for (i=0; i < numTables; ++i) {
+- stbtt_uint32 encoding_record = cmap + 4 + 8 * i;
+- // find an encoding we understand:
+- switch(ttUSHORT(data+encoding_record)) {
+- case STBTT_PLATFORM_ID_MICROSOFT:
+- switch (ttUSHORT(data+encoding_record+2)) {
+- case STBTT_MS_EID_UNICODE_BMP:
+- case STBTT_MS_EID_UNICODE_FULL:
+- // MS/Unicode
+- info->index_map = cmap + ttULONG(data+encoding_record+4);
+- break;
+- }
+- break;
+- }
+- }
+- if (info->index_map == 0)
+- return 0;
+-
+- info->indexToLocFormat = ttUSHORT(data+info->head + 50);
+- return 1;
+-}
+-
+-int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codepoint)
+-{
+- stbtt_uint8 *data = info->data;
+- stbtt_uint32 index_map = info->index_map;
+-
+- stbtt_uint16 format = ttUSHORT(data + index_map + 0);
+- if (format == 0) { // apple byte encoding
+- stbtt_int32 bytes = ttUSHORT(data + index_map + 2);
+- if (unicode_codepoint < bytes-6)
+- return ttBYTE(data + index_map + 6 + unicode_codepoint);
+- return 0;
+- } else if (format == 6) {
+- stbtt_uint32 first = ttUSHORT(data + index_map + 6);
+- stbtt_uint32 count = ttUSHORT(data + index_map + 8);
+- if ((stbtt_uint32) unicode_codepoint >= first && (stbtt_uint32) unicode_codepoint < first+count)
+- return ttUSHORT(data + index_map + 10 + (unicode_codepoint - first)*2);
+- return 0;
+- } else if (format == 2) {
+- STBTT_assert(0); // @TODO: high-byte mapping for japanese/chinese/korean
+- return 0;
+- } else if (format == 4) { // standard mapping for windows fonts: binary search collection of ranges
+- stbtt_uint16 segcount = ttUSHORT(data+index_map+6) >> 1;
+- stbtt_uint16 searchRange = ttUSHORT(data+index_map+8) >> 1;
+- stbtt_uint16 entrySelector = ttUSHORT(data+index_map+10);
+- stbtt_uint16 rangeShift = ttUSHORT(data+index_map+12) >> 1;
+- stbtt_uint16 item, offset, start, end;
+-
+- // do a binary search of the segments
+- stbtt_uint32 endCount = index_map + 14;
+- stbtt_uint32 search = endCount;
+-
+- if (unicode_codepoint > 0xffff)
+- return 0;
+-
+- // they lie from endCount .. endCount + segCount
+- // but searchRange is the nearest power of two, so...
+- if (unicode_codepoint >= ttUSHORT(data + search + rangeShift*2))
+- search += rangeShift*2;
+-
+- // now decrement to bias correctly to find smallest
+- search -= 2;
+- while (entrySelector) {
+- stbtt_uint16 start, end;
+- searchRange >>= 1;
+- start = ttUSHORT(data + search + 2 + segcount*2 + 2);
+- end = ttUSHORT(data + search + 2);
+- start = ttUSHORT(data + search + searchRange*2 + segcount*2 + 2);
+- end = ttUSHORT(data + search + searchRange*2);
+- if (unicode_codepoint > end)
+- search += searchRange*2;
+- --entrySelector;
+- }
+- search += 2;
+-
+- item = (stbtt_uint16) ((search - endCount) >> 1);
+-
+- STBTT_assert(unicode_codepoint <= ttUSHORT(data + endCount + 2*item));
+- start = ttUSHORT(data + index_map + 14 + segcount*2 + 2 + 2*item);
+- end = ttUSHORT(data + index_map + 14 + 2 + 2*item);
+- if (unicode_codepoint < start)
+- return 0;
+-
+- offset = ttUSHORT(data + index_map + 14 + segcount*6 + 2 + 2*item);
+- if (offset == 0)
+- return unicode_codepoint + ttSHORT(data + index_map + 14 + segcount*4 + 2 + 2*item);
+-
+- return ttUSHORT(data + offset + (unicode_codepoint-start)*2 + index_map + 14 + segcount*6 + 2 + 2*item);
+- } else if (format == 12) {
+- stbtt_uint16 ngroups = ttUSHORT(data+index_map+6);
+- stbtt_int32 low,high;
+- low = 0; high = (stbtt_int32)ngroups;
+- // Binary search the right group.
+- while (low <= high) {
+- stbtt_int32 mid = low + ((high-low) >> 1); // rounds down, so low <= mid < high
+- stbtt_uint32 start_char = ttULONG(data+index_map+16+mid*12);
+- stbtt_uint32 end_char = ttULONG(data+index_map+16+mid*12+4);
+- if ((stbtt_uint32) unicode_codepoint < start_char)
+- high = mid-1;
+- else if ((stbtt_uint32) unicode_codepoint > end_char)
+- low = mid+1;
+- else {
+- stbtt_uint32 start_glyph = ttULONG(data+index_map+16+mid*12+8);
+- return start_glyph + unicode_codepoint-start_char;
+- }
+- }
+- return 0; // not found
+- }
+- // @TODO
+- STBTT_assert(0);
+- return 0;
+-}
+-
+-int stbtt_GetCodepointShape(const stbtt_fontinfo *info, int unicode_codepoint, stbtt_vertex **vertices)
+-{
+- return stbtt_GetGlyphShape(info, stbtt_FindGlyphIndex(info, unicode_codepoint), vertices);
+-}
+-
+-static void stbtt_setvertex(stbtt_vertex *v, stbtt_uint8 type, stbtt_int16 x, stbtt_int16 y, stbtt_int16 cx, stbtt_int16 cy)
+-{
+- v->type = type;
+- v->x = x;
+- v->y = y;
+- v->cx = cx;
+- v->cy = cy;
+-}
+-
+-static int stbtt__GetGlyfOffset(const stbtt_fontinfo *info, int glyph_index)
+-{
+- int g1,g2;
+-
+- if (glyph_index >= info->numGlyphs) return -1; // glyph index out of range
+- if (info->indexToLocFormat >= 2) return -1; // unknown index->glyph map format
+-
+- if (info->indexToLocFormat == 0) {
+- g1 = info->glyf + ttUSHORT(info->data + info->loca + glyph_index * 2) * 2;
+- g2 = info->glyf + ttUSHORT(info->data + info->loca + glyph_index * 2 + 2) * 2;
+- } else {
+- g1 = info->glyf + ttULONG (info->data + info->loca + glyph_index * 4);
+- g2 = info->glyf + ttULONG (info->data + info->loca + glyph_index * 4 + 4);
+- }
+-
+- return g1==g2 ? -1 : g1; // if length is 0, return -1
+-}
+-
+-int stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1)
+-{
+- int g = stbtt__GetGlyfOffset(info, glyph_index);
+- if (g < 0) return 0;
+-
+- if (x0) *x0 = ttSHORT(info->data + g + 2);
+- if (y0) *y0 = ttSHORT(info->data + g + 4);
+- if (x1) *x1 = ttSHORT(info->data + g + 6);
+- if (y1) *y1 = ttSHORT(info->data + g + 8);
+- return 1;
+-}
+-
+-int stbtt_GetCodepointBox(const stbtt_fontinfo *info, int codepoint, int *x0, int *y0, int *x1, int *y1)
+-{
+- return stbtt_GetGlyphBox(info, stbtt_FindGlyphIndex(info,codepoint), x0,y0,x1,y1);
+-}
+-
+-int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **pvertices)
+-{
+- stbtt_int16 numberOfContours;
+- stbtt_uint8 *endPtsOfContours;
+- stbtt_uint8 *data = info->data;
+- stbtt_vertex *vertices=0;
+- int num_vertices=0;
+- int g = stbtt__GetGlyfOffset(info, glyph_index);
+-
+- *pvertices = NULL;
+-
+- if (g < 0) return 0;
+-
+- numberOfContours = ttSHORT(data + g);
+-
+- if (numberOfContours > 0) {
+- stbtt_uint8 flags=0,flagcount;
+- stbtt_int32 ins, i,j=0,m,n, next_move, was_off=0, off;
+- stbtt_int16 x,y,cx,cy,sx,sy;
+- stbtt_uint8 *points;
+- endPtsOfContours = (data + g + 10);
+- ins = ttUSHORT(data + g + 10 + numberOfContours * 2);
+- points = data + g + 10 + numberOfContours * 2 + 2 + ins;
+-
+- n = 1+ttUSHORT(endPtsOfContours + numberOfContours*2-2);
+-
+- m = n + numberOfContours; // a loose bound on how many vertices we might need
+- vertices = (stbtt_vertex *) STBTT_malloc(m * sizeof(vertices[0]), info->userdata);
+- if (vertices == 0)
+- return 0;
+-
+- next_move = 0;
+- flagcount=0;
+-
+- // in first pass, we load uninterpreted data into the allocated array
+- // above, shifted to the end of the array so we won't overwrite it when
+- // we create our final data starting from the front
+-
+- off = m - n; // starting offset for uninterpreted data, regardless of how m ends up being calculated
+-
+- // first load flags
+-
+- for (i=0; i < n; ++i) {
+- if (flagcount == 0) {
+- flags = *points++;
+- if (flags & 8)
+- flagcount = *points++;
+- } else
+- --flagcount;
+- vertices[off+i].type = flags;
+- }
+-
+- // now load x coordinates
+- x=0;
+- for (i=0; i < n; ++i) {
+- flags = vertices[off+i].type;
+- if (flags & 2) {
+- stbtt_int16 dx = *points++;
+- x += (flags & 16) ? dx : -dx; // ???
+- } else {
+- if (!(flags & 16)) {
+- x = x + (stbtt_int16) (points[0]*256 + points[1]);
+- points += 2;
+- }
+- }
+- vertices[off+i].x = x;
+- }
+-
+- // now load y coordinates
+- y=0;
+- for (i=0; i < n; ++i) {
+- flags = vertices[off+i].type;
+- if (flags & 4) {
+- stbtt_int16 dy = *points++;
+- y += (flags & 32) ? dy : -dy; // ???
+- } else {
+- if (!(flags & 32)) {
+- y = y + (stbtt_int16) (points[0]*256 + points[1]);
+- points += 2;
+- }
+- }
+- vertices[off+i].y = y;
+- }
+-
+- // now convert them to our format
+- num_vertices=0;
+- sx = sy = cx = cy = 0;
+- for (i=0; i < n; ++i) {
+- flags = vertices[off+i].type;
+- x = (stbtt_int16) vertices[off+i].x;
+- y = (stbtt_int16) vertices[off+i].y;
+- if (next_move == i) {
+- // when we get to the end, we have to close the shape explicitly
+- if (i != 0) {
+- if (was_off)
+- stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve,sx,sy,cx,cy);
+- else
+- stbtt_setvertex(&vertices[num_vertices++], STBTT_vline,sx,sy,0,0);
+- }
+-
+- // now start the new one
+- stbtt_setvertex(&vertices[num_vertices++], STBTT_vmove,x,y,0,0);
+- next_move = 1 + ttUSHORT(endPtsOfContours+j*2);
+- ++j;
+- was_off = 0;
+- sx = x;
+- sy = y;
+- } else {
+- if (!(flags & 1)) { // if it's a curve
+- if (was_off) // two off-curve control points in a row means interpolate an on-curve midpoint
+- stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve, (cx+x)>>1, (cy+y)>>1, cx, cy);
+- cx = x;
+- cy = y;
+- was_off = 1;
+- } else {
+- if (was_off)
+- stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve, x,y, cx, cy);
+- else
+- stbtt_setvertex(&vertices[num_vertices++], STBTT_vline, x,y,0,0);
+- was_off = 0;
+- }
+- }
+- }
+- if (i != 0) {
+- if (was_off)
+- stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve,sx,sy,cx,cy);
+- else
+- stbtt_setvertex(&vertices[num_vertices++], STBTT_vline,sx,sy,0,0);
+- }
+- } else if (numberOfContours == -1) {
+- // Compound shapes.
+- int more = 1;
+- stbtt_uint8 *comp = data + g + 10;
+- num_vertices = 0;
+- vertices = 0;
+- while (more) {
+- stbtt_uint16 flags, gidx;
+- int comp_num_verts = 0, i;
+- stbtt_vertex *comp_verts = 0, *tmp = 0;
+- float mtx[6] = {1,0,0,1,0,0}, m, n;
+-
+- flags = ttSHORT(comp); comp+=2;
+- gidx = ttSHORT(comp); comp+=2;
+-
+- if (flags & 2) { // XY values
+- if (flags & 1) { // shorts
+- mtx[4] = ttSHORT(comp); comp+=2;
+- mtx[5] = ttSHORT(comp); comp+=2;
+- } else {
+- mtx[4] = ttCHAR(comp); comp+=1;
+- mtx[5] = ttCHAR(comp); comp+=1;
+- }
+- }
+- else {
+- // @TODO handle matching point
+- STBTT_assert(0);
+- }
+- if (flags & (1<<3)) { // WE_HAVE_A_SCALE
+- mtx[0] = mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
+- mtx[1] = mtx[2] = 0;
+- } else if (flags & (1<<6)) { // WE_HAVE_AN_X_AND_YSCALE
+- mtx[0] = ttSHORT(comp)/16384.0f; comp+=2;
+- mtx[1] = mtx[2] = 0;
+- mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
+- } else if (flags & (1<<7)) { // WE_HAVE_A_TWO_BY_TWO
+- mtx[0] = ttSHORT(comp)/16384.0f; comp+=2;
+- mtx[1] = ttSHORT(comp)/16384.0f; comp+=2;
+- mtx[2] = ttSHORT(comp)/16384.0f; comp+=2;
+- mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
+- }
+-
+- // Find transformation scales.
+- m = (float) sqrt(mtx[0]*mtx[0] + mtx[1]*mtx[1]);
+- n = (float) sqrt(mtx[2]*mtx[2] + mtx[3]*mtx[3]);
+-
+- // Get indexed glyph.
+- comp_num_verts = stbtt_GetGlyphShape(info, gidx, &comp_verts);
+- if (comp_num_verts > 0) {
+- // Transform vertices.
+- for (i = 0; i < comp_num_verts; ++i) {
+- stbtt_vertex* v = &comp_verts[i];
+- stbtt_vertex_type x,y;
+- x=v->x; y=v->y;
+- v->x = (stbtt_vertex_type)(m * (mtx[0]*x + mtx[2]*y + mtx[4]));
+- v->y = (stbtt_vertex_type)(n * (mtx[1]*x + mtx[3]*y + mtx[5]));
+- x=v->cx; y=v->cy;
+- v->cx = (stbtt_vertex_type)(m * (mtx[0]*x + mtx[2]*y + mtx[4]));
+- v->cy = (stbtt_vertex_type)(n * (mtx[1]*x + mtx[3]*y + mtx[5]));
+- }
+- // Append vertices.
+- tmp = (stbtt_vertex*)STBTT_malloc((num_vertices+comp_num_verts)*sizeof(stbtt_vertex), info->userdata);
+- if (!tmp) {
+- if (vertices) STBTT_free(vertices, info->userdata);
+- if (comp_verts) STBTT_free(comp_verts, info->userdata);
+- return 0;
+- }
+- if (num_vertices > 0) memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex));
+- memcpy(tmp+num_vertices, comp_verts, comp_num_verts*sizeof(stbtt_vertex));
+- if (vertices) STBTT_free(vertices, info->userdata);
+- vertices = tmp;
+- STBTT_free(comp_verts, info->userdata);
+- num_vertices += comp_num_verts;
+- }
+- // More components ?
+- more = flags & (1<<5);
+- }
+- } else if (numberOfContours < 0) {
+- // @TODO other compound variations?
+- STBTT_assert(0);
+- } else {
+-      // numberOfContours == 0, do nothing
+- }
+-
+- *pvertices = vertices;
+- return num_vertices;
+-}
+-
+-void stbtt_GetGlyphHMetrics(const stbtt_fontinfo *info, int glyph_index, int *advanceWidth, int *leftSideBearing)
+-{
+- stbtt_uint16 numOfLongHorMetrics = ttUSHORT(info->data+info->hhea + 34);
+- if (glyph_index < numOfLongHorMetrics) {
+- if (advanceWidth) *advanceWidth = ttSHORT(info->data + info->hmtx + 4*glyph_index);
+- if (leftSideBearing) *leftSideBearing = ttSHORT(info->data + info->hmtx + 4*glyph_index + 2);
+- } else {
+- if (advanceWidth) *advanceWidth = ttSHORT(info->data + info->hmtx + 4*(numOfLongHorMetrics-1));
+- if (leftSideBearing) *leftSideBearing = ttSHORT(info->data + info->hmtx + 4*numOfLongHorMetrics + 2*(glyph_index - numOfLongHorMetrics));
+- }
+-}
+-
+-int stbtt_GetGlyphKernAdvance(const stbtt_fontinfo * /*info*/, int /*glyph1*/, int /*glyph2*/)
+-{
+- return 0;
+-}
+-
+-int stbtt_GetCodepointKernAdvance(const stbtt_fontinfo * /*info*/, int /*ch1*/, int /*ch2*/)
+-{
+- return 0;
+-}
+-
+-void stbtt_GetCodepointHMetrics(const stbtt_fontinfo *info, int codepoint, int *advanceWidth, int *leftSideBearing)
+-{
+- stbtt_GetGlyphHMetrics(info, stbtt_FindGlyphIndex(info,codepoint), advanceWidth, leftSideBearing);
+-}
+-
+-void stbtt_GetFontVMetrics(const stbtt_fontinfo *info, int *ascent, int *descent, int *lineGap)
+-{
+- if (ascent ) *ascent = ttSHORT(info->data+info->hhea + 4);
+- if (descent) *descent = ttSHORT(info->data+info->hhea + 6);
+- if (lineGap) *lineGap = ttSHORT(info->data+info->hhea + 8);
+-}
+-
+-float stbtt_ScaleForPixelHeight(const stbtt_fontinfo *info, float height)
+-{
+- int fheight = ttSHORT(info->data + info->hhea + 4) - ttSHORT(info->data + info->hhea + 6);
+- return (float) height / fheight;
+-}
+-
+-void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *v)
+-{
+- STBTT_free(v, info->userdata);
+-}
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// antialiasing software rasterizer
+-//
+-
+-void stbtt_GetGlyphBitmapBox(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1)
+-{
+- int x0,y0,x1,y1;
+- if (!stbtt_GetGlyphBox(font, glyph, &x0,&y0,&x1,&y1))
+- x0=y0=x1=y1=0; // e.g. space character
+- // now move to integral bboxes (treating pixels as little squares, what pixels get touched)?
+- if (ix0) *ix0 = STBTT_ifloor(x0 * scale_x);
+- if (iy0) *iy0 = -STBTT_iceil (y1 * scale_y);
+- if (ix1) *ix1 = STBTT_iceil (x1 * scale_x);
+- if (iy1) *iy1 = -STBTT_ifloor(y0 * scale_y);
+-}
+-
+-void stbtt_GetCodepointBitmapBox(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1)
+-{
+- stbtt_GetGlyphBitmapBox(font, stbtt_FindGlyphIndex(font,codepoint), scale_x, scale_y, ix0,iy0,ix1,iy1);
+-}
+-
+-typedef struct stbtt__edge {
+- float x0,y0, x1,y1;
+- int invert;
+-} stbtt__edge;
+-
+-typedef struct stbtt__active_edge
+-{
+- int x,dx;
+- float ey;
+- struct stbtt__active_edge *next;
+- int valid;
+-} stbtt__active_edge;
+-
+-#define FIXSHIFT 10
+-#define FIX (1 << FIXSHIFT)
+-#define FIXMASK (FIX-1)
+-
+-static stbtt__active_edge *new_active(stbtt__edge *e, int off_x, float start_point, void *userdata)
+-{
+- stbtt__active_edge *z = (stbtt__active_edge *) STBTT_malloc(sizeof(*z), userdata); // @TODO: make a pool of these!!!
+- float dxdy = (e->x1 - e->x0) / (e->y1 - e->y0);
+- STBTT_assert(e->y0 <= start_point);
+- if (!z) return z;
+- // round dx down to avoid going too far
+- if (dxdy < 0)
+- z->dx = -STBTT_ifloor(FIX * -dxdy);
+- else
+- z->dx = STBTT_ifloor(FIX * dxdy);
+- z->x = STBTT_ifloor(FIX * (e->x0 + dxdy * (start_point - e->y0)));
+- z->x -= off_x * FIX;
+- z->ey = e->y1;
+- z->next = 0;
+- z->valid = e->invert ? 1 : -1;
+- return z;
+-}
+-
+-// note: this routine clips fills that extend off the edges... ideally this
+-// wouldn't happen, but it could happen if the truetype glyph bounding boxes
+-// are wrong, or if the user supplies a too-small bitmap
+-static void stbtt__fill_active_edges(unsigned char *scanline, int len, stbtt__active_edge *e, int max_weight)
+-{
+- // non-zero winding fill
+- int x0=0, w=0;
+-
+- while (e) {
+- if (w == 0) {
+- // if we're currently at zero, we need to record the edge start point
+- x0 = e->x; w += e->valid;
+- } else {
+- int x1 = e->x; w += e->valid;
+- // if we went to zero, we need to draw
+- if (w == 0) {
+- int i = x0 >> FIXSHIFT;
+- int j = x1 >> FIXSHIFT;
+-
+- if (i < len && j >= 0) {
+- if (i == j) {
+- // x0,x1 are the same pixel, so compute combined coverage
+- scanline[i] = scanline[i] + (stbtt_uint8) ((x1 - x0) * max_weight >> FIXSHIFT);
+- } else {
+- if (i >= 0) // add antialiasing for x0
+- scanline[i] = scanline[i] + (stbtt_uint8) (((FIX - (x0 & FIXMASK)) * max_weight) >> FIXSHIFT);
+- else
+- i = -1; // clip
+-
+- if (j < len) // add antialiasing for x1
+- scanline[j] = scanline[j] + (stbtt_uint8) (((x1 & FIXMASK) * max_weight) >> FIXSHIFT);
+- else
+- j = len; // clip
+-
+- for (++i; i < j; ++i) // fill pixels between x0 and x1
+- scanline[i] = scanline[i] + (stbtt_uint8) max_weight;
+- }
+- }
+- }
+- }
+-
+- e = e->next;
+- }
+-}
+-
+-static void stbtt__rasterize_sorted_edges(stbtt__bitmap *result, stbtt__edge *e, int n, int vsubsample, int off_x, int off_y, void *userdata)
+-{
+- stbtt__active_edge *active = NULL;
+- int y,j=0;
+- int max_weight = (255 / vsubsample); // weight per vertical scanline
+- int s; // vertical subsample index
+- unsigned char scanline_data[512], *scanline;
+-
+- if (result->w > 512)
+- scanline = (unsigned char *) STBTT_malloc(result->w, userdata);
+- else
+- scanline = scanline_data;
+-
+- y = off_y * vsubsample;
+- e[n].y0 = (off_y + result->h) * (float) vsubsample + 1;
+-
+- while (j < result->h) {
+- STBTT_memset(scanline, 0, result->w);
+- for (s=0; s < vsubsample; ++s) {
+- // find center of pixel for this scanline
+- float scan_y = y + 0.5f;
+- stbtt__active_edge **step = &active;
+-
+- // update all active edges;
+- // remove all active edges that terminate before the center of this scanline
+- while (*step) {
+- stbtt__active_edge * z = *step;
+- if (z->ey <= scan_y) {
+- *step = z->next; // delete from list
+- STBTT_assert(z->valid);
+- z->valid = 0;
+- STBTT_free(z, userdata);
+- } else {
+- z->x += z->dx; // advance to position for current scanline
+- step = &((*step)->next); // advance through list
+- }
+- }
+-
+- // resort the list if needed
+- for(;;) {
+- int changed=0;
+- step = &active;
+- while (*step && (*step)->next) {
+- if ((*step)->x > (*step)->next->x) {
+- stbtt__active_edge *t = *step;
+- stbtt__active_edge *q = t->next;
+-
+- t->next = q->next;
+- q->next = t;
+- *step = q;
+- changed = 1;
+- }
+- step = &(*step)->next;
+- }
+- if (!changed) break;
+- }
+-
+- // insert all edges that start before the center of this scanline -- omit ones that also end on this scanline
+- while (e->y0 <= scan_y) {
+- if (e->y1 > scan_y) {
+- stbtt__active_edge *z = new_active(e, off_x, scan_y, userdata);
+- // find insertion point
+- if (active == NULL)
+- active = z;
+- else if (z->x < active->x) {
+- // insert at front
+- z->next = active;
+- active = z;
+- } else {
+- // find thing to insert AFTER
+- stbtt__active_edge *p = active;
+- while (p->next && p->next->x < z->x)
+- p = p->next;
+- // at this point, p->next->x is NOT < z->x
+- z->next = p->next;
+- p->next = z;
+- }
+- }
+- ++e;
+- }
+-
+- // now process all active edges in XOR fashion
+- if (active)
+- stbtt__fill_active_edges(scanline, result->w, active, max_weight);
+-
+- ++y;
+- }
+- STBTT_memcpy(result->pixels + j * result->stride, scanline, result->w);
+- ++j;
+- }
+-
+- while (active) {
+- stbtt__active_edge *z = active;
+- active = active->next;
+- STBTT_free(z, userdata);
+- }
+-
+- if (scanline != scanline_data)
+- STBTT_free(scanline, userdata);
+-}
+-
+-static int stbtt__edge_compare(const void *p, const void *q)
+-{
+- stbtt__edge *a = (stbtt__edge *) p;
+- stbtt__edge *b = (stbtt__edge *) q;
+-
+- if (a->y0 < b->y0) return -1;
+- if (a->y0 > b->y0) return 1;
+- return 0;
+-}
+-
+-typedef struct
+-{
+- float x,y;
+-} stbtt__point;
+-
+-static void stbtt__rasterize(stbtt__bitmap *result, stbtt__point *pts, int *wcount, int windings, float scale_x, float scale_y, int off_x, int off_y, int invert, void *userdata)
+-{
+- float y_scale_inv = invert ? -scale_y : scale_y;
+- stbtt__edge *e;
+- int n,i,j,k,m;
+- int vsubsample = result->h < 8 ? 15 : 5;
+- // vsubsample should divide 255 evenly; otherwise we won't reach full opacity
+-
+- // now we have to blow out the windings into explicit edge lists
+- n = 0;
+- for (i=0; i < windings; ++i)
+- n += wcount[i];
+-
+- e = (stbtt__edge *) STBTT_malloc(sizeof(*e) * (n+1), userdata); // add an extra one as a sentinel
+- if (e == 0) return;
+- n = 0;
+-
+- m=0;
+- for (i=0; i < windings; ++i) {
+- stbtt__point *p = pts + m;
+- m += wcount[i];
+- j = wcount[i]-1;
+- for (k=0; k < wcount[i]; j=k++) {
+- int a=k,b=j;
+- // skip the edge if horizontal
+- if (p[j].y == p[k].y)
+- continue;
+- // add edge from j to k to the list
+- e[n].invert = 0;
+- if (invert ? p[j].y > p[k].y : p[j].y < p[k].y) {
+- e[n].invert = 1;
+- a=j,b=k;
+- }
+- e[n].x0 = p[a].x * scale_x;
+- e[n].y0 = p[a].y * y_scale_inv * vsubsample;
+- e[n].x1 = p[b].x * scale_x;
+- e[n].y1 = p[b].y * y_scale_inv * vsubsample;
+- ++n;
+- }
+- }
+-
+- // now sort the edges by their highest point (should snap to integer, and then by x)
+- STBTT_sort(e, n, sizeof(e[0]), stbtt__edge_compare);
+-
+- // now, traverse the scanlines and find the intersections on each scanline, use xor winding rule
+- stbtt__rasterize_sorted_edges(result, e, n, vsubsample, off_x, off_y, userdata);
+-
+- STBTT_free(e, userdata);
+-}
+-
+-static void stbtt__add_point(stbtt__point *points, int n, float x, float y)
+-{
+- if (!points) return; // during first pass, it's unallocated
+- points[n].x = x;
+- points[n].y = y;
+-}
+-
+-// tesselate until threshold p is happy... @TODO warped to compensate for non-linear stretching
+-static int stbtt__tesselate_curve(stbtt__point *points, int *num_points, float x0, float y0, float x1, float y1, float x2, float y2, float objspace_flatness_squared, int n)
+-{
+- // midpoint
+- float mx = (x0 + 2*x1 + x2)/4;
+- float my = (y0 + 2*y1 + y2)/4;
+- // versus directly drawn line
+- float dx = (x0+x2)/2 - mx;
+- float dy = (y0+y2)/2 - my;
+- if (n > 16) // 65536 segments on one curve better be enough!
+- return 1;
+- if (dx*dx+dy*dy > objspace_flatness_squared) { // half-pixel error allowed... need to be smaller if AA
+- stbtt__tesselate_curve(points, num_points, x0,y0, (x0+x1)/2.0f,(y0+y1)/2.0f, mx,my, objspace_flatness_squared,n+1);
+- stbtt__tesselate_curve(points, num_points, mx,my, (x1+x2)/2.0f,(y1+y2)/2.0f, x2,y2, objspace_flatness_squared,n+1);
+- } else {
+- stbtt__add_point(points, *num_points,x2,y2);
+- *num_points = *num_points+1;
+- }
+- return 1;
+-}
+-
+-// returns the flattened point list; the contour count is stored in *num_contours
+-stbtt__point *stbtt_FlattenCurves(stbtt_vertex *vertices, int num_verts, float objspace_flatness, int **contour_lengths, int *num_contours, void *userdata)
+-{
+- stbtt__point *points=0;
+- int num_points=0;
+-
+- float objspace_flatness_squared = objspace_flatness * objspace_flatness;
+- int i,n=0,start=0, pass;
+-
+- // count how many "moves" there are to get the contour count
+- for (i=0; i < num_verts; ++i)
+- if (vertices[i].type == STBTT_vmove)
+- ++n;
+-
+- *num_contours = n;
+- if (n == 0) return 0;
+-
+- *contour_lengths = (int *) STBTT_malloc(sizeof(**contour_lengths) * n, userdata);
+-
+- if (*contour_lengths == 0) {
+- *num_contours = 0;
+- return 0;
+- }
+-
+- // make two passes through the points so we don't need to realloc
+- for (pass=0; pass < 2; ++pass) {
+- float x=0,y=0;
+- if (pass == 1) {
+- points = (stbtt__point *) STBTT_malloc(num_points * sizeof(points[0]), userdata);
+- if (points == NULL) goto error;
+- }
+- num_points = 0;
+- n= -1;
+- for (i=0; i < num_verts; ++i) {
+- switch (vertices[i].type) {
+- case STBTT_vmove:
+- // start the next contour
+- if (n >= 0)
+- (*contour_lengths)[n] = num_points - start;
+- ++n;
+- start = num_points;
+-
+- x = vertices[i].x, y = vertices[i].y;
+- stbtt__add_point(points, num_points++, x,y);
+- break;
+- case STBTT_vline:
+- x = vertices[i].x, y = vertices[i].y;
+- stbtt__add_point(points, num_points++, x, y);
+- break;
+- case STBTT_vcurve:
+- stbtt__tesselate_curve(points, &num_points, x,y,
+- vertices[i].cx, vertices[i].cy,
+- vertices[i].x, vertices[i].y,
+- objspace_flatness_squared, 0);
+- x = vertices[i].x, y = vertices[i].y;
+- break;
+- }
+- }
+- (*contour_lengths)[n] = num_points - start;
+- }
+-
+- return points;
+-error:
+- STBTT_free(points, userdata);
+- STBTT_free(*contour_lengths, userdata);
+- *contour_lengths = 0;
+- *num_contours = 0;
+- return NULL;
+-}
+-
+-void stbtt_Rasterize(stbtt__bitmap *result, float flatness_in_pixels, stbtt_vertex *vertices, int num_verts, float scale_x, float scale_y, int x_off, int y_off, int invert, void *userdata)
+-{
+- float scale = scale_x > scale_y ? scale_y : scale_x;
+- int winding_count, *winding_lengths;
+- stbtt__point *windings = stbtt_FlattenCurves(vertices, num_verts, flatness_in_pixels / scale, &winding_lengths, &winding_count, userdata);
+- if (windings) {
+- stbtt__rasterize(result, windings, winding_lengths, winding_count, scale_x, scale_y, x_off, y_off, invert, userdata);
+- STBTT_free(winding_lengths, userdata);
+- STBTT_free(windings, userdata);
+- }
+-}
+-
+-void stbtt_FreeBitmap(unsigned char *bitmap, void *userdata)
+-{
+- STBTT_free(bitmap, userdata);
+-}
+-
+-unsigned char *stbtt_GetGlyphBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int glyph, int *width, int *height, int *xoff, int *yoff)
+-{
+- int ix0,iy0,ix1,iy1;
+- stbtt__bitmap gbm;
+- stbtt_vertex *vertices;
+- int num_verts = stbtt_GetGlyphShape(info, glyph, &vertices);
+-
+- if (scale_x == 0) scale_x = scale_y;
+- if (scale_y == 0) {
+- if (scale_x == 0) return NULL;
+- scale_y = scale_x;
+- }
+-
+- stbtt_GetGlyphBitmapBox(info, glyph, scale_x, scale_y, &ix0,&iy0,&ix1,&iy1);
+-
+- // now we get the size
+- gbm.w = (ix1 - ix0);
+- gbm.h = (iy1 - iy0);
+- gbm.pixels = NULL; // in case we error
+-
+- if (width ) *width = gbm.w;
+- if (height) *height = gbm.h;
+- if (xoff ) *xoff = ix0;
+- if (yoff ) *yoff = iy0;
+-
+- if (gbm.w && gbm.h) {
+- gbm.pixels = (unsigned char *) STBTT_malloc(gbm.w * gbm.h, info->userdata);
+- if (gbm.pixels) {
+- gbm.stride = gbm.w;
+-
+- stbtt_Rasterize(&gbm, 0.35f, vertices, num_verts, scale_x, scale_y, ix0, iy0, 1, info->userdata);
+- }
+- }
+- STBTT_free(vertices, info->userdata);
+- return gbm.pixels;
+-}
+-
+-void stbtt_MakeGlyphBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int glyph)
+-{
+- int ix0,iy0;
+- stbtt_vertex *vertices;
+- int num_verts = stbtt_GetGlyphShape(info, glyph, &vertices);
+- stbtt__bitmap gbm;
+-
+- stbtt_GetGlyphBitmapBox(info, glyph, scale_x, scale_y, &ix0,&iy0,0,0);
+- gbm.pixels = output;
+- gbm.w = out_w;
+- gbm.h = out_h;
+- gbm.stride = out_stride;
+-
+- if (gbm.w && gbm.h)
+- stbtt_Rasterize(&gbm, 0.35f, vertices, num_verts, scale_x, scale_y, ix0,iy0, 1, info->userdata);
+-
+- STBTT_free(vertices, info->userdata);
+-}
+-
+-unsigned char *stbtt_GetCodepointBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int codepoint, int *width, int *height, int *xoff, int *yoff)
+-{
+- return stbtt_GetGlyphBitmap(info, scale_x, scale_y, stbtt_FindGlyphIndex(info,codepoint), width,height,xoff,yoff);
+-}
+-
+-void stbtt_MakeCodepointBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int codepoint)
+-{
+- stbtt_MakeGlyphBitmap(info, output, out_w, out_h, out_stride, scale_x, scale_y, stbtt_FindGlyphIndex(info,codepoint));
+-}
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// bitmap baking
+-//
+-// This is SUPER-SHITTY packing to keep source code small
+-
+-extern int stbtt_BakeFontBitmap(const unsigned char *data, int offset, // font location (use offset=0 for plain .ttf)
+- float pixel_height, // height of font in pixels
+- unsigned char *pixels, int pw, int ph, // bitmap to be filled in
+- int first_char, int num_chars, // characters to bake
+- stbtt_bakedchar *chardata)
+-{
+- float scale;
+- int x,y,bottom_y, i;
+- stbtt_fontinfo f;
+- stbtt_InitFont(&f, data, offset);
+- STBTT_memset(pixels, 0, pw*ph); // background of 0 around pixels
+- x=y=1;
+- bottom_y = 1;
+-
+- scale = stbtt_ScaleForPixelHeight(&f, pixel_height);
+-
+- for (i=0; i < num_chars; ++i) {
+- int advance, lsb, x0,y0,x1,y1,gw,gh;
+- int g = stbtt_FindGlyphIndex(&f, first_char + i);
+- stbtt_GetGlyphHMetrics(&f, g, &advance, &lsb);
+- stbtt_GetGlyphBitmapBox(&f, g, scale,scale, &x0,&y0,&x1,&y1);
+- gw = x1-x0;
+- gh = y1-y0;
+- if (x + gw + 1 >= pw)
+- y = bottom_y, x = 1; // advance to next row
+- if (y + gh + 1 >= ph) // check if it fits vertically AFTER potentially moving to next row
+- return -i;
+- STBTT_assert(x+gw < pw);
+- STBTT_assert(y+gh < ph);
+- stbtt_MakeGlyphBitmap(&f, pixels+x+y*pw, gw,gh,pw, scale,scale, g);
+- chardata[i].x0 = (stbtt_int16) x;
+- chardata[i].y0 = (stbtt_int16) y;
+- chardata[i].x1 = (stbtt_int16) (x + gw);
+- chardata[i].y1 = (stbtt_int16) (y + gh);
+- chardata[i].xadvance = scale * advance;
+- chardata[i].xoff = (float) x0;
+- chardata[i].yoff = (float) y0;
+- x = x + gw + 2;
+- if (y+gh+2 > bottom_y)
+- bottom_y = y+gh+2;
+- }
+- return bottom_y;
+-}
+-
+-void stbtt_GetBakedQuad(stbtt_bakedchar *chardata, int pw, int ph, int char_index, float *xpos, float *ypos, stbtt_aligned_quad *q, int opengl_fillrule)
+-{
+- float d3d_bias = opengl_fillrule ? 0 : -0.5f;
+- float ipw = 1.0f / pw, iph = 1.0f / ph;
+- stbtt_bakedchar *b = chardata + char_index;
+- int round_x = STBTT_ifloor((*xpos + b->xoff) + 0.5);
+- int round_y = STBTT_ifloor((*ypos + b->yoff) + 0.5);
+-
+- q->x0 = round_x + d3d_bias;
+- q->y0 = round_y + d3d_bias;
+- q->x1 = round_x + b->x1 - b->x0 + d3d_bias;
+- q->y1 = round_y + b->y1 - b->y0 + d3d_bias;
+-
+- q->s0 = b->x0 * ipw;
+-   q->t0 = b->y0 * iph;
+-   q->s1 = b->x1 * ipw;
+- q->t1 = b->y1 * iph;
+-
+- *xpos += b->xadvance;
+-}
+-
+-//////////////////////////////////////////////////////////////////////////////
+-//
+-// font name matching -- recommended not to use this
+-//
+-
+-// check if a utf8 string contains a prefix which is the utf16 string; if so return length of matching utf8 string
+-static stbtt_int32 stbtt__CompareUTF8toUTF16_bigendian_prefix(stbtt_uint8 *s1, stbtt_int32 len1, stbtt_uint8 *s2, stbtt_int32 len2)
+-{
+- stbtt_int32 i=0;
+-
+- // convert utf16 to utf8 and compare the results while converting
+- while (len2) {
+- stbtt_uint16 ch = s2[0]*256 + s2[1];
+- if (ch < 0x80) {
+- if (i >= len1) return -1;
+- if (s1[i++] != ch) return -1;
+- } else if (ch < 0x800) {
+- if (i+1 >= len1) return -1;
+- if (s1[i++] != 0xc0 + (ch >> 6)) return -1;
+- if (s1[i++] != 0x80 + (ch & 0x3f)) return -1;
+- } else if (ch >= 0xd800 && ch < 0xdc00) {
+- stbtt_uint32 c;
+- stbtt_uint16 ch2 = s2[2]*256 + s2[3];
+- if (i+3 >= len1) return -1;
+- c = ((ch - 0xd800) << 10) + (ch2 - 0xdc00) + 0x10000;
+- if (s1[i++] != 0xf0 + (c >> 18)) return -1;
+- if (s1[i++] != 0x80 + ((c >> 12) & 0x3f)) return -1;
+- if (s1[i++] != 0x80 + ((c >> 6) & 0x3f)) return -1;
+- if (s1[i++] != 0x80 + ((c ) & 0x3f)) return -1;
+- s2 += 2; // plus another 2 below
+- len2 -= 2;
+- } else if (ch >= 0xdc00 && ch < 0xe000) {
+- return -1;
+- } else {
+- if (i+2 >= len1) return -1;
+- if (s1[i++] != 0xe0 + (ch >> 12)) return -1;
+- if (s1[i++] != 0x80 + ((ch >> 6) & 0x3f)) return -1;
+- if (s1[i++] != 0x80 + ((ch ) & 0x3f)) return -1;
+- }
+- s2 += 2;
+- len2 -= 2;
+- }
+- return i;
+-}
+-
+-int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const char *s2, int len2)
+-{
+- return len1 == stbtt__CompareUTF8toUTF16_bigendian_prefix((stbtt_uint8*) s1, len1, (stbtt_uint8*) s2, len2);
+-}
+-
+-// returns results in whatever encoding you request... but note that 2-byte encodings
+-// will be BIG-ENDIAN... use stbtt_CompareUTF8toUTF16_bigendian() to compare
+-char *stbtt_GetFontNameString(const stbtt_fontinfo *font, int *length, int platformID, int encodingID, int languageID, int nameID)
+-{
+- stbtt_int32 i,count,stringOffset;
+- stbtt_uint8 *fc = font->data;
+- stbtt_uint32 offset = font->fontstart;
+- stbtt_uint32 nm = stbtt__find_table(fc, offset, "name");
+- if (!nm) return NULL;
+-
+- count = ttUSHORT(fc+nm+2);
+- stringOffset = nm + ttUSHORT(fc+nm+4);
+- for (i=0; i < count; ++i) {
+- stbtt_uint32 loc = nm + 6 + 12 * i;
+- if (platformID == ttUSHORT(fc+loc+0) && encodingID == ttUSHORT(fc+loc+2)
+- && languageID == ttUSHORT(fc+loc+4) && nameID == ttUSHORT(fc+loc+6)) {
+- *length = ttUSHORT(fc+loc+8);
+- return (char *) (fc+stringOffset+ttUSHORT(fc+loc+10));
+- }
+- }
+- return NULL;
+-}
+-
+-static int stbtt__matchpair(stbtt_uint8 *fc, stbtt_uint32 nm, stbtt_uint8 *name, stbtt_int32 nlen, stbtt_int32 target_id, stbtt_int32 next_id)
+-{
+- stbtt_int32 i;
+- stbtt_int32 count = ttUSHORT(fc+nm+2);
+- stbtt_int32 stringOffset = nm + ttUSHORT(fc+nm+4);
+-
+- for (i=0; i < count; ++i) {
+- stbtt_uint32 loc = nm + 6 + 12 * i;
+- stbtt_int32 id = ttUSHORT(fc+loc+6);
+- if (id == target_id) {
+- // find the encoding
+- stbtt_int32 platform = ttUSHORT(fc+loc+0), encoding = ttUSHORT(fc+loc+2), language = ttUSHORT(fc+loc+4);
+-
+- // is this a Unicode encoding?
+- if (platform == 0 || (platform == 3 && encoding == 1) || (platform == 3 && encoding == 10)) {
+- stbtt_int32 slen = ttUSHORT(fc+loc+8), off = ttUSHORT(fc+loc+10);
+-
+- // check if there's a prefix match
+- stbtt_int32 matchlen = stbtt__CompareUTF8toUTF16_bigendian_prefix(name, nlen, fc+stringOffset+off,slen);
+- if (matchlen >= 0) {
+- // check for target_id+1 immediately following, with same encoding & language
+- if (i+1 < count && ttUSHORT(fc+loc+12+6) == next_id && ttUSHORT(fc+loc+12) == platform && ttUSHORT(fc+loc+12+2) == encoding && ttUSHORT(fc+loc+12+4) == language) {
+- stbtt_int32 slen = ttUSHORT(fc+loc+12+8), off = ttUSHORT(fc+loc+12+10);
+- if (slen == 0) {
+- if (matchlen == nlen)
+- return 1;
+- } else if (matchlen < nlen && name[matchlen] == ' ') {
+- ++matchlen;
+- if (stbtt_CompareUTF8toUTF16_bigendian((char*) (name+matchlen), nlen-matchlen, (char*)(fc+stringOffset+off),slen))
+- return 1;
+- }
+- } else {
+- // if nothing immediately following
+- if (matchlen == nlen)
+- return 1;
+- }
+- }
+- }
+-
+- // @TODO handle other encodings
+- }
+- }
+- return 0;
+-}
+-
+-static int stbtt__matches(stbtt_uint8 *fc, stbtt_uint32 offset, stbtt_uint8 *name, stbtt_int32 flags)
+-{
+- stbtt_int32 nlen = STBTT_strlen((char *) name);
+- stbtt_uint32 nm,hd;
+- if (!stbtt__isfont(fc+offset)) return 0;
+-
+- // check italics/bold/underline flags in macStyle...
+- if (flags) {
+- hd = stbtt__find_table(fc, offset, "head");
+- if ((ttUSHORT(fc+hd+44) & 7) != (flags & 7)) return 0;
+- }
+-
+- nm = stbtt__find_table(fc, offset, "name");
+- if (!nm) return 0;
+-
+- if (flags) {
+- // if we checked the macStyle flags, then just check the family and ignore the subfamily
+- if (stbtt__matchpair(fc, nm, name, nlen, 16, -1)) return 1;
+- if (stbtt__matchpair(fc, nm, name, nlen, 1, -1)) return 1;
+- if (stbtt__matchpair(fc, nm, name, nlen, 3, -1)) return 1;
+- } else {
+- if (stbtt__matchpair(fc, nm, name, nlen, 16, 17)) return 1;
+- if (stbtt__matchpair(fc, nm, name, nlen, 1, 2)) return 1;
+- if (stbtt__matchpair(fc, nm, name, nlen, 3, -1)) return 1;
+- }
+-
+- return 0;
+-}
+-
+-int stbtt_FindMatchingFont(const unsigned char *font_collection, const char *name_utf8, stbtt_int32 flags)
+-{
+- stbtt_int32 i;
+- for (i=0;;++i) {
+- stbtt_int32 off = stbtt_GetFontOffsetForIndex(font_collection, i);
+- if (off < 0) return off;
+- if (stbtt__matches((stbtt_uint8 *) font_collection, off, (stbtt_uint8*) name_utf8, flags))
+- return off;
+- }
+-}
+-
+-#endif // STB_TRUETYPE_IMPLEMENTATION
++// stb_truetype.h - v0.3 - public domain - 2009 Sean Barrett / RAD Game Tools
++//
++// This library processes TrueType files:
++// parse files
++// extract glyph metrics
++// extract glyph shapes
++// render glyphs to one-channel bitmaps with antialiasing (box filter)
++//
++// Todo:
++// non-MS cmaps
++// crashproof on bad data
++// hinting
++// subpixel positioning when rendering bitmap
++// cleartype-style AA
++//
++// ADDITIONAL CONTRIBUTORS
++//
++// Mikko Mononen: compound shape support, more cmap formats
++//
++// VERSIONS
++//
++// 0.3 (2009-06-24) cmap fmt=12, compound shapes (MM)
++// userdata, malloc-from-userdata, non-zero fill (STB)
++// 0.2 (2009-03-11) Fix unsigned/signed char warnings
++// 0.1 (2009-03-09) First public release
++//
++// USAGE
++//
++// Include this file in whatever places need to refer to it. In ONE C/C++
++// file, write:
++// #define STB_TRUETYPE_IMPLEMENTATION
++// before the #include of this file. This expands out the actual
++// implementation into that C/C++ file.
++//
++// Look at the header-file sections below for the API, but here's a quick skim:
++//
++// Simple 3D API (don't ship this, but it's fine for tools and quick start,
++// and you can cut and paste from it to move to more advanced)
++// stbtt_BakeFontBitmap() -- bake a font to a bitmap for use as texture
++// stbtt_GetBakedQuad() -- compute quad to draw for a given char
++//
++// "Load" a font file from a memory buffer (you have to keep the buffer loaded)
++// stbtt_InitFont()
++// stbtt_GetFontOffsetForIndex() -- use for TTC font collections
++//
++// Render a unicode codepoint to a bitmap
++// stbtt_GetCodepointBitmap() -- allocates and returns a bitmap
++// stbtt_MakeCodepointBitmap() -- renders into bitmap you provide
++// stbtt_GetCodepointBitmapBox() -- how big the bitmap must be
++//
++// Character advance/positioning
++// stbtt_GetCodepointHMetrics()
++// stbtt_GetFontVMetrics()
++//
++// NOTES
++//
++// The system uses the raw data found in the .ttf file without changing it
++// and without building auxiliary data structures. This is a bit inefficient
++// on little-endian systems (the data is big-endian), but assuming you're
++// caching the bitmaps or glyph shapes this shouldn't be a big deal.
++//
++// It appears to be very hard to programmatically determine, in a general
++// way, what font a given file contains. I provide an API for this, but I
++// don't recommend it.
++//
++//
++// SOURCE STATISTICS (based on v0.3, 1800 LOC)
++//
++// Documentation & header file 350 LOC \___ 500 LOC documentation
++// Sample code 140 LOC /
++// Truetype parsing 580 LOC ---- 600 LOC TrueType
++// Software rasterization 240 LOC \ .
++// Curve tesselation 120 LOC \__ 500 LOC Bitmap creation
++// Bitmap management 70 LOC /
++// Baked bitmap interface 70 LOC /
++// Font name matching & access 150 LOC ---- 150
++// C runtime library abstraction 60 LOC ---- 60
++
++
++//////////////////////////////////////////////////////////////////////////////
++//////////////////////////////////////////////////////////////////////////////
++////
++//// SAMPLE PROGRAMS
++////
++//
++// Incomplete text-in-3d-api example, which draws quads properly aligned to be lossless
++//
++#if 0
++#define STB_TRUETYPE_IMPLEMENTATION // force following include to generate implementation
++#include "stb_truetype.h"
++
++char ttf_buffer[1<<20];
++unsigned char temp_bitmap[512*512];
++
++stbtt_bakedchar cdata[96]; // ASCII 32..126 is 95 glyphs
++GLuint ftex;
++
++void my_stbtt_initfont(void)
++{
++ fread(ttf_buffer, 1, 1<<20, fopen("c:/windows/fonts/times.ttf", "rb"));
++   stbtt_BakeFontBitmap(ttf_buffer,0, 32.0, temp_bitmap,512,512, 32,96, cdata); // no guarantee this fits!
++ // can free ttf_buffer at this point
++ glGenTextures(1, &ftex);
++ glBindTexture(GL_TEXTURE_2D, ftex);
++ glTexImage2D(GL_TEXTURE_2D, 0, GL_ALPHA, 512,512, 0, GL_ALPHA, GL_UNSIGNED_BYTE, temp_bitmap);
++ // can free temp_bitmap at this point
++ glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
++}
++
++void my_stbtt_print(float x, float y, char *text)
++{
++ // assume orthographic projection with units = screen pixels, origin at top left
++ glBindTexture(GL_TEXTURE_2D, ftex);
++ glBegin(GL_QUADS);
++ while (*text) {
++ if (*text >= 32 && *text < 128) {
++ stbtt_aligned_quad q;
++ stbtt_GetBakedQuad(cdata, 512,512, *text-32, &x,&y,&q,1);//1=opengl,0=old d3d
++ glTexCoord2f(q.s0,q.t1); glVertex2f(q.x0,q.y0);
++ glTexCoord2f(q.s1,q.t1); glVertex2f(q.x1,q.y0);
++ glTexCoord2f(q.s1,q.t0); glVertex2f(q.x1,q.y1);
++ glTexCoord2f(q.s0,q.t0); glVertex2f(q.x0,q.y1);
++ }
++ ++text;
++ }
++ glEnd();
++}
++#endif
++//
++//
++//////////////////////////////////////////////////////////////////////////////
++//
++// Complete program (this compiles): get a single bitmap, print as ASCII art
++//
++#if 0
++#include <stdio.h>
++#define STB_TRUETYPE_IMPLEMENTATION // force following include to generate implementation
++#include "stb_truetype.h"
++
++char ttf_buffer[1<<25];
++
++int main(int argc, char **argv)
++{
++ stbtt_fontinfo font;
++ unsigned char *bitmap;
++ int w,h,i,j,c = (argc > 1 ? atoi(argv[1]) : 'a'), s = (argc > 2 ? atoi(argv[2]) : 20);
++
++ fread(ttf_buffer, 1, 1<<25, fopen(argc > 3 ? argv[3] : "c:/windows/fonts/arialbd.ttf", "rb"));
++
++ stbtt_InitFont(&font, ttf_buffer, stbtt_GetFontOffsetForIndex(ttf_buffer,0));
++ bitmap = stbtt_GetCodepointBitmap(&font, 0,stbtt_ScaleForPixelHeight(&font, s), c, &w, &h, 0,0);
++
++ for (j=0; j < h; ++j) {
++ for (i=0; i < w; ++i)
++ putchar(" .:ioVM@"[bitmap[j*w+i]>>5]);
++ putchar('\n');
++ }
++ return 0;
++}
++#endif
++//
++// Output:
++//
++// .ii.
++// @@@@@@.
++// V@Mio@@o
++// :i. V@V
++// :oM@@M
++// :@@@MM@M
++// @@o o@M
++// :@@. M@M
++// @@@o@@@@
++// :M@@V:@@.
++//
++//////////////////////////////////////////////////////////////////////////////
++//
++// Complete program: print "Hello World!" banner, with bugs
++//
++#if 0
++int main(int argc, char **argv)
++{
++   static unsigned char buffer[1000000];
++   stbtt_fontinfo font;
++   unsigned char screen[20][79];
++   int i,j, pos=0;
++   float scale;
++   char *text = "Heljo World!";
++
++   fread(buffer, 1, 1000000, fopen("c:/windows/fonts/arialbd.ttf", "rb"));
++   stbtt_InitFont(&font, buffer, 0);
++
++ scale = stbtt_ScaleForPixelHeight(&font, 16);
++ memset(screen, 0, sizeof(screen));
++
++ while (*text) {
++ int advance,lsb,x0,y0,x1,y1, newpos, baseline=13;
++ stbtt_GetCodepointHMetrics(&font, *text, &advance, &lsb);
++ stbtt_GetCodepointBitmapBox(&font, *text, scale,scale, &x0,&y0,&x1,&y1);
++ newpos = pos + (int) (lsb * scale) + x0;
++ stbtt_MakeCodepointBitmap(&font, &screen[baseline + y0][newpos], x1-x0,y1-y0, 79, scale,scale, *text);
++ // note that this stomps the old data, so where character boxes overlap (e.g. 'lj') it's wrong
++ // because this API is really for baking character bitmaps into textures
++ pos += (int) (advance * scale);
++ ++text;
++ }
++
++ for (j=0; j < 20; ++j) {
++ for (i=0; i < 79; ++i)
++ putchar(" .:ioVM@"[screen[j][i]>>5]);
++ putchar('\n');
++ }
++
++ return 0;
++}
++#endif
++
++
++//////////////////////////////////////////////////////////////////////////////
++//////////////////////////////////////////////////////////////////////////////
++////
++//// INTEGRATION WITH RUNTIME LIBRARIES
++////
++
++#ifdef STB_TRUETYPE_IMPLEMENTATION
++ // #define your own (u)stbtt_int8/16/32 before including to override this
++ #ifndef stbtt_uint8
++ typedef unsigned char stbtt_uint8;
++ typedef signed char stbtt_int8;
++ typedef unsigned short stbtt_uint16;
++ typedef signed short stbtt_int16;
++ typedef unsigned int stbtt_uint32;
++ typedef signed int stbtt_int32;
++ #endif
++
++ typedef char stbtt__check_size32[sizeof(stbtt_int32)==4 ? 1 : -1];
++ typedef char stbtt__check_size16[sizeof(stbtt_int16)==2 ? 1 : -1];
++
++ // #define your own STBTT_sort() to override this to avoid qsort
++ #ifndef STBTT_sort
++ #include <stdlib.h>
++ #define STBTT_sort(data,num_items,item_size,compare_func) qsort(data,num_items,item_size,compare_func)
++ #endif
++
++ // #define your own STBTT_ifloor/STBTT_iceil() to avoid math.h
++ #ifndef STBTT_ifloor
++ #include <math.h>
++ #define STBTT_ifloor(x) ((int) floor(x))
++ #define STBTT_iceil(x) ((int) ceil(x))
++ #endif
++
++ // #define your own functions "STBTT_malloc" / "STBTT_free" to avoid malloc.h
++ #ifndef STBTT_malloc
++ #include <malloc.h>
++ #define STBTT_malloc(x,u) malloc(x)
++ #define STBTT_free(x,u) free(x)
++ #endif
++
++ #ifndef STBTT_assert
++ #include <assert.h>
++ #define STBTT_assert(x) assert(x)
++ #endif
++
++ #ifndef STBTT_strlen
++ #include <string.h>
++ #define STBTT_strlen(x) strlen(x)
++ #endif
++
++ #ifndef STBTT_memcpy
++ #include <memory.h>
++ #define STBTT_memcpy memcpy
++ #define STBTT_memset memset
++ #endif
++#endif
++
++///////////////////////////////////////////////////////////////////////////////
++///////////////////////////////////////////////////////////////////////////////
++////
++//// INTERFACE
++////
++////
++
++#ifndef __STB_INCLUDE_STB_TRUETYPE_H__
++#define __STB_INCLUDE_STB_TRUETYPE_H__
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// TEXTURE BAKING API
++//
++// If you use this API, you only have to call two functions ever.
++//
++
++typedef struct
++{
++ unsigned short x0,y0,x1,y1; // coordinates of bbox in bitmap
++ float xoff,yoff,xadvance;
++} stbtt_bakedchar;
++
++extern int stbtt_BakeFontBitmap(const unsigned char *data, int offset, // font location (use offset=0 for plain .ttf)
++ float pixel_height, // height of font in pixels
++ unsigned char *pixels, int pw, int ph, // bitmap to be filled in
++ int first_char, int num_chars, // characters to bake
++ stbtt_bakedchar *chardata); // you allocate this, it's num_chars long
++// if return is positive, the first unused row of the bitmap
++// if return is negative, returns the negative of the number of characters that fit
++// if return is 0, no characters fit and no rows were used
++// This uses a very crappy packing.
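++
++// A minimal sketch of the return-value convention above (illustrative only;
++// 'ttf', 'pixels' and 'chardata' are placeholder buffers you allocate):
++#if 0
++int r = stbtt_BakeFontBitmap(ttf,0, 32.0f, pixels,512,512, 32,96, chardata);
++if (r > 0)      { /* success: r is the first unused row of 'pixels' */ }
++else if (r < 0) { /* only -r characters fit; retry with a larger bitmap */ }
++else            { /* r == 0: nothing fit */ }
++#endif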
++
++typedef struct
++{
++ float x0,y0,s0,t0; // top-left
++ float x1,y1,s1,t1; // bottom-right
++} stbtt_aligned_quad;
++
++extern void stbtt_GetBakedQuad(stbtt_bakedchar *chardata, int pw, int ph, // same data as above
++ int char_index, // character to display
++ float *xpos, float *ypos, // pointers to current position in screen pixel space
++ stbtt_aligned_quad *q, // output: quad to draw
++ int opengl_fillrule); // true if opengl fill rule; false if DX9 or earlier
++// Call GetBakedQuad with char_index = 'character - first_char', and it
++// creates the quad you need to draw and advances the current position.
++// It's inefficient; you might want to c&p it and optimize it.
++
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// FONT LOADING
++//
++//
++
++extern int stbtt_GetFontOffsetForIndex(const unsigned char *data, int index);
++// Each .ttf file may have more than one font. Each has a sequential index
++// number starting from 0. Call this function to get the font offset for a
++// given index; it returns -1 if the index is out of range. A regular .ttf
++// file will only define one font and it will always be at offset 0, so it will
++// return '0' for index 0, and -1 for all other indices. You can just skip
++// this step if you know it's that kind of font.
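++
++// Illustrative sketch: enumerating every font in a collection, assuming
++// 'ttc' points at a loaded .ttc/.ttf buffer:
++#if 0
++int i;
++for (i = 0; ; ++i) {
++   int off = stbtt_GetFontOffsetForIndex(ttc, i);
++   if (off < 0) break;   // -1: index out of range, so 'i' fonts total
++   // 'off' can be passed to stbtt_InitFont()
++}
++#endif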
++
++
++// The following structure is defined publicly so you can declare one on
++// the stack or as a global or etc.
++typedef struct
++{
++ void *userdata;
++ unsigned char *data; // pointer to .ttf file
++ int fontstart; // offset of start of font
++
++ int numGlyphs; // number of glyphs, needed for range checking
++
++ int loca,head,glyf,hhea,hmtx; // table locations as offset from start of .ttf
++ int index_map; // a cmap mapping for our chosen character encoding
++ int indexToLocFormat; // format needed to map from glyph index to glyph
++} stbtt_fontinfo;
++
++extern int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data, int offset);
++// Given an offset into the file that defines a font, this function builds
++// the necessary cached info for the rest of the system. You must allocate
++// the stbtt_fontinfo yourself, and stbtt_InitFont will fill it out. You don't
++// need to do anything special to free it, because the contents are a pure
++// cache with no additional data structures. Returns 0 on failure.
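++
++// Illustrative sketch: typical init with the failure check described above,
++// assuming 'ttf' stays loaded for the lifetime of 'info':
++#if 0
++stbtt_fontinfo info;
++if (!stbtt_InitFont(&info, ttf, stbtt_GetFontOffsetForIndex(ttf, 0)))
++   return 0; // not a font we can use
++#endif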
++
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// CHARACTER TO GLYPH-INDEX CONVERSION
++
++int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codepoint);
++// If you're going to perform multiple operations on the same character
++// and you want a speed-up, call this function with the character you're
++// going to process, then use glyph-based functions instead of the
++// codepoint-based functions.
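++
++// Illustrative sketch of that speed-up: look the glyph up once, then use
++// the glyph-based calls (all names here are placeholders):
++#if 0
++int g = stbtt_FindGlyphIndex(&info, codepoint); // do the cmap lookup once
++stbtt_GetGlyphHMetrics(&info, g, &advance, &lsb);
++stbtt_GetGlyphBitmapBox(&info, g, scale,scale, &x0,&y0,&x1,&y1);
++#endif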
++
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// CHARACTER PROPERTIES
++//
++
++extern float stbtt_ScaleForPixelHeight(const stbtt_fontinfo *info, float pixels);
++// computes a scale factor to produce a font whose "height" is 'pixels' tall.
++// Height is measured as the distance from the highest ascender to the lowest
++// descender; in other words, it's equivalent to calling stbtt_GetFontVMetrics
++// and computing:
++// scale = pixels / (ascent - descent)
++// so if you prefer to measure height by the ascent only, use a similar calculation.
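++
++// Illustrative sketch: the same scale computed by hand from the metrics,
++// per the formula above ('pixels' is a placeholder):
++#if 0
++int ascent, descent, lineGap;
++stbtt_GetFontVMetrics(&info, &ascent, &descent, &lineGap);
++scale = pixels / (float) (ascent - descent); // same as stbtt_ScaleForPixelHeight(&info, pixels)
++#endif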
++
++extern void stbtt_GetFontVMetrics(const stbtt_fontinfo *info, int *ascent, int *descent, int *lineGap);
++// ascent is the coordinate above the baseline the font extends; descent
++// is the coordinate below the baseline the font extends (i.e. it is typically negative)
++// lineGap is the spacing between one row's descent and the next row's ascent...
++// so you should advance the vertical position by "*ascent - *descent + *lineGap"
++// these are expressed in unscaled coordinates
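++
++// Illustrative sketch: advancing the baseline one text row using the
++// unscaled metrics above ('scale' and 'baseline_y' are placeholders):
++#if 0
++int ascent, descent, lineGap;
++stbtt_GetFontVMetrics(&info, &ascent, &descent, &lineGap);
++baseline_y += (int) ((ascent - descent + lineGap) * scale); // next row's baseline
++#endif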
++
++extern void stbtt_GetCodepointHMetrics(const stbtt_fontinfo *info, int codepoint, int *advanceWidth, int *leftSideBearing);
++// leftSideBearing is the offset from the current horizontal position to the left edge of the character
++// advanceWidth is the offset from the current horizontal position to the next horizontal position
++// these are expressed in unscaled coordinates
++
++extern int stbtt_GetCodepointKernAdvance(const stbtt_fontinfo *info, int ch1, int ch2);
++// an additional amount to add to the 'advance' value between ch1 and ch2
++// @TODO: for now always returns 0!
++
++extern int stbtt_GetCodepointBox(const stbtt_fontinfo *info, int codepoint, int *x0, int *y0, int *x1, int *y1);
++// Gets the bounding box of the visible part of the glyph, in unscaled coordinates
++
++extern void stbtt_GetGlyphHMetrics(const stbtt_fontinfo *info, int glyph_index, int *advanceWidth, int *leftSideBearing);
++extern int stbtt_GetGlyphKernAdvance(const stbtt_fontinfo *info, int glyph1, int glyph2);
++extern int stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1);
++// as above, but takes one or more glyph indices for greater efficiency
++
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// GLYPH SHAPES (you probably don't need these, but they have to go before
++// the bitmaps for C declaration-order reasons)
++//
++
++#ifndef STBTT_vmove // you can predefine these to use different values (but why?)
++ enum {
++ STBTT_vmove=1,
++ STBTT_vline,
++ STBTT_vcurve
++ };
++#endif
++
++#ifndef stbtt_vertex // you can predefine this to use different values
++ // (we share this with other code at RAD)
++ #define stbtt_vertex_type short // can't use stbtt_int16 because that's not visible in the header file
++ typedef struct
++ {
++ stbtt_vertex_type x,y,cx,cy;
++ unsigned char type,padding;
++ } stbtt_vertex;
++#endif
++
++extern int stbtt_GetCodepointShape(const stbtt_fontinfo *info, int unicode_codepoint, stbtt_vertex **vertices);
++extern int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **vertices);
++// returns # of vertices and fills *vertices with the pointer to them
++// these are expressed in "unscaled" coordinates
++
++extern void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *vertices);
++// frees the data allocated above
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// BITMAP RENDERING
++//
++
++extern void stbtt_FreeBitmap(unsigned char *bitmap, void *userdata);
++// frees the bitmap allocated below
++
++extern unsigned char *stbtt_GetCodepointBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int codepoint, int *width, int *height, int *xoff, int *yoff);
++// allocates a large-enough single-channel 8bpp bitmap and renders the
++// specified character/glyph at the specified scale into it, with
++// antialiasing. 0 is no coverage (transparent), 255 is fully covered (opaque).
++// *width & *height are filled out with the width & height of the bitmap,
++// which is stored left-to-right, top-to-bottom.
++//
++// xoff/yoff are the offset in pixel space from the glyph origin to the top-left of the bitmap
++
++extern void stbtt_MakeCodepointBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int codepoint);
++// the same as above, but you pass in storage for the bitmap in the form
++// of 'output', with row spacing of 'out_stride' bytes. the bitmap is
++// clipped to out_w/out_h bytes. call the next function to get the
++// height and width and positioning info
++
++extern void stbtt_GetCodepointBitmapBox(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
++// get the bbox of the bitmap centered around the glyph origin; so the
++// bitmap width is ix1-ix0, height is iy1-iy0, and location to place
++// the bitmap top left is (leftSideBearing*scale,iy0).
++// (Note that the bitmap uses y-increases-down, but the shape uses
++// y-increases-up, so CodepointBitmapBox and CodepointBox are inverted.)
++
++extern unsigned char *stbtt_GetGlyphBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int glyph, int *width, int *height, int *xoff, int *yoff);
++extern void stbtt_GetGlyphBitmapBox(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
++extern void stbtt_MakeGlyphBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int glyph);
++
++//extern void stbtt_get_true_bbox(stbtt_vertex *vertices, int num_verts, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1);
++
++// @TODO: don't expose this structure
++typedef struct
++{
++ int w,h,stride;
++ unsigned char *pixels;
++} stbtt__bitmap;
++
++extern void stbtt_Rasterize(stbtt__bitmap *result, float flatness_in_pixels, stbtt_vertex *vertices, int num_verts, float scale_x, float scale_y, int x_off, int y_off, int invert, void *userdata);
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// Finding the right font...
++//
++// You should really just solve this offline, keep your own tables
++// of what font is what, and don't try to get it out of the .ttf file.
++// That's because getting it out of the .ttf file is really hard, because
++// the names in the file can appear in many possible encodings, in many
++// possible languages, and e.g. if you need a case-insensitive comparison,
++// the details of that depend on the encoding & language in a complex way
++// (actually underspecified in truetype, but also gigantic).
++//
++// But you can use the provided functions in two possible ways:
++// stbtt_FindMatchingFont() will use *case-sensitive* comparisons on
++// unicode-encoded names to try to find the font you want;
++// you can run this before calling stbtt_InitFont()
++//
++// stbtt_GetFontNameString() lets you get any of the various strings
++// from the file yourself and do your own comparisons on them.
++// You have to have called stbtt_InitFont() first.
++
++
++extern int stbtt_FindMatchingFont(const unsigned char *fontdata, const char *name, int flags);
++// returns the offset (not index) of the font that matches, or -1 if none
++// if you use STBTT_MACSTYLE_DONTCARE, use a font name like "Arial Bold".
++// if you use any other flag, use a font name like "Arial"; this checks
++// the 'macStyle' header field; i don't know if fonts set this consistently
++#define STBTT_MACSTYLE_DONTCARE 0
++#define STBTT_MACSTYLE_BOLD 1
++#define STBTT_MACSTYLE_ITALIC 2
++#define STBTT_MACSTYLE_UNDERSCORE 4
++#define STBTT_MACSTYLE_NONE 8 // <= not same as 0, this makes us check the bitfield is 0
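++
++// Illustrative sketch of the two naming conventions described above
++// ('fontdata' is a placeholder buffer):
++#if 0
++int off_a = stbtt_FindMatchingFont(fontdata, "Arial Bold", STBTT_MACSTYLE_DONTCARE);
++int off_b = stbtt_FindMatchingFont(fontdata, "Arial", STBTT_MACSTYLE_BOLD);
++// either offset, if >= 0, can be handed straight to stbtt_InitFont()
++#endif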
++
++extern int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const char *s2, int len2);
++// returns 1/0 whether the first string interpreted as utf8 is identical to
++// the second string interpreted as big-endian utf16... useful for strings from next func
++
++extern char *stbtt_GetFontNameString(const stbtt_fontinfo *font, int *length, int platformID, int encodingID, int languageID, int nameID);
++// returns the string (which may be big-endian double byte, e.g. for unicode)
++// and puts the length in bytes in *length.
++//
++// some of the values for the IDs are below; for more see the truetype spec:
++// http://developer.apple.com/textfonts/TTRefMan/RM06/Chap6name.html
++// http://www.microsoft.com/typography/otspec/name.htm
++
++enum { // platformID
++ STBTT_PLATFORM_ID_UNICODE =0,
++ STBTT_PLATFORM_ID_MAC =1,
++ STBTT_PLATFORM_ID_ISO =2,
++ STBTT_PLATFORM_ID_MICROSOFT =3
++};
++
++enum { // encodingID for STBTT_PLATFORM_ID_UNICODE
++ STBTT_UNICODE_EID_UNICODE_1_0 =0,
++ STBTT_UNICODE_EID_UNICODE_1_1 =1,
++ STBTT_UNICODE_EID_ISO_10646 =2,
++ STBTT_UNICODE_EID_UNICODE_2_0_BMP=3,
++ STBTT_UNICODE_EID_UNICODE_2_0_FULL=4,
++};
++
++enum { // encodingID for STBTT_PLATFORM_ID_MICROSOFT
++ STBTT_MS_EID_SYMBOL =0,
++ STBTT_MS_EID_UNICODE_BMP =1,
++ STBTT_MS_EID_SHIFTJIS =2,
++ STBTT_MS_EID_UNICODE_FULL =10,
++};
++
++enum { // encodingID for STBTT_PLATFORM_ID_MAC; same as Script Manager codes
++ STBTT_MAC_EID_ROMAN =0, STBTT_MAC_EID_ARABIC =4,
++ STBTT_MAC_EID_JAPANESE =1, STBTT_MAC_EID_HEBREW =5,
++ STBTT_MAC_EID_CHINESE_TRAD =2, STBTT_MAC_EID_GREEK =6,
++ STBTT_MAC_EID_KOREAN =3, STBTT_MAC_EID_RUSSIAN =7,
++};
++
++enum { // languageID for STBTT_PLATFORM_ID_MICROSOFT; same as LCID...
++ // problematic because there are e.g. 16 english LCIDs and 16 arabic LCIDs
++ STBTT_MS_LANG_ENGLISH =0x0409, STBTT_MS_LANG_ITALIAN =0x0410,
++ STBTT_MS_LANG_CHINESE =0x0804, STBTT_MS_LANG_JAPANESE =0x0411,
++ STBTT_MS_LANG_DUTCH =0x0413, STBTT_MS_LANG_KOREAN =0x0412,
++ STBTT_MS_LANG_FRENCH =0x040c, STBTT_MS_LANG_RUSSIAN =0x0419,
++ STBTT_MS_LANG_GERMAN =0x0407, STBTT_MS_LANG_SPANISH =0x0409,
++ STBTT_MS_LANG_HEBREW =0x040d, STBTT_MS_LANG_SWEDISH =0x041D,
++};
++
++enum { // languageID for STBTT_PLATFORM_ID_MAC
++ STBTT_MAC_LANG_ENGLISH =0 , STBTT_MAC_LANG_JAPANESE =11,
++ STBTT_MAC_LANG_ARABIC =12, STBTT_MAC_LANG_KOREAN =23,
++ STBTT_MAC_LANG_DUTCH =4 , STBTT_MAC_LANG_RUSSIAN =32,
++ STBTT_MAC_LANG_FRENCH =1 , STBTT_MAC_LANG_SPANISH =6 ,
++ STBTT_MAC_LANG_GERMAN =2 , STBTT_MAC_LANG_SWEDISH =5 ,
++ STBTT_MAC_LANG_HEBREW =10, STBTT_MAC_LANG_CHINESE_SIMPLIFIED =33,
++ STBTT_MAC_LANG_ITALIAN =3 , STBTT_MAC_LANG_CHINESE_TRAD =19,
++};
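++
++// Illustrative sketch: fetching the family name (nameID 1) with the IDs
++// above, then comparing it against a UTF-8 string:
++#if 0
++int len;
++char *name = stbtt_GetFontNameString(&info, &len, STBTT_PLATFORM_ID_MICROSOFT,
++                STBTT_MS_EID_UNICODE_BMP, STBTT_MS_LANG_ENGLISH, 1);
++if (name && stbtt_CompareUTF8toUTF16_bigendian("Arial", 5, name, len)) {
++   // matched; 'name' is big-endian UTF-16, 'len' is its length in bytes
++}
++#endif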
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif // __STB_INCLUDE_STB_TRUETYPE_H__
++
++///////////////////////////////////////////////////////////////////////////////
++///////////////////////////////////////////////////////////////////////////////
++////
++//// IMPLEMENTATION
++////
++////
++
++#ifdef STB_TRUETYPE_IMPLEMENTATION
++
++//////////////////////////////////////////////////////////////////////////
++//
++// accessors to parse data from file
++//
++
++// on platforms that don't allow misaligned reads, if we want to allow
++// truetype fonts that aren't padded to alignment, define ALLOW_UNALIGNED_TRUETYPE
++
++#define ttBYTE(p) (* (stbtt_uint8 *) (p))
++#define ttCHAR(p) (* (stbtt_int8 *) (p))
++#define ttFixed(p) ttLONG(p)
++
++#if defined(STB_TRUETYPE_BIGENDIAN) && !defined(ALLOW_UNALIGNED_TRUETYPE)
++
++ #define ttUSHORT(p) (* (stbtt_uint16 *) (p))
++ #define ttSHORT(p) (* (stbtt_int16 *) (p))
++ #define ttULONG(p) (* (stbtt_uint32 *) (p))
++ #define ttLONG(p) (* (stbtt_int32 *) (p))
++
++#else
++
++ stbtt_uint16 ttUSHORT(const stbtt_uint8 *p) { return p[0]*256 + p[1]; }
++ stbtt_int16 ttSHORT(const stbtt_uint8 *p) { return p[0]*256 + p[1]; }
++ stbtt_uint32 ttULONG(const stbtt_uint8 *p) { return (p[0]<<24) + (p[1]<<16) + (p[2]<<8) + p[3]; }
++ stbtt_int32 ttLONG(const stbtt_uint8 *p) { return (p[0]<<24) + (p[1]<<16) + (p[2]<<8) + p[3]; }
++
++#endif
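++
++// Worked example (a reading aid, not library code): all TrueType data is
++// big-endian, so for bytes p[0] == 0x01 and p[1] == 0x02, ttUSHORT(p) yields
++// 0x01*256 + 0x02 == 258 regardless of the host byte order.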
++
++#define stbtt_tag4(p,c0,c1,c2,c3) ((p)[0] == (c0) && (p)[1] == (c1) && (p)[2] == (c2) && (p)[3] == (c3))
++#define stbtt_tag(p,str) stbtt_tag4(p,str[0],str[1],str[2],str[3])
++
++static int stbtt__isfont(const stbtt_uint8 *font)
++{
++ // check the version number
++   if (stbtt_tag4(font, '1',0,0,0)) return 1; // TrueType 1 ("1\0\0\0"; stbtt_tag on the literal "1" would read past its end)
++ if (stbtt_tag(font, "typ1")) return 1; // TrueType with type 1 font -- we don't support this!
++ if (stbtt_tag(font, "OTTO")) return 1; // OpenType with CFF
++ if (stbtt_tag4(font, 0,1,0,0)) return 1; // OpenType 1.0
++ return 0;
++}
++
++// @OPTIMIZE: binary search
++static stbtt_uint32 stbtt__find_table(stbtt_uint8 *data, stbtt_uint32 fontstart, const char *tag)
++{
++ stbtt_int32 num_tables = ttUSHORT(data+fontstart+4);
++ stbtt_uint32 tabledir = fontstart + 12;
++ stbtt_int32 i;
++ for (i=0; i < num_tables; ++i) {
++ stbtt_uint32 loc = tabledir + 16*i;
++ if (stbtt_tag(data+loc+0, tag))
++ return ttULONG(data+loc+8);
++ }
++ return 0;
++}
++
++int stbtt_GetFontOffsetForIndex(const unsigned char *font_collection, int index)
++{
++ // if it's just a font, there's only one valid index
++ if (stbtt__isfont(font_collection))
++ return index == 0 ? 0 : -1;
++
++ // check if it's a TTC
++ if (stbtt_tag(font_collection, "ttcf")) {
++      // check the TTC version (1.0 or 2.0)
++      if (ttULONG(font_collection+4) == 0x00010000 || ttULONG(font_collection+4) == 0x00020000) {
++         stbtt_int32 n = ttLONG(font_collection+8);
++         if (index >= n)
++            return -1;
++         return ttULONG(font_collection+12+index*4); // the offset table holds one 4-byte offset per font
++ }
++ }
++ return -1;
++}
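++
++// Hedged usage sketch: enumerate every font in a buffer (a plain .ttf has
++// exactly one, a .ttc collection has several) until the offset query fails.
++// 'data' is assumed to hold the entire file.
++#if 0
++   int i;
++   for (i = 0; ; ++i) {
++      int off = stbtt_GetFontOffsetForIndex(data, i);
++      if (off < 0) break; // no more fonts in the collection
++      // 'off' is the fontstart value to pass to stbtt_InitFont
++   }
++#endif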
++
++int stbtt_InitFont(stbtt_fontinfo *info, const unsigned char *data2, int fontstart)
++{
++ stbtt_uint8 *data = (stbtt_uint8 *) data2;
++ stbtt_uint32 cmap, t;
++ stbtt_int32 i,numTables;
++
++ info->data = data;
++ info->fontstart = fontstart;
++
++ cmap = stbtt__find_table(data, fontstart, "cmap");
++ info->loca = stbtt__find_table(data, fontstart, "loca");
++ info->head = stbtt__find_table(data, fontstart, "head");
++ info->glyf = stbtt__find_table(data, fontstart, "glyf");
++ info->hhea = stbtt__find_table(data, fontstart, "hhea");
++ info->hmtx = stbtt__find_table(data, fontstart, "hmtx");
++ if (!cmap || !info->loca || !info->head || !info->glyf || !info->hhea || !info->hmtx)
++ return 0;
++
++ t = stbtt__find_table(data, fontstart, "maxp");
++ if (t)
++ info->numGlyphs = ttUSHORT(data+t+4);
++ else
++ info->numGlyphs = 0xffff;
++
++   // find a cmap encoding table we understand *now* to avoid searching
++   // later; the chosen table is then used for every glyph lookup.
++   // (todo: could make this installable)
++ numTables = ttUSHORT(data + cmap + 2);
++ info->index_map = 0;
++ for (i=0; i < numTables; ++i) {
++ stbtt_uint32 encoding_record = cmap + 4 + 8 * i;
++ // find an encoding we understand:
++ switch(ttUSHORT(data+encoding_record)) {
++ case STBTT_PLATFORM_ID_MICROSOFT:
++ switch (ttUSHORT(data+encoding_record+2)) {
++ case STBTT_MS_EID_UNICODE_BMP:
++ case STBTT_MS_EID_UNICODE_FULL:
++ // MS/Unicode
++ info->index_map = cmap + ttULONG(data+encoding_record+4);
++ break;
++ }
++ break;
++ }
++ }
++ if (info->index_map == 0)
++ return 0;
++
++ info->indexToLocFormat = ttUSHORT(data+info->head + 50);
++ return 1;
++}
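++
++// Hedged usage sketch: read a whole font file into memory and initialize it.
++// The buffer size and file name are placeholders, not part of the API.
++#if 0
++   static unsigned char ttf_buffer[1<<22];
++   stbtt_fontinfo font;
++   FILE *fp = fopen("font.ttf", "rb");
++   fread(ttf_buffer, 1, sizeof(ttf_buffer), fp);
++   fclose(fp);
++   if (!stbtt_InitFont(&font, ttf_buffer, stbtt_GetFontOffsetForIndex(ttf_buffer, 0)))
++      { /* failed: a required table is missing or no usable cmap encoding was found */ }
++#endif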
++
++int stbtt_FindGlyphIndex(const stbtt_fontinfo *info, int unicode_codepoint)
++{
++ stbtt_uint8 *data = info->data;
++ stbtt_uint32 index_map = info->index_map;
++
++ stbtt_uint16 format = ttUSHORT(data + index_map + 0);
++ if (format == 0) { // apple byte encoding
++ stbtt_int32 bytes = ttUSHORT(data + index_map + 2);
++ if (unicode_codepoint < bytes-6)
++ return ttBYTE(data + index_map + 6 + unicode_codepoint);
++ return 0;
++ } else if (format == 6) {
++ stbtt_uint32 first = ttUSHORT(data + index_map + 6);
++ stbtt_uint32 count = ttUSHORT(data + index_map + 8);
++ if ((stbtt_uint32) unicode_codepoint >= first && (stbtt_uint32) unicode_codepoint < first+count)
++ return ttUSHORT(data + index_map + 10 + (unicode_codepoint - first)*2);
++ return 0;
++ } else if (format == 2) {
++ STBTT_assert(0); // @TODO: high-byte mapping for japanese/chinese/korean
++ return 0;
++ } else if (format == 4) { // standard mapping for windows fonts: binary search collection of ranges
++ stbtt_uint16 segcount = ttUSHORT(data+index_map+6) >> 1;
++ stbtt_uint16 searchRange = ttUSHORT(data+index_map+8) >> 1;
++ stbtt_uint16 entrySelector = ttUSHORT(data+index_map+10);
++ stbtt_uint16 rangeShift = ttUSHORT(data+index_map+12) >> 1;
++ stbtt_uint16 item, offset, start, end;
++
++ // do a binary search of the segments
++ stbtt_uint32 endCount = index_map + 14;
++ stbtt_uint32 search = endCount;
++
++ if (unicode_codepoint > 0xffff)
++ return 0;
++
++ // they lie from endCount .. endCount + segCount
++ // but searchRange is the nearest power of two, so...
++ if (unicode_codepoint >= ttUSHORT(data + search + rangeShift*2))
++ search += rangeShift*2;
++
++ // now decrement to bias correctly to find smallest
++ search -= 2;
++      while (entrySelector) {
++         stbtt_uint16 end;
++         searchRange >>= 1;
++         end = ttUSHORT(data + search + searchRange*2);
++         if (unicode_codepoint > end)
++            search += searchRange*2;
++         --entrySelector;
++      }
++ search += 2;
++
++ item = (stbtt_uint16) ((search - endCount) >> 1);
++
++ STBTT_assert(unicode_codepoint <= ttUSHORT(data + endCount + 2*item));
++ start = ttUSHORT(data + index_map + 14 + segcount*2 + 2 + 2*item);
++ end = ttUSHORT(data + index_map + 14 + 2 + 2*item);
++ if (unicode_codepoint < start)
++ return 0;
++
++ offset = ttUSHORT(data + index_map + 14 + segcount*6 + 2 + 2*item);
++ if (offset == 0)
++ return unicode_codepoint + ttSHORT(data + index_map + 14 + segcount*4 + 2 + 2*item);
++
++ return ttUSHORT(data + offset + (unicode_codepoint-start)*2 + index_map + 14 + segcount*6 + 2 + 2*item);
++ } else if (format == 12) {
++      stbtt_uint32 ngroups = ttULONG(data+index_map+12); // nGroups is a 32-bit count at offset 12
++      stbtt_int32 low,high;
++      low = 0; high = (stbtt_int32)ngroups - 1;
++ // Binary search the right group.
++ while (low <= high) {
++         stbtt_int32 mid = low + ((high-low) >> 1); // rounds down, so low <= mid <= high
++ stbtt_uint32 start_char = ttULONG(data+index_map+16+mid*12);
++ stbtt_uint32 end_char = ttULONG(data+index_map+16+mid*12+4);
++ if ((stbtt_uint32) unicode_codepoint < start_char)
++ high = mid-1;
++ else if ((stbtt_uint32) unicode_codepoint > end_char)
++ low = mid+1;
++ else {
++ stbtt_uint32 start_glyph = ttULONG(data+index_map+16+mid*12+8);
++ return start_glyph + unicode_codepoint-start_char;
++ }
++ }
++ return 0; // not found
++ }
++ // @TODO
++ STBTT_assert(0);
++ return 0;
++}
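++
++// Hedged usage note: when the same character is drawn repeatedly, resolve the
++// codepoint to a glyph index once and use the Glyph* variants of the API, so
++// the cmap lookup above is not repeated. Glyph 0 is the missing-glyph slot.
++#if 0
++   int glyph = stbtt_FindGlyphIndex(&font, 0x00E9); // U+00E9, e with acute
++   if (glyph == 0)
++      { /* codepoint not covered by this font */ }
++#endif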
++
++int stbtt_GetCodepointShape(const stbtt_fontinfo *info, int unicode_codepoint, stbtt_vertex **vertices)
++{
++ return stbtt_GetGlyphShape(info, stbtt_FindGlyphIndex(info, unicode_codepoint), vertices);
++}
++
++static void stbtt_setvertex(stbtt_vertex *v, stbtt_uint8 type, stbtt_int16 x, stbtt_int16 y, stbtt_int16 cx, stbtt_int16 cy)
++{
++ v->type = type;
++ v->x = x;
++ v->y = y;
++ v->cx = cx;
++ v->cy = cy;
++}
++
++static int stbtt__GetGlyfOffset(const stbtt_fontinfo *info, int glyph_index)
++{
++ int g1,g2;
++
++ if (glyph_index >= info->numGlyphs) return -1; // glyph index out of range
++ if (info->indexToLocFormat >= 2) return -1; // unknown index->glyph map format
++
++ if (info->indexToLocFormat == 0) {
++ g1 = info->glyf + ttUSHORT(info->data + info->loca + glyph_index * 2) * 2;
++ g2 = info->glyf + ttUSHORT(info->data + info->loca + glyph_index * 2 + 2) * 2;
++ } else {
++ g1 = info->glyf + ttULONG (info->data + info->loca + glyph_index * 4);
++ g2 = info->glyf + ttULONG (info->data + info->loca + glyph_index * 4 + 4);
++ }
++
++ return g1==g2 ? -1 : g1; // if length is 0, return -1
++}
++
++int stbtt_GetGlyphBox(const stbtt_fontinfo *info, int glyph_index, int *x0, int *y0, int *x1, int *y1)
++{
++ int g = stbtt__GetGlyfOffset(info, glyph_index);
++ if (g < 0) return 0;
++
++ if (x0) *x0 = ttSHORT(info->data + g + 2);
++ if (y0) *y0 = ttSHORT(info->data + g + 4);
++ if (x1) *x1 = ttSHORT(info->data + g + 6);
++ if (y1) *y1 = ttSHORT(info->data + g + 8);
++ return 1;
++}
++
++int stbtt_GetCodepointBox(const stbtt_fontinfo *info, int codepoint, int *x0, int *y0, int *x1, int *y1)
++{
++ return stbtt_GetGlyphBox(info, stbtt_FindGlyphIndex(info,codepoint), x0,y0,x1,y1);
++}
++
++int stbtt_GetGlyphShape(const stbtt_fontinfo *info, int glyph_index, stbtt_vertex **pvertices)
++{
++ stbtt_int16 numberOfContours;
++ stbtt_uint8 *endPtsOfContours;
++ stbtt_uint8 *data = info->data;
++ stbtt_vertex *vertices=0;
++ int num_vertices=0;
++ int g = stbtt__GetGlyfOffset(info, glyph_index);
++
++ *pvertices = NULL;
++
++ if (g < 0) return 0;
++
++ numberOfContours = ttSHORT(data + g);
++
++ if (numberOfContours > 0) {
++ stbtt_uint8 flags=0,flagcount;
++ stbtt_int32 ins, i,j=0,m,n, next_move, was_off=0, off;
++ stbtt_int16 x,y,cx,cy,sx,sy;
++ stbtt_uint8 *points;
++ endPtsOfContours = (data + g + 10);
++ ins = ttUSHORT(data + g + 10 + numberOfContours * 2);
++ points = data + g + 10 + numberOfContours * 2 + 2 + ins;
++
++ n = 1+ttUSHORT(endPtsOfContours + numberOfContours*2-2);
++
++ m = n + numberOfContours; // a loose bound on how many vertices we might need
++ vertices = (stbtt_vertex *) STBTT_malloc(m * sizeof(vertices[0]), info->userdata);
++ if (vertices == 0)
++ return 0;
++
++ next_move = 0;
++ flagcount=0;
++
++ // in first pass, we load uninterpreted data into the allocated array
++ // above, shifted to the end of the array so we won't overwrite it when
++ // we create our final data starting from the front
++
++ off = m - n; // starting offset for uninterpreted data, regardless of how m ends up being calculated
++
++ // first load flags
++
++ for (i=0; i < n; ++i) {
++ if (flagcount == 0) {
++ flags = *points++;
++ if (flags & 8)
++ flagcount = *points++;
++ } else
++ --flagcount;
++ vertices[off+i].type = flags;
++ }
++
++ // now load x coordinates
++ x=0;
++ for (i=0; i < n; ++i) {
++ flags = vertices[off+i].type;
++ if (flags & 2) {
++ stbtt_int16 dx = *points++;
++            x += (flags & 16) ? dx : -dx; // bit 4 set: the short x-delta is positive; clear: negative
++ } else {
++ if (!(flags & 16)) {
++ x = x + (stbtt_int16) (points[0]*256 + points[1]);
++ points += 2;
++ }
++ }
++ vertices[off+i].x = x;
++ }
++
++ // now load y coordinates
++ y=0;
++ for (i=0; i < n; ++i) {
++ flags = vertices[off+i].type;
++ if (flags & 4) {
++ stbtt_int16 dy = *points++;
++            y += (flags & 32) ? dy : -dy; // bit 5 set: the short y-delta is positive; clear: negative
++ } else {
++ if (!(flags & 32)) {
++ y = y + (stbtt_int16) (points[0]*256 + points[1]);
++ points += 2;
++ }
++ }
++ vertices[off+i].y = y;
++ }
++
++ // now convert them to our format
++ num_vertices=0;
++ sx = sy = cx = cy = 0;
++ for (i=0; i < n; ++i) {
++ flags = vertices[off+i].type;
++ x = (stbtt_int16) vertices[off+i].x;
++ y = (stbtt_int16) vertices[off+i].y;
++ if (next_move == i) {
++ // when we get to the end, we have to close the shape explicitly
++ if (i != 0) {
++ if (was_off)
++ stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve,sx,sy,cx,cy);
++ else
++ stbtt_setvertex(&vertices[num_vertices++], STBTT_vline,sx,sy,0,0);
++ }
++
++ // now start the new one
++ stbtt_setvertex(&vertices[num_vertices++], STBTT_vmove,x,y,0,0);
++ next_move = 1 + ttUSHORT(endPtsOfContours+j*2);
++ ++j;
++ was_off = 0;
++ sx = x;
++ sy = y;
++ } else {
++ if (!(flags & 1)) { // if it's a curve
++ if (was_off) // two off-curve control points in a row means interpolate an on-curve midpoint
++ stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve, (cx+x)>>1, (cy+y)>>1, cx, cy);
++ cx = x;
++ cy = y;
++ was_off = 1;
++ } else {
++ if (was_off)
++ stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve, x,y, cx, cy);
++ else
++ stbtt_setvertex(&vertices[num_vertices++], STBTT_vline, x,y,0,0);
++ was_off = 0;
++ }
++ }
++ }
++ if (i != 0) {
++ if (was_off)
++ stbtt_setvertex(&vertices[num_vertices++], STBTT_vcurve,sx,sy,cx,cy);
++ else
++ stbtt_setvertex(&vertices[num_vertices++], STBTT_vline,sx,sy,0,0);
++ }
++ } else if (numberOfContours == -1) {
++ // Compound shapes.
++ int more = 1;
++ stbtt_uint8 *comp = data + g + 10;
++ num_vertices = 0;
++ vertices = 0;
++ while (more) {
++ stbtt_uint16 flags, gidx;
++ int comp_num_verts = 0, i;
++ stbtt_vertex *comp_verts = 0, *tmp = 0;
++ float mtx[6] = {1,0,0,1,0,0}, m, n;
++
++ flags = ttSHORT(comp); comp+=2;
++ gidx = ttSHORT(comp); comp+=2;
++
++ if (flags & 2) { // XY values
++ if (flags & 1) { // shorts
++ mtx[4] = ttSHORT(comp); comp+=2;
++ mtx[5] = ttSHORT(comp); comp+=2;
++ } else {
++ mtx[4] = ttCHAR(comp); comp+=1;
++ mtx[5] = ttCHAR(comp); comp+=1;
++ }
++ }
++ else {
++ // @TODO handle matching point
++ STBTT_assert(0);
++ }
++ if (flags & (1<<3)) { // WE_HAVE_A_SCALE
++ mtx[0] = mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
++ mtx[1] = mtx[2] = 0;
++ } else if (flags & (1<<6)) { // WE_HAVE_AN_X_AND_YSCALE
++ mtx[0] = ttSHORT(comp)/16384.0f; comp+=2;
++ mtx[1] = mtx[2] = 0;
++ mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
++ } else if (flags & (1<<7)) { // WE_HAVE_A_TWO_BY_TWO
++ mtx[0] = ttSHORT(comp)/16384.0f; comp+=2;
++ mtx[1] = ttSHORT(comp)/16384.0f; comp+=2;
++ mtx[2] = ttSHORT(comp)/16384.0f; comp+=2;
++ mtx[3] = ttSHORT(comp)/16384.0f; comp+=2;
++ }
++
++ // Find transformation scales.
++ m = (float) sqrt(mtx[0]*mtx[0] + mtx[1]*mtx[1]);
++ n = (float) sqrt(mtx[2]*mtx[2] + mtx[3]*mtx[3]);
++
++ // Get indexed glyph.
++ comp_num_verts = stbtt_GetGlyphShape(info, gidx, &comp_verts);
++ if (comp_num_verts > 0) {
++ // Transform vertices.
++ for (i = 0; i < comp_num_verts; ++i) {
++ stbtt_vertex* v = &comp_verts[i];
++ stbtt_vertex_type x,y;
++ x=v->x; y=v->y;
++ v->x = (stbtt_vertex_type)(m * (mtx[0]*x + mtx[2]*y + mtx[4]));
++ v->y = (stbtt_vertex_type)(n * (mtx[1]*x + mtx[3]*y + mtx[5]));
++ x=v->cx; y=v->cy;
++ v->cx = (stbtt_vertex_type)(m * (mtx[0]*x + mtx[2]*y + mtx[4]));
++ v->cy = (stbtt_vertex_type)(n * (mtx[1]*x + mtx[3]*y + mtx[5]));
++ }
++ // Append vertices.
++ tmp = (stbtt_vertex*)STBTT_malloc((num_vertices+comp_num_verts)*sizeof(stbtt_vertex), info->userdata);
++ if (!tmp) {
++ if (vertices) STBTT_free(vertices, info->userdata);
++ if (comp_verts) STBTT_free(comp_verts, info->userdata);
++ return 0;
++ }
++ if (num_vertices > 0) memcpy(tmp, vertices, num_vertices*sizeof(stbtt_vertex));
++ memcpy(tmp+num_vertices, comp_verts, comp_num_verts*sizeof(stbtt_vertex));
++ if (vertices) STBTT_free(vertices, info->userdata);
++ vertices = tmp;
++ STBTT_free(comp_verts, info->userdata);
++ num_vertices += comp_num_verts;
++ }
++ // More components ?
++ more = flags & (1<<5);
++ }
++ } else if (numberOfContours < 0) {
++ // @TODO other compound variations?
++ STBTT_assert(0);
++ } else {
++      // numberOfContours == 0, do nothing
++ }
++
++ *pvertices = vertices;
++ return num_vertices;
++}
++
++void stbtt_GetGlyphHMetrics(const stbtt_fontinfo *info, int glyph_index, int *advanceWidth, int *leftSideBearing)
++{
++ stbtt_uint16 numOfLongHorMetrics = ttUSHORT(info->data+info->hhea + 34);
++ if (glyph_index < numOfLongHorMetrics) {
++ if (advanceWidth) *advanceWidth = ttSHORT(info->data + info->hmtx + 4*glyph_index);
++ if (leftSideBearing) *leftSideBearing = ttSHORT(info->data + info->hmtx + 4*glyph_index + 2);
++ } else {
++ if (advanceWidth) *advanceWidth = ttSHORT(info->data + info->hmtx + 4*(numOfLongHorMetrics-1));
++ if (leftSideBearing) *leftSideBearing = ttSHORT(info->data + info->hmtx + 4*numOfLongHorMetrics + 2*(glyph_index - numOfLongHorMetrics));
++ }
++}
++
++int stbtt_GetGlyphKernAdvance(const stbtt_fontinfo * /*info*/, int /*glyph1*/, int /*glyph2*/)
++{
++ return 0;
++}
++
++int stbtt_GetCodepointKernAdvance(const stbtt_fontinfo * /*info*/, int /*ch1*/, int /*ch2*/)
++{
++ return 0;
++}
++
++void stbtt_GetCodepointHMetrics(const stbtt_fontinfo *info, int codepoint, int *advanceWidth, int *leftSideBearing)
++{
++ stbtt_GetGlyphHMetrics(info, stbtt_FindGlyphIndex(info,codepoint), advanceWidth, leftSideBearing);
++}
++
++void stbtt_GetFontVMetrics(const stbtt_fontinfo *info, int *ascent, int *descent, int *lineGap)
++{
++ if (ascent ) *ascent = ttSHORT(info->data+info->hhea + 4);
++ if (descent) *descent = ttSHORT(info->data+info->hhea + 6);
++ if (lineGap) *lineGap = ttSHORT(info->data+info->hhea + 8);
++}
++
++float stbtt_ScaleForPixelHeight(const stbtt_fontinfo *info, float height)
++{
++ int fheight = ttSHORT(info->data + info->hhea + 4) - ttSHORT(info->data + info->hhea + 6);
++ return (float) height / fheight;
++}
++
++void stbtt_FreeShape(const stbtt_fontinfo *info, stbtt_vertex *v)
++{
++ STBTT_free(v, info->userdata);
++}
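++
++// Hedged usage sketch: walk the vertex list produced by stbtt_GetGlyphShape.
++// STBTT_vmove starts a contour, STBTT_vline is a straight edge, and
++// STBTT_vcurve is a quadratic bezier whose off-curve control point is (cx,cy).
++#if 0
++   stbtt_vertex *verts;
++   int i, n = stbtt_GetGlyphShape(&font, glyph, &verts);
++   for (i = 0; i < n; ++i) {
++      switch (verts[i].type) {
++         case STBTT_vmove:  /* begin a contour at (verts[i].x, verts[i].y) */ break;
++         case STBTT_vline:  /* line to (verts[i].x, verts[i].y) */ break;
++         case STBTT_vcurve: /* curve to (x,y) via control point (cx,cy) */ break;
++      }
++   }
++   stbtt_FreeShape(&font, verts);
++#endif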
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// antialiasing software rasterizer
++//
++
++void stbtt_GetGlyphBitmapBox(const stbtt_fontinfo *font, int glyph, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1)
++{
++ int x0,y0,x1,y1;
++ if (!stbtt_GetGlyphBox(font, glyph, &x0,&y0,&x1,&y1))
++ x0=y0=x1=y1=0; // e.g. space character
++ // now move to integral bboxes (treating pixels as little squares, what pixels get touched)?
++ if (ix0) *ix0 = STBTT_ifloor(x0 * scale_x);
++ if (iy0) *iy0 = -STBTT_iceil (y1 * scale_y);
++ if (ix1) *ix1 = STBTT_iceil (x1 * scale_x);
++ if (iy1) *iy1 = -STBTT_ifloor(y0 * scale_y);
++}
++
++void stbtt_GetCodepointBitmapBox(const stbtt_fontinfo *font, int codepoint, float scale_x, float scale_y, int *ix0, int *iy0, int *ix1, int *iy1)
++{
++ stbtt_GetGlyphBitmapBox(font, stbtt_FindGlyphIndex(font,codepoint), scale_x, scale_y, ix0,iy0,ix1,iy1);
++}
++
++typedef struct stbtt__edge {
++ float x0,y0, x1,y1;
++ int invert;
++} stbtt__edge;
++
++typedef struct stbtt__active_edge
++{
++ int x,dx;
++ float ey;
++ struct stbtt__active_edge *next;
++ int valid;
++} stbtt__active_edge;
++
++#define FIXSHIFT 10
++#define FIX (1 << FIXSHIFT)
++#define FIXMASK (FIX-1)
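++
++// Worked example of the fixed-point layout above (a reading aid, not library
++// code): with FIXSHIFT == 10, FIX == 1024, so an edge crossing a scanline at
++// x = 3.25 is stored as 3.25 * 1024 == 3328; x >> FIXSHIFT recovers the pixel
++// index 3, and (x & FIXMASK) == 256 is the fractional 0.25 used for coverage.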
++
++static stbtt__active_edge *new_active(stbtt__edge *e, int off_x, float start_point, void *userdata)
++{
++ stbtt__active_edge *z = (stbtt__active_edge *) STBTT_malloc(sizeof(*z), userdata); // @TODO: make a pool of these!!!
++ float dxdy = (e->x1 - e->x0) / (e->y1 - e->y0);
++ STBTT_assert(e->y0 <= start_point);
++ if (!z) return z;
++ // round dx down to avoid going too far
++ if (dxdy < 0)
++ z->dx = -STBTT_ifloor(FIX * -dxdy);
++ else
++ z->dx = STBTT_ifloor(FIX * dxdy);
++ z->x = STBTT_ifloor(FIX * (e->x0 + dxdy * (start_point - e->y0)));
++ z->x -= off_x * FIX;
++ z->ey = e->y1;
++ z->next = 0;
++ z->valid = e->invert ? 1 : -1;
++ return z;
++}
++
++// note: this routine clips fills that extend off the edges... ideally this
++// wouldn't happen, but it could happen if the truetype glyph bounding boxes
++// are wrong, or if the user supplies a too-small bitmap
++static void stbtt__fill_active_edges(unsigned char *scanline, int len, stbtt__active_edge *e, int max_weight)
++{
++ // non-zero winding fill
++ int x0=0, w=0;
++
++ while (e) {
++ if (w == 0) {
++ // if we're currently at zero, we need to record the edge start point
++ x0 = e->x; w += e->valid;
++ } else {
++ int x1 = e->x; w += e->valid;
++ // if we went to zero, we need to draw
++ if (w == 0) {
++ int i = x0 >> FIXSHIFT;
++ int j = x1 >> FIXSHIFT;
++
++ if (i < len && j >= 0) {
++ if (i == j) {
++ // x0,x1 are the same pixel, so compute combined coverage
++ scanline[i] = scanline[i] + (stbtt_uint8) ((x1 - x0) * max_weight >> FIXSHIFT);
++ } else {
++ if (i >= 0) // add antialiasing for x0
++ scanline[i] = scanline[i] + (stbtt_uint8) (((FIX - (x0 & FIXMASK)) * max_weight) >> FIXSHIFT);
++ else
++ i = -1; // clip
++
++ if (j < len) // add antialiasing for x1
++ scanline[j] = scanline[j] + (stbtt_uint8) (((x1 & FIXMASK) * max_weight) >> FIXSHIFT);
++ else
++ j = len; // clip
++
++ for (++i; i < j; ++i) // fill pixels between x0 and x1
++ scanline[i] = scanline[i] + (stbtt_uint8) max_weight;
++ }
++ }
++ }
++ }
++
++ e = e->next;
++ }
++}
++
++static void stbtt__rasterize_sorted_edges(stbtt__bitmap *result, stbtt__edge *e, int n, int vsubsample, int off_x, int off_y, void *userdata)
++{
++ stbtt__active_edge *active = NULL;
++ int y,j=0;
++ int max_weight = (255 / vsubsample); // weight per vertical scanline
++ int s; // vertical subsample index
++ unsigned char scanline_data[512], *scanline;
++
++ if (result->w > 512)
++ scanline = (unsigned char *) STBTT_malloc(result->w, userdata);
++ else
++ scanline = scanline_data;
++
++ y = off_y * vsubsample;
++ e[n].y0 = (off_y + result->h) * (float) vsubsample + 1;
++
++ while (j < result->h) {
++ STBTT_memset(scanline, 0, result->w);
++ for (s=0; s < vsubsample; ++s) {
++ // find center of pixel for this scanline
++ float scan_y = y + 0.5f;
++ stbtt__active_edge **step = &active;
++
++ // update all active edges;
++ // remove all active edges that terminate before the center of this scanline
++ while (*step) {
++ stbtt__active_edge * z = *step;
++ if (z->ey <= scan_y) {
++ *step = z->next; // delete from list
++ STBTT_assert(z->valid);
++ z->valid = 0;
++ STBTT_free(z, userdata);
++ } else {
++ z->x += z->dx; // advance to position for current scanline
++ step = &((*step)->next); // advance through list
++ }
++ }
++
++ // resort the list if needed
++ for(;;) {
++ int changed=0;
++ step = &active;
++ while (*step && (*step)->next) {
++ if ((*step)->x > (*step)->next->x) {
++ stbtt__active_edge *t = *step;
++ stbtt__active_edge *q = t->next;
++
++ t->next = q->next;
++ q->next = t;
++ *step = q;
++ changed = 1;
++ }
++ step = &(*step)->next;
++ }
++ if (!changed) break;
++ }
++
++ // insert all edges that start before the center of this scanline -- omit ones that also end on this scanline
++ while (e->y0 <= scan_y) {
++ if (e->y1 > scan_y) {
++               stbtt__active_edge *z = new_active(e, off_x, scan_y, userdata);
++               if (z != NULL) { // new_active returns NULL when allocation fails
++                  // find insertion point
++                  if (active == NULL)
++                     active = z;
++                  else if (z->x < active->x) {
++                     // insert at front
++                     z->next = active;
++                     active = z;
++                  } else {
++                     // find thing to insert AFTER
++                     stbtt__active_edge *p = active;
++                     while (p->next && p->next->x < z->x)
++                        p = p->next;
++                     // at this point, p->next->x is NOT < z->x
++                     z->next = p->next;
++                     p->next = z;
++                  }
++               }
++ }
++ ++e;
++ }
++
++         // now process all active edges with the non-zero winding rule
++ if (active)
++ stbtt__fill_active_edges(scanline, result->w, active, max_weight);
++
++ ++y;
++ }
++ STBTT_memcpy(result->pixels + j * result->stride, scanline, result->w);
++ ++j;
++ }
++
++ while (active) {
++ stbtt__active_edge *z = active;
++ active = active->next;
++ STBTT_free(z, userdata);
++ }
++
++ if (scanline != scanline_data)
++ STBTT_free(scanline, userdata);
++}
++
++static int stbtt__edge_compare(const void *p, const void *q)
++{
++ stbtt__edge *a = (stbtt__edge *) p;
++ stbtt__edge *b = (stbtt__edge *) q;
++
++ if (a->y0 < b->y0) return -1;
++ if (a->y0 > b->y0) return 1;
++ return 0;
++}
++
++typedef struct
++{
++ float x,y;
++} stbtt__point;
++
++static void stbtt__rasterize(stbtt__bitmap *result, stbtt__point *pts, int *wcount, int windings, float scale_x, float scale_y, int off_x, int off_y, int invert, void *userdata)
++{
++ float y_scale_inv = invert ? -scale_y : scale_y;
++ stbtt__edge *e;
++ int n,i,j,k,m;
++ int vsubsample = result->h < 8 ? 15 : 5;
++ // vsubsample should divide 255 evenly; otherwise we won't reach full opacity
++
++ // now we have to blow out the windings into explicit edge lists
++ n = 0;
++ for (i=0; i < windings; ++i)
++ n += wcount[i];
++
++ e = (stbtt__edge *) STBTT_malloc(sizeof(*e) * (n+1), userdata); // add an extra one as a sentinel
++ if (e == 0) return;
++ n = 0;
++
++ m=0;
++ for (i=0; i < windings; ++i) {
++ stbtt__point *p = pts + m;
++ m += wcount[i];
++ j = wcount[i]-1;
++ for (k=0; k < wcount[i]; j=k++) {
++ int a=k,b=j;
++ // skip the edge if horizontal
++ if (p[j].y == p[k].y)
++ continue;
++ // add edge from j to k to the list
++ e[n].invert = 0;
++ if (invert ? p[j].y > p[k].y : p[j].y < p[k].y) {
++ e[n].invert = 1;
++ a=j,b=k;
++ }
++ e[n].x0 = p[a].x * scale_x;
++ e[n].y0 = p[a].y * y_scale_inv * vsubsample;
++ e[n].x1 = p[b].x * scale_x;
++ e[n].y1 = p[b].y * y_scale_inv * vsubsample;
++ ++n;
++ }
++ }
++
++ // now sort the edges by their highest point (should snap to integer, and then by x)
++ STBTT_sort(e, n, sizeof(e[0]), stbtt__edge_compare);
++
++   // now, traverse the scanlines and find the intersections on each scanline, using the non-zero winding rule
++ stbtt__rasterize_sorted_edges(result, e, n, vsubsample, off_x, off_y, userdata);
++
++ STBTT_free(e, userdata);
++}
++
++static void stbtt__add_point(stbtt__point *points, int n, float x, float y)
++{
++ if (!points) return; // during first pass, it's unallocated
++ points[n].x = x;
++ points[n].y = y;
++}
++
++// tessellate until the flatness threshold is satisfied... @TODO warp to compensate for non-linear stretching
++static int stbtt__tesselate_curve(stbtt__point *points, int *num_points, float x0, float y0, float x1, float y1, float x2, float y2, float objspace_flatness_squared, int n)
++{
++ // midpoint
++ float mx = (x0 + 2*x1 + x2)/4;
++ float my = (y0 + 2*y1 + y2)/4;
++ // versus directly drawn line
++ float dx = (x0+x2)/2 - mx;
++ float dy = (y0+y2)/2 - my;
++ if (n > 16) // 65536 segments on one curve better be enough!
++ return 1;
++ if (dx*dx+dy*dy > objspace_flatness_squared) { // half-pixel error allowed... need to be smaller if AA
++ stbtt__tesselate_curve(points, num_points, x0,y0, (x0+x1)/2.0f,(y0+y1)/2.0f, mx,my, objspace_flatness_squared,n+1);
++ stbtt__tesselate_curve(points, num_points, mx,my, (x1+x2)/2.0f,(y1+y2)/2.0f, x2,y2, objspace_flatness_squared,n+1);
++ } else {
++ stbtt__add_point(points, *num_points,x2,y2);
++ *num_points = *num_points+1;
++ }
++ return 1;
++}
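++
++// Derivation of the flatness test above (a reading aid, not library code):
++// a quadratic bezier B(t) = (1-t)^2*P0 + 2t(1-t)*P1 + t^2*P2 evaluated at
++// t = 1/2 gives (P0 + 2*P1 + P2)/4, which is (mx,my); (dx,dy) is its offset
++// from the chord midpoint (P0+P2)/2, and the curve is subdivided while that
++// squared offset exceeds objspace_flatness_squared.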
++
++// flattens the curves into line segments; returns the point array, writes the
++// per-contour point counts to *contour_lengths and the contour count to *num_contours
++stbtt__point *stbtt_FlattenCurves(stbtt_vertex *vertices, int num_verts, float objspace_flatness, int **contour_lengths, int *num_contours, void *userdata)
++{
++ stbtt__point *points=0;
++ int num_points=0;
++
++ float objspace_flatness_squared = objspace_flatness * objspace_flatness;
++ int i,n=0,start=0, pass;
++
++ // count how many "moves" there are to get the contour count
++ for (i=0; i < num_verts; ++i)
++ if (vertices[i].type == STBTT_vmove)
++ ++n;
++
++ *num_contours = n;
++ if (n == 0) return 0;
++
++ *contour_lengths = (int *) STBTT_malloc(sizeof(**contour_lengths) * n, userdata);
++
++ if (*contour_lengths == 0) {
++ *num_contours = 0;
++ return 0;
++ }
++
++ // make two passes through the points so we don't need to realloc
++ for (pass=0; pass < 2; ++pass) {
++ float x=0,y=0;
++ if (pass == 1) {
++ points = (stbtt__point *) STBTT_malloc(num_points * sizeof(points[0]), userdata);
++ if (points == NULL) goto error;
++ }
++ num_points = 0;
++ n= -1;
++ for (i=0; i < num_verts; ++i) {
++ switch (vertices[i].type) {
++ case STBTT_vmove:
++ // start the next contour
++ if (n >= 0)
++ (*contour_lengths)[n] = num_points - start;
++ ++n;
++ start = num_points;
++
++ x = vertices[i].x, y = vertices[i].y;
++ stbtt__add_point(points, num_points++, x,y);
++ break;
++ case STBTT_vline:
++ x = vertices[i].x, y = vertices[i].y;
++ stbtt__add_point(points, num_points++, x, y);
++ break;
++ case STBTT_vcurve:
++ stbtt__tesselate_curve(points, &num_points, x,y,
++ vertices[i].cx, vertices[i].cy,
++ vertices[i].x, vertices[i].y,
++ objspace_flatness_squared, 0);
++ x = vertices[i].x, y = vertices[i].y;
++ break;
++ }
++ }
++ (*contour_lengths)[n] = num_points - start;
++ }
++
++ return points;
++error:
++ STBTT_free(points, userdata);
++ STBTT_free(*contour_lengths, userdata);
++ *contour_lengths = 0;
++ *num_contours = 0;
++ return NULL;
++}
++
++void stbtt_Rasterize(stbtt__bitmap *result, float flatness_in_pixels, stbtt_vertex *vertices, int num_verts, float scale_x, float scale_y, int x_off, int y_off, int invert, void *userdata)
++{
++ float scale = scale_x > scale_y ? scale_y : scale_x;
++ int winding_count, *winding_lengths;
++ stbtt__point *windings = stbtt_FlattenCurves(vertices, num_verts, flatness_in_pixels / scale, &winding_lengths, &winding_count, userdata);
++ if (windings) {
++ stbtt__rasterize(result, windings, winding_lengths, winding_count, scale_x, scale_y, x_off, y_off, invert, userdata);
++ STBTT_free(winding_lengths, userdata);
++ STBTT_free(windings, userdata);
++ }
++}
++
++void stbtt_FreeBitmap(unsigned char *bitmap, void *userdata)
++{
++ STBTT_free(bitmap, userdata);
++}
++
++unsigned char *stbtt_GetGlyphBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int glyph, int *width, int *height, int *xoff, int *yoff)
++{
++ int ix0,iy0,ix1,iy1;
++ stbtt__bitmap gbm;
++ stbtt_vertex *vertices;
++ int num_verts = stbtt_GetGlyphShape(info, glyph, &vertices);
++
++ if (scale_x == 0) scale_x = scale_y;
++ if (scale_y == 0) {
++ if (scale_x == 0) return NULL;
++ scale_y = scale_x;
++ }
++
++ stbtt_GetGlyphBitmapBox(info, glyph, scale_x, scale_y, &ix0,&iy0,&ix1,&iy1);
++
++ // now we get the size
++ gbm.w = (ix1 - ix0);
++ gbm.h = (iy1 - iy0);
++ gbm.pixels = NULL; // in case we error
++
++ if (width ) *width = gbm.w;
++ if (height) *height = gbm.h;
++ if (xoff ) *xoff = ix0;
++ if (yoff ) *yoff = iy0;
++
++ if (gbm.w && gbm.h) {
++ gbm.pixels = (unsigned char *) STBTT_malloc(gbm.w * gbm.h, info->userdata);
++ if (gbm.pixels) {
++ gbm.stride = gbm.w;
++
++ stbtt_Rasterize(&gbm, 0.35f, vertices, num_verts, scale_x, scale_y, ix0, iy0, 1, info->userdata);
++ }
++ }
++ STBTT_free(vertices, info->userdata);
++ return gbm.pixels;
++}
++
++void stbtt_MakeGlyphBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int glyph)
++{
++ int ix0,iy0;
++ stbtt_vertex *vertices;
++ int num_verts = stbtt_GetGlyphShape(info, glyph, &vertices);
++ stbtt__bitmap gbm;
++
++ stbtt_GetGlyphBitmapBox(info, glyph, scale_x, scale_y, &ix0,&iy0,0,0);
++ gbm.pixels = output;
++ gbm.w = out_w;
++ gbm.h = out_h;
++ gbm.stride = out_stride;
++
++ if (gbm.w && gbm.h)
++ stbtt_Rasterize(&gbm, 0.35f, vertices, num_verts, scale_x, scale_y, ix0,iy0, 1, info->userdata);
++
++ STBTT_free(vertices, info->userdata);
++}
++
++unsigned char *stbtt_GetCodepointBitmap(const stbtt_fontinfo *info, float scale_x, float scale_y, int codepoint, int *width, int *height, int *xoff, int *yoff)
++{
++ return stbtt_GetGlyphBitmap(info, scale_x, scale_y, stbtt_FindGlyphIndex(info,codepoint), width,height,xoff,yoff);
++}
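++
++// Hedged usage sketch: render one character 20 pixels tall and dump it as
++// ASCII art. The shade table and stdout output are illustrative only; passing
++// scale_x == 0 means "use scale_y", as handled in stbtt_GetGlyphBitmap above.
++#if 0
++   int w, h, i, j;
++   unsigned char *bitmap = stbtt_GetCodepointBitmap(&font, 0,
++                               stbtt_ScaleForPixelHeight(&font, 20), 'a', &w, &h, 0, 0);
++   for (j = 0; j < h; ++j) {
++      for (i = 0; i < w; ++i)
++         putchar(" .:ioVM@"[bitmap[j*w + i] >> 5]);
++      putchar('\n');
++   }
++   stbtt_FreeBitmap(bitmap, NULL); // NULL userdata assumes the default allocator
++#endif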
++
++void stbtt_MakeCodepointBitmap(const stbtt_fontinfo *info, unsigned char *output, int out_w, int out_h, int out_stride, float scale_x, float scale_y, int codepoint)
++{
++ stbtt_MakeGlyphBitmap(info, output, out_w, out_h, out_stride, scale_x, scale_y, stbtt_FindGlyphIndex(info,codepoint));
++}
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// bitmap baking
++//
++// This packing is deliberately crude to keep the source code small
++
++extern int stbtt_BakeFontBitmap(const unsigned char *data, int offset, // font location (use offset=0 for plain .ttf)
++ float pixel_height, // height of font in pixels
++ unsigned char *pixels, int pw, int ph, // bitmap to be filled in
++ int first_char, int num_chars, // characters to bake
++ stbtt_bakedchar *chardata)
++{
++ float scale;
++ int x,y,bottom_y, i;
++ stbtt_fontinfo f;
++ stbtt_InitFont(&f, data, offset);
++ STBTT_memset(pixels, 0, pw*ph); // background of 0 around pixels
++ x=y=1;
++ bottom_y = 1;
++
++ scale = stbtt_ScaleForPixelHeight(&f, pixel_height);
++
++ for (i=0; i < num_chars; ++i) {
++ int advance, lsb, x0,y0,x1,y1,gw,gh;
++ int g = stbtt_FindGlyphIndex(&f, first_char + i);
++ stbtt_GetGlyphHMetrics(&f, g, &advance, &lsb);
++ stbtt_GetGlyphBitmapBox(&f, g, scale,scale, &x0,&y0,&x1,&y1);
++ gw = x1-x0;
++ gh = y1-y0;
++ if (x + gw + 1 >= pw)
++ y = bottom_y, x = 1; // advance to next row
++ if (y + gh + 1 >= ph) // check if it fits vertically AFTER potentially moving to next row
++ return -i;
++ STBTT_assert(x+gw < pw);
++ STBTT_assert(y+gh < ph);
++ stbtt_MakeGlyphBitmap(&f, pixels+x+y*pw, gw,gh,pw, scale,scale, g);
++ chardata[i].x0 = (stbtt_int16) x;
++ chardata[i].y0 = (stbtt_int16) y;
++ chardata[i].x1 = (stbtt_int16) (x + gw);
++ chardata[i].y1 = (stbtt_int16) (y + gh);
++ chardata[i].xadvance = scale * advance;
++ chardata[i].xoff = (float) x0;
++ chardata[i].yoff = (float) y0;
++ x = x + gw + 2;
++ if (y+gh+2 > bottom_y)
++ bottom_y = y+gh+2;
++ }
++ return bottom_y;
++}
++
++void stbtt_GetBakedQuad(stbtt_bakedchar *chardata, int pw, int ph, int char_index, float *xpos, float *ypos, stbtt_aligned_quad *q, int opengl_fillrule)
++{
++ float d3d_bias = opengl_fillrule ? 0 : -0.5f;
++ float ipw = 1.0f / pw, iph = 1.0f / ph;
++ stbtt_bakedchar *b = chardata + char_index;
++ int round_x = STBTT_ifloor((*xpos + b->xoff) + 0.5);
++ int round_y = STBTT_ifloor((*ypos + b->yoff) + 0.5);
++
++ q->x0 = round_x + d3d_bias;
++ q->y0 = round_y + d3d_bias;
++ q->x1 = round_x + b->x1 - b->x0 + d3d_bias;
++ q->y1 = round_y + b->y1 - b->y0 + d3d_bias;
++
++   q->s0 = b->x0 * ipw;
++   q->t0 = b->y0 * iph;
++   q->s1 = b->x1 * ipw;
++   q->t1 = b->y1 * iph;
++
++ *xpos += b->xadvance;
++}
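++
++// Hedged usage sketch: bake ASCII 32..126 at 32px into a 512x512 one-channel
++// atlas, then fetch one textured quad per character of a string. 'ttf_buffer'
++// is assumed to hold the font file; submitting the quads (OpenGL, D3D, a
++// software blitter...) is left to the caller.
++#if 0
++   static unsigned char temp_bitmap[512*512];
++   static stbtt_bakedchar cdata[96]; // ASCII 32..126
++   float x = 10.0f, y = 50.0f;
++   const char *text = "Hello";
++   stbtt_BakeFontBitmap(ttf_buffer, 0, 32.0f, temp_bitmap, 512, 512, 32, 96, cdata);
++   while (*text) {
++      if (*text >= 32 && *text < 128) {
++         stbtt_aligned_quad q;
++         stbtt_GetBakedQuad(cdata, 512, 512, *text - 32, &x, &y, &q, 1);
++         // q.x0..q.y1 are screen coordinates; q.s0..q.t1 are texture coordinates
++      }
++      ++text;
++   }
++#endif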
++
++//////////////////////////////////////////////////////////////////////////////
++//
++// font name matching -- recommended not to use this
++//
++
++// check if a utf8 string contains a prefix which is the utf16 string; if so return length of matching utf8 string
++static stbtt_int32 stbtt__CompareUTF8toUTF16_bigendian_prefix(stbtt_uint8 *s1, stbtt_int32 len1, stbtt_uint8 *s2, stbtt_int32 len2)
++{
++ stbtt_int32 i=0;
++
++ // convert utf16 to utf8 and compare the results while converting
++ while (len2) {
++ stbtt_uint16 ch = s2[0]*256 + s2[1];
++ if (ch < 0x80) {
++ if (i >= len1) return -1;
++ if (s1[i++] != ch) return -1;
++ } else if (ch < 0x800) {
++ if (i+1 >= len1) return -1;
++ if (s1[i++] != 0xc0 + (ch >> 6)) return -1;
++ if (s1[i++] != 0x80 + (ch & 0x3f)) return -1;
++ } else if (ch >= 0xd800 && ch < 0xdc00) {
++ stbtt_uint32 c;
++ stbtt_uint16 ch2 = s2[2]*256 + s2[3];
++ if (i+3 >= len1) return -1;
++ c = ((ch - 0xd800) << 10) + (ch2 - 0xdc00) + 0x10000;
++ if (s1[i++] != 0xf0 + (c >> 18)) return -1;
++ if (s1[i++] != 0x80 + ((c >> 12) & 0x3f)) return -1;
++ if (s1[i++] != 0x80 + ((c >> 6) & 0x3f)) return -1;
++ if (s1[i++] != 0x80 + ((c ) & 0x3f)) return -1;
++ s2 += 2; // plus another 2 below
++ len2 -= 2;
++ } else if (ch >= 0xdc00 && ch < 0xe000) {
++ return -1;
++ } else {
++ if (i+2 >= len1) return -1;
++ if (s1[i++] != 0xe0 + (ch >> 12)) return -1;
++ if (s1[i++] != 0x80 + ((ch >> 6) & 0x3f)) return -1;
++ if (s1[i++] != 0x80 + ((ch ) & 0x3f)) return -1;
++ }
++ s2 += 2;
++ len2 -= 2;
++ }
++ return i;
++}
++
++int stbtt_CompareUTF8toUTF16_bigendian(const char *s1, int len1, const char *s2, int len2)
++{
++ return len1 == stbtt__CompareUTF8toUTF16_bigendian_prefix((stbtt_uint8*) s1, len1, (stbtt_uint8*) s2, len2);
++}
++
++// returns results in whatever encoding you request... but note that 2-byte encodings
++// will be BIG-ENDIAN... use stbtt_CompareUTF8toUTF16_bigendian() to compare
++char *stbtt_GetFontNameString(const stbtt_fontinfo *font, int *length, int platformID, int encodingID, int languageID, int nameID)
++{
++ stbtt_int32 i,count,stringOffset;
++ stbtt_uint8 *fc = font->data;
++ stbtt_uint32 offset = font->fontstart;
++ stbtt_uint32 nm = stbtt__find_table(fc, offset, "name");
++ if (!nm) return NULL;
++
++ count = ttUSHORT(fc+nm+2);
++ stringOffset = nm + ttUSHORT(fc+nm+4);
++ for (i=0; i < count; ++i) {
++ stbtt_uint32 loc = nm + 6 + 12 * i;
++ if (platformID == ttUSHORT(fc+loc+0) && encodingID == ttUSHORT(fc+loc+2)
++ && languageID == ttUSHORT(fc+loc+4) && nameID == ttUSHORT(fc+loc+6)) {
++ *length = ttUSHORT(fc+loc+8);
++ return (char *) (fc+stringOffset+ttUSHORT(fc+loc+10));
++ }
++ }
++ return NULL;
++}
++
++static int stbtt__matchpair(stbtt_uint8 *fc, stbtt_uint32 nm, stbtt_uint8 *name, stbtt_int32 nlen, stbtt_int32 target_id, stbtt_int32 next_id)
++{
++ stbtt_int32 i;
++ stbtt_int32 count = ttUSHORT(fc+nm+2);
++ stbtt_int32 stringOffset = nm + ttUSHORT(fc+nm+4);
++
++ for (i=0; i < count; ++i) {
++ stbtt_uint32 loc = nm + 6 + 12 * i;
++ stbtt_int32 id = ttUSHORT(fc+loc+6);
++ if (id == target_id) {
++ // find the encoding
++ stbtt_int32 platform = ttUSHORT(fc+loc+0), encoding = ttUSHORT(fc+loc+2), language = ttUSHORT(fc+loc+4);
++
++ // is this a Unicode encoding?
++ if (platform == 0 || (platform == 3 && encoding == 1) || (platform == 3 && encoding == 10)) {
++ stbtt_int32 slen = ttUSHORT(fc+loc+8), off = ttUSHORT(fc+loc+10);
++
++ // check if there's a prefix match
++ stbtt_int32 matchlen = stbtt__CompareUTF8toUTF16_bigendian_prefix(name, nlen, fc+stringOffset+off,slen);
++ if (matchlen >= 0) {
++ // check for target_id+1 immediately following, with same encoding & language
++ if (i+1 < count && ttUSHORT(fc+loc+12+6) == next_id && ttUSHORT(fc+loc+12) == platform && ttUSHORT(fc+loc+12+2) == encoding && ttUSHORT(fc+loc+12+4) == language) {
++ stbtt_int32 slen = ttUSHORT(fc+loc+12+8), off = ttUSHORT(fc+loc+12+10);
++ if (slen == 0) {
++ if (matchlen == nlen)
++ return 1;
++ } else if (matchlen < nlen && name[matchlen] == ' ') {
++ ++matchlen;
++ if (stbtt_CompareUTF8toUTF16_bigendian((char*) (name+matchlen), nlen-matchlen, (char*)(fc+stringOffset+off),slen))
++ return 1;
++ }
++ } else {
++ // if nothing immediately following
++ if (matchlen == nlen)
++ return 1;
++ }
++ }
++ }
++
++ // @TODO handle other encodings
++ }
++ }
++ return 0;
++}
++
++static int stbtt__matches(stbtt_uint8 *fc, stbtt_uint32 offset, stbtt_uint8 *name, stbtt_int32 flags)
++{
++ stbtt_int32 nlen = STBTT_strlen((char *) name);
++ stbtt_uint32 nm,hd;
++ if (!stbtt__isfont(fc+offset)) return 0;
++
++ // check italics/bold/underline flags in macStyle...
++ if (flags) {
++ hd = stbtt__find_table(fc, offset, "head");
++ if ((ttUSHORT(fc+hd+44) & 7) != (flags & 7)) return 0;
++ }
++
++ nm = stbtt__find_table(fc, offset, "name");
++ if (!nm) return 0;
++
++ if (flags) {
++ // if we checked the macStyle flags, then just check the family and ignore the subfamily
++ if (stbtt__matchpair(fc, nm, name, nlen, 16, -1)) return 1;
++ if (stbtt__matchpair(fc, nm, name, nlen, 1, -1)) return 1;
++ if (stbtt__matchpair(fc, nm, name, nlen, 3, -1)) return 1;
++ } else {
++ if (stbtt__matchpair(fc, nm, name, nlen, 16, 17)) return 1;
++ if (stbtt__matchpair(fc, nm, name, nlen, 1, 2)) return 1;
++ if (stbtt__matchpair(fc, nm, name, nlen, 3, -1)) return 1;
++ }
++
++ return 0;
++}
++
++int stbtt_FindMatchingFont(const unsigned char *font_collection, const char *name_utf8, stbtt_int32 flags)
++{
++ stbtt_int32 i;
++ for (i=0;;++i) {
++ stbtt_int32 off = stbtt_GetFontOffsetForIndex(font_collection, i);
++ if (off < 0) return off;
++ if (stbtt__matches((stbtt_uint8 *) font_collection, off, (stbtt_uint8*) name_utf8, flags))
++ return off;
++ }
++}
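++
++// Hedged usage sketch: locate "Arial Bold" in a collection, first by full name
++// and then by family name plus the macStyle bold flag. 'font_collection' is
++// assumed to hold a whole .ttf/.ttc file; STBTT_MACSTYLE_DONTCARE and
++// STBTT_MACSTYLE_BOLD are the flags defined alongside the ones above.
++#if 0
++   int off = stbtt_FindMatchingFont(font_collection, "Arial Bold", STBTT_MACSTYLE_DONTCARE);
++   if (off < 0)
++      off = stbtt_FindMatchingFont(font_collection, "Arial", STBTT_MACSTYLE_BOLD);
++#endif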
++
++#endif // STB_TRUETYPE_IMPLEMENTATION