Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.

===================================================================
RCS file: /ftp/cvs/cvsroot/src/lib/libc/stdlib/jemalloc.c,v
rcsdiff: /ftp/cvs/cvsroot/src/lib/libc/stdlib/jemalloc.c,v: warning: Unknown phrases like `commitid ...;' are present.
retrieving revision 1.2
retrieving revision 1.35
diff -u -p -r1.2 -r1.35
--- src/lib/libc/stdlib/jemalloc.c	2007/10/05 23:42:23	1.2
+++ src/lib/libc/stdlib/jemalloc.c	2014/09/03 19:29:40	1.35
@@ -1,4 +1,4 @@
-/*	$NetBSD: jemalloc.c,v 1.2 2007/10/05 23:42:23 ad Exp $	*/
+/*	$NetBSD: jemalloc.c,v 1.35 2014/09/03 19:29:40 matt Exp $	*/
 
 /*-
  * Copyright (C) 2006,2007 Jason Evans <jasone@FreeBSD.org>.
@@ -118,7 +118,7 @@
 
 #include <sys/cdefs.h>
 /* __FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.147 2007/06/15 22:00:16 jasone Exp $"); */
-__RCSID("$NetBSD: jemalloc.c,v 1.2 2007/10/05 23:42:23 ad Exp $");
+__RCSID("$NetBSD: jemalloc.c,v 1.35 2014/09/03 19:29:40 matt Exp $");
 
 #ifdef __FreeBSD__
 #include "libc_private.h"
@@ -161,13 +161,33 @@ __RCSID("$NetBSD: jemalloc.c,v 1.2 2007/
 
 #ifdef __NetBSD__
 # include <reentrant.h>
-void	_malloc_prefork(void);
-void	_malloc_postfork(void);
-ssize_t	_write(int, const void *, size_t);
-const char	*_getprogname(void);
+# include "extern.h"
+
+#define STRERROR_R(a, b, c)	__strerror_r(a, b, c);
+/*
+ * A non localized version of strerror, that avoids bringing in
+ * stdio and the locale code. All the malloc messages are in English
+ * so why bother?
+ */
+static int
+__strerror_r(int e, char *s, size_t l)
+{
+	int rval;
+	size_t slen;
+
+	if (e >= 0 && e < sys_nerr) {
+		slen = strlcpy(s, sys_errlist[e], l);
+		rval = 0;
+	} else {
+		slen = snprintf_ss(s, l, "Unknown error %u", e);
+		rval = EINVAL;
+	}
+	return slen >= l ? ERANGE : rval;
+}
 #endif
 
 #ifdef __FreeBSD__
+#define STRERROR_R(a, b, c)	strerror_r(a, b, c);
 #include "un-namespace.h"
 #endif
 
@@ -196,6 +216,14 @@ const char	*_getprogname(void);
 #define	STRERROR_BUF		64
 
 /* Minimum alignment of allocations is 2^QUANTUM_2POW_MIN bytes. */
+
+/*
+ * If you touch the TINY_MIN_2POW definition for any architecture, please
+ * make sure to adjust the corresponding definition for JEMALLOC_TINY_MIN_2POW
+ * in the gcc 4.8 tree in dist/gcc/tree-ssa-ccp.c and verify that a native
+ * gcc is still buildable!
+ */ + #ifdef __i386__ # define QUANTUM_2POW_MIN 4 # define SIZEOF_PTR_2POW 2 @@ -205,32 +233,49 @@ const char *_getprogname(void); # define QUANTUM_2POW_MIN 4 # define SIZEOF_PTR_2POW 3 #endif +#ifdef __aarch64__ +# define QUANTUM_2POW_MIN 4 +# define SIZEOF_PTR_2POW 3 +# define NO_TLS +#endif #ifdef __alpha__ # define QUANTUM_2POW_MIN 4 # define SIZEOF_PTR_2POW 3 +# define TINY_MIN_2POW 3 # define NO_TLS #endif #ifdef __sparc64__ # define QUANTUM_2POW_MIN 4 # define SIZEOF_PTR_2POW 3 +# define TINY_MIN_2POW 3 # define NO_TLS #endif #ifdef __amd64__ # define QUANTUM_2POW_MIN 4 # define SIZEOF_PTR_2POW 3 +# define TINY_MIN_2POW 3 #endif #ifdef __arm__ # define QUANTUM_2POW_MIN 3 # define SIZEOF_PTR_2POW 2 # define USE_BRK +# ifdef __ARM_EABI__ +# define TINY_MIN_2POW 3 +# endif # define NO_TLS #endif #ifdef __powerpc__ # define QUANTUM_2POW_MIN 4 # define SIZEOF_PTR_2POW 2 # define USE_BRK +# define TINY_MIN_2POW 3 #endif -#ifdef __sparc__ +#if defined(__sparc__) && !defined(__sparc64__) +# define QUANTUM_2POW_MIN 4 +# define SIZEOF_PTR_2POW 2 +# define USE_BRK +#endif +#ifdef __or1k__ # define QUANTUM_2POW_MIN 4 # define SIZEOF_PTR_2POW 2 # define USE_BRK @@ -255,6 +300,11 @@ const char *_getprogname(void); # define SIZEOF_PTR_2POW 2 # define USE_BRK #endif +#ifdef __hppa__ +# define QUANTUM_2POW_MIN 4 +# define SIZEOF_PTR_2POW 2 +# define USE_BRK +#endif #define SIZEOF_PTR (1 << SIZEOF_PTR_2POW) @@ -263,11 +313,6 @@ const char *_getprogname(void); # define SIZEOF_INT_2POW 2 #endif -/* We can't use TLS in non-PIC programs, since TLS relies on loader magic. */ -#if (!defined(PIC) && !defined(NO_TLS)) -# define NO_TLS -#endif - /* * Size and alignment of memory chunks that are allocated by the OS's virtual * memory system. @@ -283,7 +328,9 @@ const char *_getprogname(void); #define CACHELINE ((size_t)(1 << CACHELINE_2POW)) /* Smallest size class to support. */ -#define TINY_MIN_2POW 1 +#ifndef TINY_MIN_2POW +#define TINY_MIN_2POW 2 +#endif /* * Maximum size class that is a multiple of the quantum, but not (necessarily) @@ -294,20 +341,25 @@ const char *_getprogname(void); #define SMALL_MAX_DEFAULT (1 << SMALL_MAX_2POW_DEFAULT) /* - * Maximum desired run header overhead. Runs are sized as small as possible - * such that this setting is still honored, without violating other constraints. - * The goal is to make runs as small as possible without exceeding a per run - * external fragmentation threshold. + * RUN_MAX_OVRHD indicates maximum desired run header overhead. Runs are sized + * as small as possible such that this setting is still honored, without + * violating other constraints. The goal is to make runs as small as possible + * without exceeding a per run external fragmentation threshold. + * + * We use binary fixed point math for overhead computations, where the binary + * point is implicitly RUN_BFP bits to the left. * - * Note that it is possible to set this low enough that it cannot be honored - * for some/all object sizes, since there is one bit of header overhead per - * object (plus a constant). In such cases, this constraint is relaxed. + * Note that it is possible to set RUN_MAX_OVRHD low enough that it cannot be + * honored for some/all object sizes, since there is one bit of header overhead + * per object (plus a constant). This constraint is relaxed (ignored) for runs + * that are so small that the per-region overhead is greater than: * - * RUN_MAX_OVRHD_RELAX specifies the maximum number of bits per region of - * overhead for which RUN_MAX_OVRHD is relaxed. 
+ * (RUN_MAX_OVRHD / (reg_size << (3+RUN_BFP)) */ -#define RUN_MAX_OVRHD 0.015 -#define RUN_MAX_OVRHD_RELAX 1.5 +#define RUN_BFP 12 +/* \/ Implicit binary fixed point. */ +#define RUN_MAX_OVRHD 0x0000003dU +#define RUN_MAX_OVRHD_RELAX 0x00001800U /* Put a cap on small object run size. This overrides RUN_MAX_OVRHD. */ #define RUN_MAX_SMALL_2POW 15 @@ -620,7 +672,7 @@ static unsigned ncpus; /* VM page size. */ static size_t pagesize; static size_t pagesize_mask; -static size_t pagesize_2pow; +static int pagesize_2pow; /* Various bin-related settings. */ static size_t bin_maxclass; /* Max size class for bins. */ @@ -637,6 +689,7 @@ static size_t quantum_mask; /* (quantum /* Various chunk-related settings. */ static size_t chunksize; static size_t chunksize_mask; /* (chunksize - 1). */ +static int chunksize_2pow; static unsigned chunk_npages; static unsigned arena_chunk_header_npages; static size_t arena_maxclass; /* Max size class for arenas. */ @@ -674,6 +727,7 @@ static void *brk_max; /* Huge allocation statistics. */ static uint64_t huge_nmalloc; static uint64_t huge_ndalloc; +static uint64_t huge_nralloc; static size_t huge_allocated; #endif @@ -750,9 +804,9 @@ static bool opt_junk = false; #endif static bool opt_hint = false; static bool opt_print_stats = false; -static size_t opt_quantum_2pow = QUANTUM_2POW_MIN; -static size_t opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT; -static size_t opt_chunk_2pow = CHUNK_2POW_DEFAULT; +static int opt_quantum_2pow = QUANTUM_2POW_MIN; +static int opt_small_max_2pow = SMALL_MAX_2POW_DEFAULT; +static int opt_chunk_2pow = CHUNK_2POW_DEFAULT; static bool opt_utrace = false; static bool opt_sysv = false; static bool opt_xmalloc = false; @@ -784,7 +838,7 @@ static void wrtmessage(const char *p1, c #ifdef MALLOC_STATS static void malloc_printf(const char *format, ...); #endif -static char *umax2s(uintmax_t x, char *s); +static char *size_t2s(size_t x, char *s); static bool base_pages_alloc(size_t minsize); static void *base_alloc(size_t size); static chunk_node_t *base_chunk_node_alloc(void); @@ -793,10 +847,10 @@ static void base_chunk_node_dealloc(chun static void stats_print(arena_t *arena); #endif static void *pages_map(void *addr, size_t size); +static void *pages_map_align(void *addr, size_t size, int align); static void pages_unmap(void *addr, size_t size); static void *chunk_alloc(size_t size); static void chunk_dealloc(void *chunk, size_t size); -static arena_t *choose_arena_hard(void); static void arena_run_split(arena_t *arena, arena_run_t *run, size_t size); static arena_chunk_t *arena_chunk_alloc(arena_t *arena); static void arena_chunk_dealloc(arena_t *arena, arena_chunk_t *chunk); @@ -918,10 +972,10 @@ static void wrtmessage(const char *p1, const char *p2, const char *p3, const char *p4) { - _write(STDERR_FILENO, p1, strlen(p1)); - _write(STDERR_FILENO, p2, strlen(p2)); - _write(STDERR_FILENO, p3, strlen(p3)); - _write(STDERR_FILENO, p4, strlen(p4)); + write(STDERR_FILENO, p1, strlen(p1)); + write(STDERR_FILENO, p2, strlen(p2)); + write(STDERR_FILENO, p3, strlen(p3)); + write(STDERR_FILENO, p4, strlen(p4)); } void (*_malloc_message)(const char *p1, const char *p2, const char *p3, @@ -946,18 +1000,19 @@ malloc_printf(const char *format, ...) /* * We don't want to depend on vsnprintf() for production builds, since that can - * cause unnecessary bloat for static binaries. umax2s() provides minimal + * cause unnecessary bloat for static binaries. 
size_t2s() provides minimal * integer printing functionality, so that malloc_printf() use can be limited to * MALLOC_STATS code. */ #define UMAX2S_BUFSIZE 21 static char * -umax2s(uintmax_t x, char *s) +size_t2s(size_t x, char *s) { unsigned i; /* Make sure UMAX2S_BUFSIZE is large enough. */ - assert(sizeof(uintmax_t) <= 8); + /* LINTED */ + assert(sizeof(size_t) <= 8); i = UMAX2S_BUFSIZE - 1; s[i] = '\0'; @@ -1001,7 +1056,8 @@ base_pages_alloc(size_t minsize) */ incr = (intptr_t)chunksize - (intptr_t)CHUNK_ADDR2OFFSET(brk_cur); - if (incr < minsize) + assert(incr >= 0); + if ((size_t)incr < minsize) incr += csize; brk_prev = sbrk(incr); @@ -1176,6 +1232,7 @@ stats_print(arena_t *arena) * Begin chunk management functions. */ +#ifndef lint static inline int chunk_comp(chunk_node_t *a, chunk_node_t *b) { @@ -1192,12 +1249,11 @@ chunk_comp(chunk_node_t *a, chunk_node_t } /* Generate red-black tree code for chunks. */ -#ifndef lint RB_GENERATE_STATIC(chunk_tree_s, chunk_node_s, link, chunk_comp); #endif static void * -pages_map(void *addr, size_t size) +pages_map_align(void *addr, size_t size, int align) { void *ret; @@ -1205,8 +1261,8 @@ pages_map(void *addr, size_t size) * We don't use MAP_FIXED here, because it can cause the *replacement* * of existing mappings, and we only want to create new mappings. */ - ret = mmap(addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, - -1, 0); + ret = mmap(addr, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON | MAP_ALIGNED(align), -1, 0); assert(ret != NULL); if (ret == MAP_FAILED) @@ -1218,8 +1274,8 @@ pages_map(void *addr, size_t size) if (munmap(ret, size) == -1) { char buf[STRERROR_BUF]; - strerror_r(errno, buf, sizeof(buf)); - _malloc_message(_getprogname(), + STRERROR_R(errno, buf, sizeof(buf)); + _malloc_message(getprogname(), ": (malloc) Error in munmap(): ", buf, "\n"); if (opt_abort) abort(); @@ -1232,6 +1288,13 @@ pages_map(void *addr, size_t size) return (ret); } +static void * +pages_map(void *addr, size_t size) +{ + + return pages_map_align(addr, size, 0); +} + static void pages_unmap(void *addr, size_t size) { @@ -1239,8 +1302,8 @@ pages_unmap(void *addr, size_t size) if (munmap(addr, size) == -1) { char buf[STRERROR_BUF]; - strerror_r(errno, buf, sizeof(buf)); - _malloc_message(_getprogname(), + STRERROR_R(errno, buf, sizeof(buf)); + _malloc_message(getprogname(), ": (malloc) Error in munmap(): ", buf, "\n"); if (opt_abort) abort(); @@ -1299,27 +1362,8 @@ chunk_alloc(size_t size) * anywhere. Beware of size_t wrap-around. */ if (size + chunksize > size) { - if ((ret = pages_map(NULL, size + chunksize)) != NULL) { - size_t offset = CHUNK_ADDR2OFFSET(ret); - - /* - * Success. Clean up unneeded leading/trailing space. - */ - if (offset != 0) { - /* Leading space. */ - pages_unmap(ret, chunksize - offset); - - ret = (void *)((uintptr_t)ret + (chunksize - - offset)); - - /* Trailing space. */ - pages_unmap((void *)((uintptr_t)ret + size), - offset); - } else { - /* Trailing space only. 
*/ - pages_unmap((void *)((uintptr_t)ret + size), - chunksize); - } + if ((ret = pages_map_align(NULL, size, chunksize_2pow)) + != NULL) { goto RETURN; } } @@ -1348,7 +1392,7 @@ chunk_alloc(size_t size) */ incr = (intptr_t)size - (intptr_t)CHUNK_ADDR2OFFSET(brk_cur); - if (incr == size) { + if (incr == (intptr_t)size) { ret = brk_cur; } else { ret = (void *)((intptr_t)brk_cur + incr); @@ -1506,68 +1550,57 @@ chunk_dealloc(void *chunk, size_t size) */ /* - * Choose an arena based on a per-thread value (fast-path code, calls slow-path - * code if necessary). + * Choose an arena based on a per-thread and (optimistically) per-CPU value. + * + * We maintain at least one block of arenas. Usually there are more. + * The blocks are $ncpu arenas in size. Whole blocks are 'hashed' + * amongst threads. To accomplish this, next_arena advances only in + * ncpu steps. */ -static inline arena_t * -choose_arena(void) +static __noinline arena_t * +choose_arena_hard(void) { - arena_t *ret; + unsigned i, curcpu; + arena_t **map; - /* - * We can only use TLS if this is a PIC library, since for the static - * library version, libc's malloc is used by TLS allocation, which - * introduces a bootstrapping issue. - */ - if (__isthreaded == false) { - /* - * Avoid the overhead of TLS for single-threaded operation. If the - * app switches to threaded mode, the initial thread may end up - * being assigned to some other arena, but this one-time switch - * shouldn't cause significant issues. - */ - return (arenas[0]); + /* Initialize the current block of arenas and advance to next. */ + malloc_mutex_lock(&arenas_mtx); + assert(next_arena % ncpus == 0); + assert(narenas % ncpus == 0); + map = &arenas[next_arena]; + set_arenas_map(map); + for (i = 0; i < ncpus; i++) { + if (arenas[next_arena] == NULL) + arenas_extend(next_arena); + next_arena = (next_arena + 1) % narenas; } + malloc_mutex_unlock(&arenas_mtx); - ret = get_arenas_map(); - if (ret == NULL) - ret = choose_arena_hard(); - - assert(ret != NULL); - return (ret); + /* + * If we were unable to allocate an arena above, then default to + * the first arena, which is always present. + */ + curcpu = thr_curcpu(); + if (map[curcpu] != NULL) + return map[curcpu]; + return arenas[0]; } -/* - * Choose an arena based on a per-thread value (slow-path code only, called - * only by choose_arena()). - */ -static arena_t * -choose_arena_hard(void) +static inline arena_t * +choose_arena(void) { - arena_t *ret; + unsigned curcpu; + arena_t **map; - assert(__isthreaded); + map = get_arenas_map(); + curcpu = thr_curcpu(); + if (__predict_true(map != NULL && map[curcpu] != NULL)) + return map[curcpu]; - /* Assign one of the arenas to this thread, in a round-robin fashion. */ - malloc_mutex_lock(&arenas_mtx); - ret = arenas[next_arena]; - if (ret == NULL) - ret = arenas_extend(next_arena); - if (ret == NULL) { - /* - * Make sure that this function never returns NULL, so that - * choose_arena() doesn't have to check for a NULL return - * value. - */ - ret = arenas[0]; - } - next_arena = (next_arena + 1) % narenas; - malloc_mutex_unlock(&arenas_mtx); - set_arenas_map(ret); - - return (ret); + return choose_arena_hard(); } +#ifndef lint static inline int arena_chunk_comp(arena_chunk_t *a, arena_chunk_t *b) { @@ -1584,10 +1617,10 @@ arena_chunk_comp(arena_chunk_t *a, arena } /* Generate red-black tree code for arena chunks. 
*/ -#ifndef lint RB_GENERATE_STATIC(arena_chunk_tree_s, arena_chunk_s, link, arena_chunk_comp); #endif +#ifndef lint static inline int arena_run_comp(arena_run_t *a, arena_run_t *b) { @@ -1604,7 +1637,6 @@ arena_run_comp(arena_run_t *a, arena_run } /* Generate red-black tree code for arena runs. */ -#ifndef lint RB_GENERATE_STATIC(arena_run_tree_s, arena_run_s, link, arena_run_comp); #endif @@ -1663,6 +1695,7 @@ arena_run_reg_alloc(arena_run_t *run, ar } } /* Not reached. */ + /* LINTED */ assert(0); return (NULL); } @@ -1705,6 +1738,7 @@ arena_run_reg_dalloc(arena_run_t *run, a }; unsigned diff, regind, elm, bit; + /* LINTED */ assert(run->magic == ARENA_RUN_MAGIC); assert(((sizeof(size_invs)) / sizeof(unsigned)) + 3 >= (SMALL_MAX_DEFAULT >> QUANTUM_2POW_MIN)); @@ -1741,7 +1775,7 @@ arena_run_reg_dalloc(arena_run_t *run, a * The page size is too large for us to use the lookup * table. Use real division. */ - regind = diff / size; + regind = (unsigned)(diff / size); } } else if (size <= ((sizeof(size_invs) / sizeof(unsigned)) << QUANTUM_2POW_MIN) + 2) { @@ -1754,7 +1788,7 @@ arena_run_reg_dalloc(arena_run_t *run, a * if the user increases small_max via the 'S' runtime * configuration option. */ - regind = diff / size; + regind = (unsigned)(diff / size); }; assert(diff == regind * size); assert(regind < bin->nregs); @@ -1780,7 +1814,7 @@ arena_run_split(arena_t *arena, arena_ru run_ind = (unsigned)(((uintptr_t)run - (uintptr_t)chunk) >> pagesize_2pow); total_pages = chunk->map[run_ind].npages; - need_pages = (size >> pagesize_2pow); + need_pages = (unsigned)(size >> pagesize_2pow); assert(need_pages <= total_pages); rem_pages = total_pages - need_pages; @@ -1895,7 +1929,7 @@ arena_run_alloc(arena_t *arena, size_t s * Search through arena's chunks in address order for a free run that is * large enough. Look for the first fit. */ - need_npages = (size >> pagesize_2pow); + need_npages = (unsigned)(size >> pagesize_2pow); limit_pages = chunk_npages - arena_chunk_header_npages; compl_need_npages = limit_pages - need_npages; /* LINTED */ @@ -1973,7 +2007,7 @@ arena_run_dalloc(arena_t *arena, arena_r >> pagesize_2pow); assert(run_ind >= arena_chunk_header_npages); assert(run_ind < (chunksize >> pagesize_2pow)); - run_pages = (size >> pagesize_2pow); + run_pages = (unsigned)(size >> pagesize_2pow); assert(run_pages == chunk->map[run_ind].npages); /* Subtract pages from count of pages used in chunk. */ @@ -2136,7 +2170,6 @@ arena_bin_run_size_calc(arena_bin_t *bin size_t try_run_size, good_run_size; unsigned good_nregs, good_mask_nelms, good_reg0_offset; unsigned try_nregs, try_mask_nelms, try_reg0_offset; - float max_ovrhd = RUN_MAX_OVRHD; assert(min_run_size >= pagesize); assert(min_run_size <= arena_maxclass); @@ -2153,13 +2186,14 @@ arena_bin_run_size_calc(arena_bin_t *bin * header's mask length and the number of regions. */ try_run_size = min_run_size; - try_nregs = ((try_run_size - sizeof(arena_run_t)) / bin->reg_size) - + 1; /* Counter-act the first line of the loop. */ + try_nregs = (unsigned)(((try_run_size - sizeof(arena_run_t)) / + bin->reg_size) + 1); /* Counter-act try_nregs-- in loop. */ do { try_nregs--; try_mask_nelms = (try_nregs >> (SIZEOF_INT_2POW + 3)) + ((try_nregs & ((1 << (SIZEOF_INT_2POW + 3)) - 1)) ? 
1 : 0); - try_reg0_offset = try_run_size - (try_nregs * bin->reg_size); + try_reg0_offset = (unsigned)(try_run_size - + (try_nregs * bin->reg_size)); } while (sizeof(arena_run_t) + (sizeof(unsigned) * (try_mask_nelms - 1)) > try_reg0_offset); @@ -2175,21 +2209,20 @@ arena_bin_run_size_calc(arena_bin_t *bin /* Try more aggressive settings. */ try_run_size += pagesize; - try_nregs = ((try_run_size - sizeof(arena_run_t)) / - bin->reg_size) + 1; /* Counter-act try_nregs-- in loop. */ + try_nregs = (unsigned)(((try_run_size - sizeof(arena_run_t)) / + bin->reg_size) + 1); /* Counter-act try_nregs-- in loop. */ do { try_nregs--; try_mask_nelms = (try_nregs >> (SIZEOF_INT_2POW + 3)) + ((try_nregs & ((1 << (SIZEOF_INT_2POW + 3)) - 1)) ? 1 : 0); - try_reg0_offset = try_run_size - (try_nregs * - bin->reg_size); + try_reg0_offset = (unsigned)(try_run_size - (try_nregs * + bin->reg_size)); } while (sizeof(arena_run_t) + (sizeof(unsigned) * (try_mask_nelms - 1)) > try_reg0_offset); } while (try_run_size <= arena_maxclass && try_run_size <= RUN_MAX_SMALL - && max_ovrhd > RUN_MAX_OVRHD_RELAX / ((float)(bin->reg_size << 3)) - && ((float)(try_reg0_offset)) / ((float)(try_run_size)) > - max_ovrhd); + && RUN_MAX_OVRHD * (bin->reg_size << 3) > RUN_MAX_OVRHD_RELAX + && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size); assert(sizeof(arena_run_t) + (sizeof(unsigned) * (good_mask_nelms - 1)) <= good_reg0_offset); @@ -2319,7 +2352,7 @@ arena_palloc(arena_t *arena, size_t alig assert((size & pagesize_mask) == 0); assert((alignment & pagesize_mask) == 0); - npages = size >> pagesize_2pow; + npages = (unsigned)(size >> pagesize_2pow); malloc_mutex_lock(&arena->mtx); ret = (void *)arena_run_alloc(arena, alloc_size); @@ -2334,7 +2367,7 @@ arena_palloc(arena_t *arena, size_t alig assert((offset & pagesize_mask) == 0); assert(offset < alloc_size); if (offset == 0) { - pageind = (((uintptr_t)ret - (uintptr_t)chunk) >> + pageind = (unsigned)(((uintptr_t)ret - (uintptr_t)chunk) >> pagesize_2pow); /* Update the map for the run to be kept. */ @@ -2345,13 +2378,13 @@ arena_palloc(arena_t *arena, size_t alig /* Trim trailing space. */ arena_palloc_trim(arena, chunk, pageind + npages, - (alloc_size - size) >> pagesize_2pow); + (unsigned)((alloc_size - size) >> pagesize_2pow)); } else { size_t leadsize, trailsize; leadsize = alignment - offset; ret = (void *)((uintptr_t)ret + leadsize); - pageind = (((uintptr_t)ret - (uintptr_t)chunk) >> + pageind = (unsigned)(((uintptr_t)ret - (uintptr_t)chunk) >> pagesize_2pow); /* Update the map for the run to be kept. */ @@ -2361,15 +2394,16 @@ arena_palloc(arena_t *arena, size_t alig } /* Trim leading space. */ - arena_palloc_trim(arena, chunk, pageind - (leadsize >> - pagesize_2pow), leadsize >> pagesize_2pow); + arena_palloc_trim(arena, chunk, + (unsigned)(pageind - (leadsize >> pagesize_2pow)), + (unsigned)(leadsize >> pagesize_2pow)); trailsize = alloc_size - leadsize - size; if (trailsize != 0) { /* Trim trailing space. */ assert(trailsize < alloc_size); arena_palloc_trim(arena, chunk, pageind + npages, - trailsize >> pagesize_2pow); + (unsigned)(trailsize >> pagesize_2pow)); } } @@ -2403,7 +2437,8 @@ arena_salloc(const void *ptr) * affects this function, so we don't need to lock. 
*/ chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> pagesize_2pow); + pageind = (unsigned)(((uintptr_t)ptr - (uintptr_t)chunk) >> + pagesize_2pow); mapelm = &chunk->map[pageind]; if (mapelm->pos != 0 || ptr != (char *)((uintptr_t)chunk) + (pageind << pagesize_2pow)) { @@ -2483,7 +2518,8 @@ arena_dalloc(arena_t *arena, arena_chunk assert(ptr != NULL); assert(CHUNK_ADDR2BASE(ptr) != ptr); - pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> pagesize_2pow); + pageind = (unsigned)(((uintptr_t)ptr - (uintptr_t)chunk) >> + pagesize_2pow); mapelm = &chunk->map[pageind]; if (mapelm->pos != 0 || ptr != (char *)((uintptr_t)chunk) + (pageind << pagesize_2pow)) { @@ -2667,7 +2703,7 @@ arenas_extend(unsigned ind) * by using arenas[0]. In practice, this is an extremely unlikely * failure. */ - _malloc_message(_getprogname(), + _malloc_message(getprogname(), ": (malloc) Error initializing arena\n", "", ""); if (opt_abort) abort(); @@ -2827,6 +2863,84 @@ huge_ralloc(void *ptr, size_t size, size return (ptr); } + if (CHUNK_ADDR2BASE(ptr) == ptr +#ifdef USE_BRK + && ((uintptr_t)ptr < (uintptr_t)brk_base + || (uintptr_t)ptr >= (uintptr_t)brk_max) +#endif + ) { + chunk_node_t *node, key; + void *newptr; + size_t oldcsize; + size_t newcsize; + + newcsize = CHUNK_CEILING(size); + oldcsize = CHUNK_CEILING(oldsize); + assert(oldcsize != newcsize); + if (newcsize == 0) { + /* size_t wrap-around */ + return (NULL); + } + + /* + * Remove the old region from the tree now. If mremap() + * returns the region to the system, other thread may + * map it for same huge allocation and insert it to the + * tree before we acquire the mutex lock again. + */ + malloc_mutex_lock(&chunks_mtx); + key.chunk = __DECONST(void *, ptr); + /* LINTED */ + node = RB_FIND(chunk_tree_s, &huge, &key); + assert(node != NULL); + assert(node->chunk == ptr); + assert(node->size == oldcsize); + RB_REMOVE(chunk_tree_s, &huge, node); + malloc_mutex_unlock(&chunks_mtx); + + newptr = mremap(ptr, oldcsize, NULL, newcsize, + MAP_ALIGNED(chunksize_2pow)); + if (newptr == MAP_FAILED) { + /* We still own the old region. */ + malloc_mutex_lock(&chunks_mtx); + RB_INSERT(chunk_tree_s, &huge, node); + malloc_mutex_unlock(&chunks_mtx); + } else { + assert(CHUNK_ADDR2BASE(newptr) == newptr); + + /* Insert new or resized old region. */ + malloc_mutex_lock(&chunks_mtx); + node->size = newcsize; + node->chunk = newptr; + RB_INSERT(chunk_tree_s, &huge, node); +#ifdef MALLOC_STATS + huge_nralloc++; + huge_allocated += newcsize - oldcsize; + if (newcsize > oldcsize) { + stats_chunks.curchunks += + (newcsize - oldcsize) / chunksize; + if (stats_chunks.curchunks > + stats_chunks.highchunks) + stats_chunks.highchunks = + stats_chunks.curchunks; + } else { + stats_chunks.curchunks -= + (oldcsize - newcsize) / chunksize; + } +#endif + malloc_mutex_unlock(&chunks_mtx); + + if (opt_junk && size < oldsize) { + memset((void *)((uintptr_t)newptr + size), 0x5a, + newcsize - size); + } else if (opt_zero && size > oldsize) { + memset((void *)((uintptr_t)newptr + oldsize), 0, + size - oldsize); + } + return (newptr); + } + } + /* * If we get here, then size and oldsize are different enough that we * need to use a different size class. In that case, fall back to @@ -3126,16 +3240,17 @@ malloc_print_stats(void) opt_xmalloc ? "X" : "x", opt_zero ? 
"Z\n" : "z\n"); - _malloc_message("CPUs: ", umax2s(ncpus, s), "\n", ""); - _malloc_message("Max arenas: ", umax2s(narenas, s), "\n", ""); - _malloc_message("Pointer size: ", umax2s(sizeof(void *), s), + _malloc_message("CPUs: ", size_t2s(ncpus, s), "\n", ""); + _malloc_message("Max arenas: ", size_t2s(narenas, s), "\n", ""); + _malloc_message("Pointer size: ", size_t2s(sizeof(void *), s), "\n", ""); - _malloc_message("Quantum size: ", umax2s(quantum, s), "\n", ""); - _malloc_message("Max small size: ", umax2s(small_max, s), "\n", + _malloc_message("Quantum size: ", size_t2s(quantum, s), "\n", ""); + _malloc_message("Max small size: ", size_t2s(small_max, s), "\n", ""); - _malloc_message("Chunk size: ", umax2s(chunksize, s), "", ""); - _malloc_message(" (2^", umax2s(opt_chunk_2pow, s), ")\n", ""); + _malloc_message("Chunk size: ", size_t2s(chunksize, s), "", ""); + _malloc_message(" (2^", size_t2s((size_t)opt_chunk_2pow, s), + ")\n", ""); #ifdef MALLOC_STATS { @@ -3188,10 +3303,11 @@ malloc_print_stats(void) /* Print chunk stats. */ malloc_printf( - "huge: nmalloc ndalloc allocated\n"); - malloc_printf(" %12llu %12llu %12zu\n", - huge_nmalloc, huge_ndalloc, huge_allocated - * chunksize); + "huge: nmalloc ndalloc " + "nralloc allocated\n"); + malloc_printf(" %12llu %12llu %12llu %12zu\n", + huge_nmalloc, huge_ndalloc, huge_nralloc, + huge_allocated); /* Print stats for each arena. */ for (i = 0; i < narenas; i++) { @@ -3229,9 +3345,10 @@ static bool malloc_init_hard(void) { unsigned i, j; - int linklen; + ssize_t linklen; char buf[PATH_MAX + 1]; const char *opts = ""; + int serrno; malloc_mutex_lock(&init_lock); if (malloc_initialized) { @@ -3243,6 +3360,7 @@ malloc_init_hard(void) return (false); } + serrno = errno; /* Get number of CPUs. */ { int mib[2]; @@ -3293,8 +3411,8 @@ malloc_init_hard(void) } break; case 1: - if (issetugid() == 0 && (opts = - getenv("MALLOC_OPTIONS")) != NULL) { + if ((opts = getenv("MALLOC_OPTIONS")) != NULL && + issetugid() == 0) { /* * Do nothing; opts is already initialized to * the value of the MALLOC_OPTIONS environment @@ -3320,6 +3438,7 @@ malloc_init_hard(void) break; default: /* NOTREACHED */ + /* LINTED */ assert(false); } @@ -3353,14 +3472,8 @@ malloc_init_hard(void) opt_chunk_2pow--; break; case 'K': - /* - * There must be fewer pages in a chunk than - * can be recorded by the pos field of - * arena_chunk_map_t, in order to make POS_FREE - * special. - */ - if (opt_chunk_2pow - pagesize_2pow - < (sizeof(uint32_t) << 3) - 1) + if (opt_chunk_2pow + 1 < + (int)(sizeof(size_t) << 3)) opt_chunk_2pow++; break; case 'n': @@ -3420,13 +3533,14 @@ malloc_init_hard(void) cbuf[0] = opts[j]; cbuf[1] = '\0'; - _malloc_message(_getprogname(), + _malloc_message(getprogname(), ": (malloc) Unsupported character in " "malloc options: '", cbuf, "'\n"); } } } } + errno = serrno; /* Take care to call atexit() only once. */ if (opt_print_stats) { @@ -3442,10 +3556,10 @@ malloc_init_hard(void) /* Set bin-related variables. */ bin_maxclass = (pagesize >> 1); assert(opt_quantum_2pow >= TINY_MIN_2POW); - ntbins = opt_quantum_2pow - TINY_MIN_2POW; + ntbins = (unsigned)(opt_quantum_2pow - TINY_MIN_2POW); assert(ntbins <= opt_quantum_2pow); - nqbins = (small_max >> opt_quantum_2pow); - nsbins = pagesize_2pow - opt_small_max_2pow - 1; + nqbins = (unsigned)(small_max >> opt_quantum_2pow); + nsbins = (unsigned)(pagesize_2pow - opt_small_max_2pow - 1); /* Set variables according to the value of opt_quantum_2pow. 
*/ quantum = (1 << opt_quantum_2pow); @@ -3459,12 +3573,13 @@ malloc_init_hard(void) /* Set variables according to the value of opt_chunk_2pow. */ chunksize = (1LU << opt_chunk_2pow); chunksize_mask = chunksize - 1; - chunk_npages = (chunksize >> pagesize_2pow); + chunksize_2pow = (unsigned)opt_chunk_2pow; + chunk_npages = (unsigned)(chunksize >> pagesize_2pow); { unsigned header_size; - header_size = sizeof(arena_chunk_t) + (sizeof(arena_chunk_map_t) - * (chunk_npages - 1)); + header_size = (unsigned)(sizeof(arena_chunk_t) + + (sizeof(arena_chunk_map_t) * (chunk_npages - 1))); arena_chunk_header_npages = (header_size >> pagesize_2pow); if ((header_size & pagesize_mask) != 0) arena_chunk_header_npages++; @@ -3496,6 +3611,7 @@ malloc_init_hard(void) #ifdef MALLOC_STATS huge_nmalloc = 0; huge_ndalloc = 0; + huge_nralloc = 0; huge_allocated = 0; #endif RB_INIT(&old_chunks); @@ -3538,7 +3654,7 @@ malloc_init_hard(void) * can handle. */ if (narenas * sizeof(arena_t *) > chunksize) - narenas = chunksize / sizeof(arena_t *); + narenas = (unsigned)(chunksize / sizeof(arena_t *)); } else if (opt_narenas_lshift < 0) { if ((narenas << opt_narenas_lshift) < narenas) narenas <<= opt_narenas_lshift; @@ -3610,7 +3726,7 @@ malloc(size_t size) RETURN: if (ret == NULL) { if (opt_xmalloc) { - _malloc_message(_getprogname(), + _malloc_message(getprogname(), ": (malloc) Error in malloc(): out of memory\n", "", ""); abort(); @@ -3622,9 +3738,6 @@ RETURN: return (ret); } -/* XXXAD */ -int posix_memalign(void **memptr, size_t alignment, size_t size); - int posix_memalign(void **memptr, size_t alignment, size_t size) { @@ -3638,7 +3751,7 @@ posix_memalign(void **memptr, size_t ali if (((alignment - 1) & alignment) != 0 || alignment < sizeof(void *)) { if (opt_xmalloc) { - _malloc_message(_getprogname(), + _malloc_message(getprogname(), ": (malloc) Error in posix_memalign(): " "invalid alignment\n", "", ""); abort(); @@ -3653,7 +3766,7 @@ posix_memalign(void **memptr, size_t ali if (result == NULL) { if (opt_xmalloc) { - _malloc_message(_getprogname(), + _malloc_message(getprogname(), ": (malloc) Error in posix_memalign(): out of memory\n", "", ""); abort(); @@ -3708,7 +3821,7 @@ calloc(size_t num, size_t size) RETURN: if (ret == NULL) { if (opt_xmalloc) { - _malloc_message(_getprogname(), + _malloc_message(getprogname(), ": (malloc) Error in calloc(): out of memory\n", "", ""); abort(); @@ -3743,7 +3856,7 @@ realloc(void *ptr, size_t size) if (ret == NULL) { if (opt_xmalloc) { - _malloc_message(_getprogname(), + _malloc_message(getprogname(), ": (malloc) Error in realloc(): out of " "memory\n", "", ""); abort(); @@ -3758,7 +3871,7 @@ realloc(void *ptr, size_t size) if (ret == NULL) { if (opt_xmalloc) { - _malloc_message(_getprogname(), + _malloc_message(getprogname(), ": (malloc) Error in realloc(): out of " "memory\n", "", ""); abort(); @@ -3825,7 +3938,6 @@ _malloc_prefork(void) if (arenas[i] != NULL) malloc_mutex_lock(&arenas[i]->mtx); } - malloc_mutex_unlock(&arenas_mtx); malloc_mutex_lock(&base_mtx); @@ -3843,7 +3955,6 @@ _malloc_postfork(void) malloc_mutex_unlock(&base_mtx); - malloc_mutex_lock(&arenas_mtx); for (i = 0; i < narenas; i++) { if (arenas[i] != NULL) malloc_mutex_unlock(&arenas[i]->mtx);
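
The RUN_BFP/RUN_MAX_OVRHD hunk above replaces the old floating-point run-overhead test with integer fixed-point arithmetic: both sides of each comparison are scaled by 2^RUN_BFP, so the implicit binary point sits 12 bits to the left. The following is a minimal standalone C sketch of the equivalent test; the helper name run_overhead_too_high() is illustrative and not part of the tree.

#include <stdbool.h>
#include <stddef.h>

#define RUN_BFP			12		/* implicit binary point, 12 bits */
#define RUN_MAX_OVRHD		0x0000003dU	/* 0x3d / 2^12 ~= 1.5% */
#define RUN_MAX_OVRHD_RELAX	0x00001800U	/* 0x1800 / 2^12 == 1.5 */

/*
 * True when reg0_offset (the run header) wastes more than ~1.5% of
 * run_size, i.e. arena_bin_run_size_calc() should keep trying larger runs.
 * Old code: RUN_MAX_OVRHD > RUN_MAX_OVRHD_RELAX / (double)(reg_size << 3)
 *           && (double)reg0_offset / run_size > RUN_MAX_OVRHD
 */
bool
run_overhead_too_high(size_t reg_size, size_t reg0_offset, size_t run_size)
{
	if (RUN_MAX_OVRHD * (reg_size << 3) <= RUN_MAX_OVRHD_RELAX)
		return false;	/* regions so small the limit is relaxed */
	return (reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * run_size;
}

This keeps malloc free of floating point, while 0x3d/4096 stays within rounding distance of the old 0.015 threshold.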
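
The choose_arena()/choose_arena_hard() rewrite above hands each thread a whole block of ncpus arenas and then indexes that block by the current CPU, so the fast path needs neither a lock nor a modulo. Below is a simplified, self-contained sketch of the scheme; the stubs (thr_curcpu(), the __thread map, the fixed counts) stand in for the libc-internal TLS helpers and locking, and choose_arena_sketch() is not a name from the tree.

#include <stddef.h>

#define NCPUS   4            /* assumed fixed for the sketch */
#define NARENAS (4 * NCPUS)  /* narenas is kept a multiple of ncpus */

typedef struct { int dummy; } arena_t;

static arena_t arena_store[NARENAS];	/* stand-in for arenas_extend() */
static arena_t *arenas[NARENAS];
static unsigned next_arena;		/* always advances in NCPUS-sized steps */
static __thread arena_t **arenas_map;	/* this thread's block of arenas */

static unsigned
thr_curcpu(void)
{
	return 0;	/* stub; the real helper reports the current CPU */
}

arena_t *
choose_arena_sketch(void)
{
	arena_t **map = arenas_map;
	unsigned curcpu = thr_curcpu();

	/* Fast path: this thread already owns a block and the slot exists. */
	if (map != NULL && map[curcpu] != NULL)
		return map[curcpu];

	/*
	 * Slow path (choose_arena_hard): assign the next block of NCPUS
	 * arenas to this thread, creating any that are missing, and advance
	 * next_arena by one whole block.  The real code holds arenas_mtx
	 * here and falls back to arenas[0] if arena creation failed.
	 */
	map = &arenas[next_arena];
	arenas_map = map;
	for (unsigned i = 0; i < NCPUS; i++) {
		if (arenas[next_arena] == NULL)
			arenas[next_arena] = &arena_store[next_arena];
		next_arena = (next_arena + 1) % NARENAS;
	}
	return map[curcpu] != NULL ? map[curcpu] : arenas[0];
}

Because next_arena only moves in multiples of NCPUS and NARENAS is a multiple of NCPUS, map[curcpu] never indexes past the end of arenas[], which is the invariant the asserts in the real choose_arena_hard() check.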