Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.

===================================================================
RCS file: /ftp/cvs/cvsroot/src/lib/libc/stdlib/jemalloc.c,v
rcsdiff: /ftp/cvs/cvsroot/src/lib/libc/stdlib/jemalloc.c,v: warning: Unknown phrases like `commitid ...;' are present.
retrieving revision 1.19
retrieving revision 1.32
diff -u -p -r1.19 -r1.32
--- src/lib/libc/stdlib/jemalloc.c	2008/06/23 10:46:25	1.19
+++ src/lib/libc/stdlib/jemalloc.c	2014/02/25 12:13:19	1.32
@@ -1,4 +1,4 @@
-/*	$NetBSD: jemalloc.c,v 1.19 2008/06/23 10:46:25 ad Exp $	*/
+/*	$NetBSD: jemalloc.c,v 1.32 2014/02/25 12:13:19 martin Exp $	*/
 
 /*-
  * Copyright (C) 2006,2007 Jason Evans <jasone@FreeBSD.org>.
@@ -118,7 +118,7 @@
 
 #include <sys/cdefs.h>
 /* __FBSDID("$FreeBSD: src/lib/libc/stdlib/malloc.c,v 1.147 2007/06/15 22:00:16 jasone Exp $"); */
-__RCSID("$NetBSD: jemalloc.c,v 1.19 2008/06/23 10:46:25 ad Exp $");
+__RCSID("$NetBSD: jemalloc.c,v 1.32 2014/02/25 12:13:19 martin Exp $");
 
 #ifdef __FreeBSD__
 #include "libc_private.h"
@@ -216,6 +216,14 @@ __strerror_r(int e, char *s, size_t l)
 #define	STRERROR_BUF		64
 
 /* Minimum alignment of allocations is 2^QUANTUM_2POW_MIN bytes. */
+
+/*
+ * If you touch the TINY_MIN_2POW definition for any architecture, please
+ * make sure to adjust the corresponding definition for JEMALLOC_TINY_MIN_2POW
+ * in the gcc 4.8 tree in dist/gcc/tree-ssa-ccp.c and verify that a native
+ * gcc is still buildable!
+ */
+
 #ifdef __i386__
 #  define QUANTUM_2POW_MIN	4
 #  define SIZEOF_PTR_2POW	2
@@ -228,27 +236,34 @@ __strerror_r(int e, char *s, size_t l)
 #ifdef __alpha__
 #  define QUANTUM_2POW_MIN	4
 #  define SIZEOF_PTR_2POW	3
+#  define TINY_MIN_2POW		3
 #  define NO_TLS
 #endif
 #ifdef __sparc64__
 #  define QUANTUM_2POW_MIN	4
 #  define SIZEOF_PTR_2POW	3
+#  define TINY_MIN_2POW		3
 #  define NO_TLS
 #endif
 #ifdef __amd64__
 #  define QUANTUM_2POW_MIN	4
 #  define SIZEOF_PTR_2POW	3
+#  define TINY_MIN_2POW		3
 #endif
 #ifdef __arm__
 #  define QUANTUM_2POW_MIN	3
 #  define SIZEOF_PTR_2POW	2
 #  define USE_BRK
+#  ifdef __ARM_EABI__
+#    define TINY_MIN_2POW	3
+#  endif
 #  define NO_TLS
 #endif
 #ifdef __powerpc__
 #  define QUANTUM_2POW_MIN	4
 #  define SIZEOF_PTR_2POW	2
 #  define USE_BRK
+#  define TINY_MIN_2POW		3
 #endif
 #if defined(__sparc__) && !defined(__sparc64__)
 #  define QUANTUM_2POW_MIN	4
@@ -288,11 +303,6 @@ __strerror_r(int e, char *s, size_t l)
 #  define SIZEOF_INT_2POW	2
 #endif
 
-/* We can't use TLS in non-PIC programs, since TLS relies on loader magic. */
-#if (!defined(PIC) && !defined(NO_TLS))
-#  define NO_TLS
-#endif
-
 /*
  * Size and alignment of memory chunks that are allocated by the OS's virtual
  * memory system.
 */
@@ -308,7 +318,9 @@ __strerror_r(int e, char *s, size_t l)
 #define	CACHELINE		((size_t)(1 << CACHELINE_2POW))
 
 /* Smallest size class to support. */
-#define	TINY_MIN_2POW		1
+#ifndef TINY_MIN_2POW
+#define	TINY_MIN_2POW		2
+#endif
 
 /*
  * Maximum size class that is a multiple of the quantum, but not (necessarily)
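The hunks above raise the smallest size class: TINY_MIN_2POW becomes 3 (an 8-byte minimum) on the 64-bit architectures and on ARM EABI, and the default grows from 1 to 2 elsewhere. As a minimal illustrative sketch -- not the libc code; pow2_ceil() and tiny_size_class() are hypothetical helpers -- this is how such a floor is typically applied when rounding a tiny request to its size class:

#include <stddef.h>
#include <stdint.h>

#define TINY_MIN_2POW	3	/* assume the LP64 value: 8-byte minimum */

/* Round up to the next power of two; x must be at least 1. */
static size_t
pow2_ceil(size_t x)
{

	x--;
	x |= x >> 1;
	x |= x >> 2;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;
#if SIZE_MAX > 0xffffffffU
	x |= x >> 32;
#endif
	x++;
	return (x);
}

/* Map a tiny request to a power-of-two size class, honoring the floor. */
static size_t
tiny_size_class(size_t size)
{

	size = pow2_ceil(size);
	if (size < ((size_t)1 << TINY_MIN_2POW))
		size = (size_t)1 << TINY_MIN_2POW;
	return (size);
}

With TINY_MIN_2POW = 3, tiny_size_class(1) through tiny_size_class(8) all return 8, so every region is large enough and sufficiently aligned to hold a pointer.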
@@ -319,20 +331,25 @@
 #define	SMALL_MAX_DEFAULT	(1 << SMALL_MAX_2POW_DEFAULT)
 
 /*
- * Maximum desired run header overhead.  Runs are sized as small as possible
- * such that this setting is still honored, without violating other constraints.
- * The goal is to make runs as small as possible without exceeding a per run
- * external fragmentation threshold.
+ * RUN_MAX_OVRHD indicates maximum desired run header overhead.  Runs are sized
+ * as small as possible such that this setting is still honored, without
+ * violating other constraints.  The goal is to make runs as small as possible
+ * without exceeding a per run external fragmentation threshold.
  *
- * Note that it is possible to set this low enough that it cannot be honored
- * for some/all object sizes, since there is one bit of header overhead per
- * object (plus a constant).  In such cases, this constraint is relaxed.
+ * We use binary fixed point math for overhead computations, where the binary
+ * point is implicitly RUN_BFP bits to the left.
  *
- * RUN_MAX_OVRHD_RELAX specifies the maximum number of bits per region of
- * overhead for which RUN_MAX_OVRHD is relaxed.
+ * Note that it is possible to set RUN_MAX_OVRHD low enough that it cannot be
+ * honored for some/all object sizes, since there is one bit of header overhead
+ * per object (plus a constant).  This constraint is relaxed (ignored) for runs
+ * that are so small that the per-region overhead is greater than:
+ *
+ *	(RUN_MAX_OVRHD / (reg_size << (3+RUN_BFP))
 */
-#define RUN_MAX_OVRHD		0.015
-#define RUN_MAX_OVRHD_RELAX	1.5
+#define	RUN_BFP			12
+/*					\/	Implicit binary fixed point. */
+#define	RUN_MAX_OVRHD		0x0000003dU
+#define	RUN_MAX_OVRHD_RELAX	0x00001800U
 
 /* Put a cap on small object run size.  This overrides RUN_MAX_OVRHD. */
 #define RUN_MAX_SMALL_2POW	15
@@ -811,7 +828,7 @@ static void	wrtmessage(const char *p1, c
 #ifdef MALLOC_STATS
 static void	malloc_printf(const char *format, ...);
 #endif
-static char	*umax2s(uintmax_t x, char *s);
+static char	*size_t2s(size_t x, char *s);
 static bool	base_pages_alloc(size_t minsize);
 static void	*base_alloc(size_t size);
 static chunk_node_t *base_chunk_node_alloc(void);
@@ -973,19 +990,19 @@ malloc_printf(const char *format, ...)
 
 /*
  * We don't want to depend on vsnprintf() for production builds, since that can
- * cause unnecessary bloat for static binaries.  umax2s() provides minimal
+ * cause unnecessary bloat for static binaries.  size_t2s() provides minimal
  * integer printing functionality, so that malloc_printf() use can be limited to
  * MALLOC_STATS code.
  */
 #define UMAX2S_BUFSIZE	21
 static char *
-umax2s(uintmax_t x, char *s)
+size_t2s(size_t x, char *s)
 {
 	unsigned i;
 
 	/* Make sure UMAX2S_BUFSIZE is large enough. */
 	/* LINTED */
-	assert(sizeof(uintmax_t) <= 8);
+	assert(sizeof(size_t) <= 8);
 
 	i = UMAX2S_BUFSIZE - 1;
 	s[i] = '\0';
@@ -1029,7 +1046,8 @@ base_pages_alloc(size_t minsize)
 		 */
 		incr = (intptr_t)chunksize
 		    - (intptr_t)CHUNK_ADDR2OFFSET(brk_cur);
-		if (incr < minsize)
+		assert(incr >= 0);
+		if ((size_t)incr < minsize)
 			incr += csize;
 
 		brk_prev = sbrk(incr);
@@ -1364,7 +1382,7 @@ chunk_alloc(size_t size)
 			 */
 			incr = (intptr_t)size
 			    - (intptr_t)CHUNK_ADDR2OFFSET(brk_cur);
-			if (incr == size) {
+			if (incr == (intptr_t)size) {
 				ret = brk_cur;
 			} else {
 				ret = (void *)((intptr_t)brk_cur + incr);
@@ -2142,7 +2160,6 @@ arena_bin_run_size_calc(arena_bin_t *bin
 	size_t try_run_size, good_run_size;
 	unsigned good_nregs, good_mask_nelms, good_reg0_offset;
 	unsigned try_nregs, try_mask_nelms, try_reg0_offset;
-	float max_ovrhd = RUN_MAX_OVRHD;
 
 	assert(min_run_size >= pagesize);
 	assert(min_run_size <= arena_maxclass);
@@ -2160,7 +2177,7 @@ arena_bin_run_size_calc(arena_bin_t *bin
 	 */
 	try_run_size = min_run_size;
 	try_nregs = (unsigned)(((try_run_size - sizeof(arena_run_t)) /
-	    bin->reg_size) + 1); /* Counter-act the first line of the loop. */
+	    bin->reg_size) + 1); /* Counter-act try_nregs-- in loop. */
 	do {
 		try_nregs--;
 		try_mask_nelms = (try_nregs >> (SIZEOF_INT_2POW + 3)) +
@@ -2194,9 +2211,8 @@ arena_bin_run_size_calc(arena_bin_t *bin
 		} while (sizeof(arena_run_t) + (sizeof(unsigned) *
 		    (try_mask_nelms - 1)) > try_reg0_offset);
 	} while (try_run_size <= arena_maxclass && try_run_size <= RUN_MAX_SMALL
-	    && max_ovrhd > RUN_MAX_OVRHD_RELAX / ((float)(bin->reg_size << 3))
-	    && ((float)(try_reg0_offset)) / ((float)(try_run_size)) >
-	    max_ovrhd);
+	    && RUN_MAX_OVRHD * (bin->reg_size << 3) > RUN_MAX_OVRHD_RELAX
+	    && (try_reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * try_run_size);
 
 	assert(sizeof(arena_run_t) + (sizeof(unsigned) *
 	    (good_mask_nelms - 1)) <= good_reg0_offset);
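The arena_bin_run_size_calc() hunks above complete the conversion announced by RUN_BFP: the float thresholds become binary fixed-point integers with 12 fractional bits, so RUN_MAX_OVRHD = 0x3d encodes 61/4096 ~= 0.0149 (close to the old 0.015) and RUN_MAX_OVRHD_RELAX = 0x1800 encodes 6144/4096 = 1.5. A minimal sketch of the rewritten comparisons (the helper names are hypothetical; the arithmetic matches the new loop condition):

#include <stdbool.h>
#include <stddef.h>

#define RUN_BFP			12
#define RUN_MAX_OVRHD		0x0000003dU
#define RUN_MAX_OVRHD_RELAX	0x00001800U

/*
 * True if reg0_offset/run_size, the fraction of the run consumed by
 * the header, exceeds 61/4096.  Equivalent to the old float test
 * (double)reg0_offset / run_size > 0.015, but integer-only.
 */
static bool
run_overhead_too_high(size_t reg0_offset, size_t run_size)
{

	return ((reg0_offset << RUN_BFP) > RUN_MAX_OVRHD * run_size);
}

/*
 * True if the relaxation applies: regions so small that one bit of
 * header per region already exceeds the allowed overhead.  This is
 * RUN_MAX_OVRHD <= RUN_MAX_OVRHD_RELAX / (reg_size << 3), written
 * without division, as in the new while condition above.
 */
static bool
run_overhead_relaxed(size_t reg_size)
{

	return (RUN_MAX_OVRHD * (reg_size << 3) <= RUN_MAX_OVRHD_RELAX);
}

The sizing loop keeps growing the trial run while run_overhead_relaxed() is false and run_overhead_too_high() is true, exactly mirroring the two new clauses of the while condition.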
@@ -2855,25 +2871,38 @@ huge_ralloc(void *ptr, size_t size, size
 			/* size_t wrap-around */
 			return (NULL);
 		}
+
+		/*
+		 * Remove the old region from the tree now.  If mremap()
+		 * returns the region to the system, other thread may
+		 * map it for same huge allocation and insert it to the
+		 * tree before we acquire the mutex lock again.
+		 */
+		malloc_mutex_lock(&chunks_mtx);
+		key.chunk = __DECONST(void *, ptr);
+		/* LINTED */
+		node = RB_FIND(chunk_tree_s, &huge, &key);
+		assert(node != NULL);
+		assert(node->chunk == ptr);
+		assert(node->size == oldcsize);
+		RB_REMOVE(chunk_tree_s, &huge, node);
+		malloc_mutex_unlock(&chunks_mtx);
+
 		newptr = mremap(ptr, oldcsize, NULL, newcsize,
 		    MAP_ALIGNED(chunksize_2pow));
-		if (newptr != MAP_FAILED) {
+		if (newptr == MAP_FAILED) {
+			/* We still own the old region. */
+			malloc_mutex_lock(&chunks_mtx);
+			RB_INSERT(chunk_tree_s, &huge, node);
+			malloc_mutex_unlock(&chunks_mtx);
+		} else {
 			assert(CHUNK_ADDR2BASE(newptr) == newptr);
 
-			/* update tree */
+			/* Insert new or resized old region. */
 			malloc_mutex_lock(&chunks_mtx);
-			key.chunk = __DECONST(void *, ptr);
-			/* LINTED */
-			node = RB_FIND(chunk_tree_s, &huge, &key);
-			assert(node != NULL);
-			assert(node->chunk == ptr);
-			assert(node->size == oldcsize);
 			node->size = newcsize;
-			if (ptr != newptr) {
-				RB_REMOVE(chunk_tree_s, &huge, node);
-				node->chunk = newptr;
-				RB_INSERT(chunk_tree_s, &huge, node);
-			}
+			node->chunk = newptr;
+			RB_INSERT(chunk_tree_s, &huge, node);
 #ifdef MALLOC_STATS
 			huge_nralloc++;
 			huge_allocated += newcsize - oldcsize;
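The huge_ralloc() hunk above closes a race window: the old code called mremap() first and updated the red-black tree of huge allocations afterwards, so once mremap() returned the old region to the system, another thread could map the same address range for a new huge allocation and insert its own node while the stale one was still in the tree. A minimal, self-contained sketch of the corrected ordering -- a mutex-protected list stands in for the tree, and try_remap() (here just realloc()) stands in for mremap(); none of this is the libc code:

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct entry {
	void		*chunk;
	size_t		 size;
	struct entry	*next;
};

static pthread_mutex_t	 table_mtx = PTHREAD_MUTEX_INITIALIZER;
static struct entry	*table;		/* stand-in for the huge tree */

static void
table_remove(struct entry *e)
{
	struct entry **ep;

	pthread_mutex_lock(&table_mtx);
	for (ep = &table; *ep != e; ep = &(*ep)->next)
		continue;
	*ep = e->next;
	pthread_mutex_unlock(&table_mtx);
}

static void
table_insert(struct entry *e)
{

	pthread_mutex_lock(&table_mtx);
	e->next = table;
	table = e;
	pthread_mutex_unlock(&table_mtx);
}

/* Stand-in for mremap(): may move or release the old region. */
static void *
try_remap(void *ptr, size_t newsize)
{

	return (realloc(ptr, newsize));
}

static bool
entry_grow(struct entry *e, size_t newsize)
{
	void *newptr;

	/* Remove first: after try_remap() the old address may be reused. */
	table_remove(e);
	newptr = try_remap(e->chunk, newsize);
	if (newptr == NULL) {
		table_insert(e);	/* failed; we still own the region */
		return (false);
	}
	e->chunk = newptr;
	e->size = newsize;
	table_insert(e);		/* publish the new/resized region */
	return (true);
}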
"Z\n" : "z\n"); - _malloc_message("CPUs: ", umax2s(ncpus, s), "\n", ""); - _malloc_message("Max arenas: ", umax2s(narenas, s), "\n", ""); - _malloc_message("Pointer size: ", umax2s(sizeof(void *), s), + _malloc_message("CPUs: ", size_t2s(ncpus, s), "\n", ""); + _malloc_message("Max arenas: ", size_t2s(narenas, s), "\n", ""); + _malloc_message("Pointer size: ", size_t2s(sizeof(void *), s), "\n", ""); - _malloc_message("Quantum size: ", umax2s(quantum, s), "\n", ""); - _malloc_message("Max small size: ", umax2s(small_max, s), "\n", + _malloc_message("Quantum size: ", size_t2s(quantum, s), "\n", ""); + _malloc_message("Max small size: ", size_t2s(small_max, s), "\n", ""); - _malloc_message("Chunk size: ", umax2s(chunksize, s), "", ""); - _malloc_message(" (2^", umax2s(opt_chunk_2pow, s), ")\n", ""); + _malloc_message("Chunk size: ", size_t2s(chunksize, s), "", ""); + _malloc_message(" (2^", size_t2s((size_t)opt_chunk_2pow, s), + ")\n", ""); #ifdef MALLOC_STATS { @@ -3308,6 +3338,7 @@ malloc_init_hard(void) ssize_t linklen; char buf[PATH_MAX + 1]; const char *opts = ""; + int serrno; malloc_mutex_lock(&init_lock); if (malloc_initialized) { @@ -3319,6 +3350,7 @@ malloc_init_hard(void) return (false); } + serrno = errno; /* Get number of CPUs. */ { int mib[2]; @@ -3430,14 +3462,8 @@ malloc_init_hard(void) opt_chunk_2pow--; break; case 'K': - /* - * There must be fewer pages in a chunk than - * can be recorded by the pos field of - * arena_chunk_map_t, in order to make POS_FREE - * special. - */ - if (opt_chunk_2pow - pagesize_2pow - < (sizeof(uint32_t) << 3) - 1) + if (opt_chunk_2pow + 1 < + (int)(sizeof(size_t) << 3)) opt_chunk_2pow++; break; case 'n': @@ -3504,6 +3530,7 @@ malloc_init_hard(void) } } } + errno = serrno; /* Take care to call atexit() only once. */ if (opt_print_stats) {