Diffstat (limited to 'deps/jemalloc/include/jemalloc/internal/prof.h')
-rw-r--r--   deps/jemalloc/include/jemalloc/internal/prof.h   561
1 file changed, 561 insertions, 0 deletions
diff --git a/deps/jemalloc/include/jemalloc/internal/prof.h b/deps/jemalloc/include/jemalloc/internal/prof.h
new file mode 100644
index 000000000..f94387350
--- /dev/null
+++ b/deps/jemalloc/include/jemalloc/internal/prof.h
@@ -0,0 +1,561 @@
+#ifdef JEMALLOC_PROF
+/******************************************************************************/
+#ifdef JEMALLOC_H_TYPES
+
+typedef struct prof_bt_s prof_bt_t;
+typedef struct prof_cnt_s prof_cnt_t;
+typedef struct prof_thr_cnt_s prof_thr_cnt_t;
+typedef struct prof_ctx_s prof_ctx_t;
+typedef struct prof_tdata_s prof_tdata_t;
+
+/* Option defaults. */
+#define PROF_PREFIX_DEFAULT "jeprof"
+#define LG_PROF_BT_MAX_DEFAULT 7
+#define LG_PROF_SAMPLE_DEFAULT 0
+#define LG_PROF_INTERVAL_DEFAULT -1
+#define LG_PROF_TCMAX_DEFAULT -1
+
+/*
+ * Hard limit on stack backtrace depth. Note that the version of
+ * prof_backtrace() that is based on __builtin_return_address() necessarily has
+ * a hard-coded number of backtrace frame handlers.
+ */
+#if (defined(JEMALLOC_PROF_LIBGCC) || defined(JEMALLOC_PROF_LIBUNWIND))
+#  define LG_PROF_BT_MAX ((ZU(1) << (LG_SIZEOF_PTR+3)) - 1)
+#else
+#  define LG_PROF_BT_MAX 7 /* >= LG_PROF_BT_MAX_DEFAULT */
+#endif
+#define PROF_BT_MAX (1U << LG_PROF_BT_MAX)
+
+/* Initial hash table size. */
+#define PROF_CKH_MINITEMS 64
+
+/* Size of memory buffer to use when writing dump files. */
+#define PROF_DUMP_BUF_SIZE 65536
+
+#endif /* JEMALLOC_H_TYPES */
+/******************************************************************************/
+#ifdef JEMALLOC_H_STRUCTS
+
+struct prof_bt_s {
+    /* Backtrace, stored as len program counters. */
+    void **vec;
+    unsigned len;
+};
+
+#ifdef JEMALLOC_PROF_LIBGCC
+/* Data structure passed to libgcc _Unwind_Backtrace() callback functions. */
+typedef struct {
+    prof_bt_t *bt;
+    unsigned nignore;
+    unsigned max;
+} prof_unwind_data_t;
+#endif
+
+struct prof_cnt_s {
+    /*
+     * Profiling counters. An allocation/deallocation pair can operate on
+     * different prof_thr_cnt_t objects that are linked into the same
+     * prof_ctx_t cnts_ql, so it is possible for the cur* counters to go
+     * negative. In principle it is possible for the *bytes counters to
+     * overflow/underflow, but a general solution would require something
+     * like 128-bit counters; this implementation doesn't bother to solve
+     * that problem.
+     */
+    int64_t curobjs;
+    int64_t curbytes;
+    uint64_t accumobjs;
+    uint64_t accumbytes;
+};
+
+struct prof_thr_cnt_s {
+    /* Linkage into prof_ctx_t's cnts_ql. */
+    ql_elm(prof_thr_cnt_t) cnts_link;
+
+    /* Linkage into thread's LRU. */
+    ql_elm(prof_thr_cnt_t) lru_link;
+
+    /*
+     * Associated context. If a thread frees an object that it did not
+     * allocate, it is possible that the context is not cached in the
+     * thread's hash table, in which case it must be able to look up the
+     * context, insert a new prof_thr_cnt_t into the thread's hash table,
+     * and link it into the prof_ctx_t's cnts_ql.
+     */
+    prof_ctx_t *ctx;
+
+    /*
+     * Threads use memory barriers to update the counters. Since there is
+     * only ever one writer, the only challenge is for the reader to get a
+     * consistent read of the counters.
+     *
+     * The writer uses this series of operations:
+     *
+     * 1) Increment epoch to an odd number.
+     * 2) Update counters.
+     * 3) Increment epoch to an even number.
+     *
+     * The reader must ensure 1) that the epoch is even while it reads the
+     * counters, and 2) that the epoch doesn't change between the time it
+     * starts and finishes reading the counters.
+     */
+    unsigned epoch;
+
+    /* Profiling counters. */
+    prof_cnt_t cnts;
+};
+
+struct prof_ctx_s {
+    /* Associated backtrace. */
+    prof_bt_t *bt;
+
+    /* Protects cnt_merged and cnts_ql. */
+    malloc_mutex_t lock;
+
+    /* Temporary storage for summation during dump. */
+    prof_cnt_t cnt_summed;
+
+    /* When threads exit, they merge their stats into cnt_merged. */
+    prof_cnt_t cnt_merged;
+
+    /*
+     * List of profile counters, one for each thread that has allocated in
+     * this context.
+     */
+    ql_head(prof_thr_cnt_t) cnts_ql;
+};
+
+struct prof_tdata_s {
+    /*
+     * Hash of (prof_bt_t *)-->(prof_thr_cnt_t *). Each thread keeps a
+     * cache of backtraces, with associated thread-specific prof_thr_cnt_t
+     * objects. Other threads may read the prof_thr_cnt_t contents, but no
+     * other thread ever writes them.
+     *
+     * Upon thread exit, the thread must merge all the prof_thr_cnt_t
+     * counter data into the associated prof_ctx_t objects, and unlink/free
+     * the prof_thr_cnt_t objects.
+     */
+    ckh_t bt2cnt;
+
+    /* LRU for contents of bt2cnt. */
+    ql_head(prof_thr_cnt_t) lru_ql;
+
+    /* Backtrace vector, used for calls to prof_backtrace(). */
+    void **vec;
+
+    /* Sampling state. */
+    uint64_t prn_state;
+    uint64_t threshold;
+    uint64_t accum;
+};
+
+#endif /* JEMALLOC_H_STRUCTS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_EXTERNS
+
+extern bool opt_prof;
+/*
+ * Even if opt_prof is true, sampling can be temporarily disabled by setting
+ * opt_prof_active to false. No locking is used when updating opt_prof_active,
+ * so there are no guarantees regarding how long it will take for all threads
+ * to notice state changes.
+ */
+extern bool opt_prof_active;
+extern size_t opt_lg_prof_bt_max;     /* Maximum backtrace depth. */
+extern size_t opt_lg_prof_sample;     /* Mean bytes between samples. */
+extern ssize_t opt_lg_prof_interval;  /* lg(prof_interval). */
+extern bool opt_prof_gdump;           /* High-water memory dumping. */
+extern bool opt_prof_leak;            /* Dump leak summary at exit. */
+extern bool opt_prof_accum;           /* Report cumulative bytes. */
+extern ssize_t opt_lg_prof_tcmax;     /* lg(max per-thread backtrace cache). */
+extern char opt_prof_prefix[PATH_MAX + 1];
+
+/*
+ * Profile dump interval, measured in bytes allocated. Each arena triggers a
+ * profile dump when it reaches this threshold. The effect is that the
+ * interval between profile dumps averages prof_interval, though the actual
+ * interval between dumps will tend to be sporadic, and the interval will be a
+ * maximum of approximately (prof_interval * narenas).
+ */
+extern uint64_t prof_interval;
+
+/*
+ * If true, promote small sampled objects to large objects, since small run
+ * headers do not have embedded profile context pointers.
+ */
+extern bool prof_promote;
+
+/* (1U << opt_lg_prof_bt_max). */
+extern unsigned prof_bt_max;
+
+/* Thread-specific backtrace cache, used to reduce bt2ctx contention. */
+#ifndef NO_TLS
+extern __thread prof_tdata_t *prof_tdata_tls
+    JEMALLOC_ATTR(tls_model("initial-exec"));
+#  define PROF_TCACHE_GET() prof_tdata_tls
+#  define PROF_TCACHE_SET(v) do {                               \
+    prof_tdata_tls = (v);                                       \
+    pthread_setspecific(prof_tdata_tsd, (void *)(v));           \
+} while (0)
+#else
+#  define PROF_TCACHE_GET()                                     \
+    ((prof_tdata_t *)pthread_getspecific(prof_tdata_tsd))
+#  define PROF_TCACHE_SET(v) do {                               \
+    pthread_setspecific(prof_tdata_tsd, (void *)(v));           \
+} while (0)
+#endif
+/*
+ * Same contents as prof_tdata_tls, but initialized such that the TSD
+ * destructor is called when a thread exits, so that prof_tdata_tls contents
+ * can be merged, unlinked, and deallocated.
+ */
+extern pthread_key_t prof_tdata_tsd;
+
+void bt_init(prof_bt_t *bt, void **vec);
+void prof_backtrace(prof_bt_t *bt, unsigned nignore, unsigned max);
+prof_thr_cnt_t *prof_lookup(prof_bt_t *bt);
+void prof_idump(void);
+bool prof_mdump(const char *filename);
+void prof_gdump(void);
+prof_tdata_t *prof_tdata_init(void);
+void prof_boot0(void);
+void prof_boot1(void);
+bool prof_boot2(void);
+
+#endif /* JEMALLOC_H_EXTERNS */
+/******************************************************************************/
+#ifdef JEMALLOC_H_INLINES
+
+#ifndef JEMALLOC_ENABLE_INLINE
+void prof_sample_threshold_update(prof_tdata_t *prof_tdata);
+prof_thr_cnt_t *prof_alloc_prep(size_t size);
+prof_ctx_t *prof_ctx_get(const void *ptr);
+void prof_ctx_set(const void *ptr, prof_ctx_t *ctx);
+bool prof_sample_accum_update(size_t size);
+void prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt);
+void prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
+    size_t old_size, prof_ctx_t *old_ctx);
+void prof_free(const void *ptr, size_t size);
+#endif
+
+#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_PROF_C_))
+JEMALLOC_INLINE void
+prof_sample_threshold_update(prof_tdata_t *prof_tdata)
+{
+    uint64_t r;
+    double u;
+
+    /*
+     * Compute sample threshold as a geometrically distributed random
+     * variable with mean (2^opt_lg_prof_sample).
+     *
+     *                          __        __
+     *                         |  log(u)    |                     1
+     * prof_tdata->threshold = | ---------- |, where p = -------------------
+     *                         |  log(1-p)  |             opt_lg_prof_sample
+     *                                                   2
+     *
+     * For more information on the math, see:
+     *
+     *   Non-Uniform Random Variate Generation
+     *   Luc Devroye
+     *   Springer-Verlag, New York, 1986
+     *   pp 500
+     *   (http://cg.scs.carleton.ca/~luc/rnbookindex.html)
+     */
+    prn64(r, 53, prof_tdata->prn_state,
+        (uint64_t)6364136223846793005LLU, (uint64_t)1442695040888963407LLU);
+    u = (double)r * (1.0/9007199254740992.0L);
+    prof_tdata->threshold = (uint64_t)(log(u) /
+        log(1.0 - (1.0 / (double)((uint64_t)1U << opt_lg_prof_sample)))) +
+        (uint64_t)1U;
+}
+
+JEMALLOC_INLINE prof_thr_cnt_t *
+prof_alloc_prep(size_t size)
+{
+#ifdef JEMALLOC_ENABLE_INLINE
+    /* This function does not have its own stack frame, because it is inlined. */
+#  define NIGNORE 1
+#else
+#  define NIGNORE 2
+#endif
+    prof_thr_cnt_t *ret;
+    prof_tdata_t *prof_tdata;
+    prof_bt_t bt;
+
+    assert(size == s2u(size));
+
+    prof_tdata = PROF_TCACHE_GET();
+    if (prof_tdata == NULL) {
+        prof_tdata = prof_tdata_init();
+        if (prof_tdata == NULL)
+            return (NULL);
+    }
+
+    if (opt_prof_active == false) {
+        /* Sampling is currently inactive, so avoid sampling. */
+        ret = (prof_thr_cnt_t *)(uintptr_t)1U;
+    } else if (opt_lg_prof_sample == 0) {
+        /*
+         * Don't bother with sampling logic, since sampling interval is
+         * 1.
+         */
+        bt_init(&bt, prof_tdata->vec);
+        prof_backtrace(&bt, NIGNORE, prof_bt_max);
+        ret = prof_lookup(&bt);
+    } else {
+        if (prof_tdata->threshold == 0) {
+            /*
+             * Initialize. Seed the prng differently for each
+             * thread.
+             */
+            prof_tdata->prn_state = (uint64_t)(uintptr_t)&size;
+            prof_sample_threshold_update(prof_tdata);
+        }
+
+        /*
+         * Determine whether to capture a backtrace based on whether
+         * size is enough for prof_accum to reach
+         * prof_tdata->threshold. However, delay updating these
+         * variables until prof_{m,re}alloc(), because we don't know
+         * for sure that the allocation will succeed.
+         *
+         * Use subtraction rather than addition to avoid potential
+         * integer overflow.
+         */
+        if (size >= prof_tdata->threshold - prof_tdata->accum) {
+            bt_init(&bt, prof_tdata->vec);
+            prof_backtrace(&bt, NIGNORE, prof_bt_max);
+            ret = prof_lookup(&bt);
+        } else
+            ret = (prof_thr_cnt_t *)(uintptr_t)1U;
+    }
+
+    return (ret);
+#undef NIGNORE
+}
+
+JEMALLOC_INLINE prof_ctx_t *
+prof_ctx_get(const void *ptr)
+{
+    prof_ctx_t *ret;
+    arena_chunk_t *chunk;
+
+    assert(ptr != NULL);
+
+    chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+    if (chunk != ptr) {
+        /* Region. */
+        dassert(chunk->arena->magic == ARENA_MAGIC);
+
+        ret = arena_prof_ctx_get(ptr);
+    } else
+        ret = huge_prof_ctx_get(ptr);
+
+    return (ret);
+}
+
+JEMALLOC_INLINE void
+prof_ctx_set(const void *ptr, prof_ctx_t *ctx)
+{
+    arena_chunk_t *chunk;
+
+    assert(ptr != NULL);
+
+    chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+    if (chunk != ptr) {
+        /* Region. */
+        dassert(chunk->arena->magic == ARENA_MAGIC);
+
+        arena_prof_ctx_set(ptr, ctx);
+    } else
+        huge_prof_ctx_set(ptr, ctx);
+}
+
+JEMALLOC_INLINE bool
+prof_sample_accum_update(size_t size)
+{
+    prof_tdata_t *prof_tdata;
+
+    /* Sampling logic is unnecessary if the interval is 1. */
+    assert(opt_lg_prof_sample != 0);
+
+    prof_tdata = PROF_TCACHE_GET();
+    assert(prof_tdata != NULL);
+
+    /* Take care to avoid integer overflow. */
+    if (size >= prof_tdata->threshold - prof_tdata->accum) {
+        prof_tdata->accum -= (prof_tdata->threshold - size);
+        /* Compute new sample threshold. */
+        prof_sample_threshold_update(prof_tdata);
+        while (prof_tdata->accum >= prof_tdata->threshold) {
+            prof_tdata->accum -= prof_tdata->threshold;
+            prof_sample_threshold_update(prof_tdata);
+        }
+        return (false);
+    } else {
+        prof_tdata->accum += size;
+        return (true);
+    }
+}
+
+JEMALLOC_INLINE void
+prof_malloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt)
+{
+
+    assert(ptr != NULL);
+    assert(size == isalloc(ptr));
+
+    if (opt_lg_prof_sample != 0) {
+        if (prof_sample_accum_update(size)) {
+            /*
+             * Don't sample. For malloc()-like allocation, it is
+             * always possible to tell in advance how large an
+             * object's usable size will be, so there should never
+             * be a difference between the size passed to
+             * prof_alloc_prep() and prof_malloc().
+             */
+            assert((uintptr_t)cnt == (uintptr_t)1U);
+        }
+    }
+
+    if ((uintptr_t)cnt > (uintptr_t)1U) {
+        prof_ctx_set(ptr, cnt->ctx);
+
+        cnt->epoch++;
+        /*********/
+        mb_write();
+        /*********/
+        cnt->cnts.curobjs++;
+        cnt->cnts.curbytes += size;
+        if (opt_prof_accum) {
+            cnt->cnts.accumobjs++;
+            cnt->cnts.accumbytes += size;
+        }
+        /*********/
+        mb_write();
+        /*********/
+        cnt->epoch++;
+        /*********/
+        mb_write();
+        /*********/
+    } else
+        prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
+}
+
+JEMALLOC_INLINE void
+prof_realloc(const void *ptr, size_t size, prof_thr_cnt_t *cnt,
+    size_t old_size, prof_ctx_t *old_ctx)
+{
+    prof_thr_cnt_t *told_cnt;
+
+    assert(ptr != NULL || (uintptr_t)cnt <= (uintptr_t)1U);
+
+    if (ptr != NULL) {
+        assert(size == isalloc(ptr));
+        if (opt_lg_prof_sample != 0) {
+            if (prof_sample_accum_update(size)) {
+                /*
+                 * Don't sample. The size passed to
+                 * prof_alloc_prep() was larger than what
+                 * actually got allocated, so a backtrace was
+                 * captured for this allocation, even though
+                 * its actual size was insufficient to cross
+                 * the sample threshold.
+                 */
+                cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
+            }
+        }
+    }
+
+    if ((uintptr_t)old_ctx > (uintptr_t)1U) {
+        told_cnt = prof_lookup(old_ctx->bt);
+        if (told_cnt == NULL) {
+            /*
+             * It's too late to propagate OOM for this realloc(),
+             * so operate directly on old_ctx->cnt_merged.
+             */
+            malloc_mutex_lock(&old_ctx->lock);
+            old_ctx->cnt_merged.curobjs--;
+            old_ctx->cnt_merged.curbytes -= old_size;
+            malloc_mutex_unlock(&old_ctx->lock);
+            told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
+        }
+    } else
+        told_cnt = (prof_thr_cnt_t *)(uintptr_t)1U;
+
+    if ((uintptr_t)told_cnt > (uintptr_t)1U)
+        told_cnt->epoch++;
+    if ((uintptr_t)cnt > (uintptr_t)1U) {
+        prof_ctx_set(ptr, cnt->ctx);
+        cnt->epoch++;
+    } else
+        prof_ctx_set(ptr, (prof_ctx_t *)(uintptr_t)1U);
+    /*********/
+    mb_write();
+    /*********/
+    if ((uintptr_t)told_cnt > (uintptr_t)1U) {
+        told_cnt->cnts.curobjs--;
+        told_cnt->cnts.curbytes -= old_size;
+    }
+    if ((uintptr_t)cnt > (uintptr_t)1U) {
+        cnt->cnts.curobjs++;
+        cnt->cnts.curbytes += size;
+        if (opt_prof_accum) {
+            cnt->cnts.accumobjs++;
+            cnt->cnts.accumbytes += size;
+        }
+    }
+    /*********/
+    mb_write();
+    /*********/
+    if ((uintptr_t)told_cnt > (uintptr_t)1U)
+        told_cnt->epoch++;
+    if ((uintptr_t)cnt > (uintptr_t)1U)
+        cnt->epoch++;
+    /*********/
+    mb_write(); /* Not strictly necessary. */
+}
+
+JEMALLOC_INLINE void
+prof_free(const void *ptr, size_t size)
+{
+    prof_ctx_t *ctx = prof_ctx_get(ptr);
+
+    if ((uintptr_t)ctx > (uintptr_t)1) {
+        assert(size == isalloc(ptr));
+        prof_thr_cnt_t *tcnt = prof_lookup(ctx->bt);
+
+        if (tcnt != NULL) {
+            tcnt->epoch++;
+            /*********/
+            mb_write();
+            /*********/
+            tcnt->cnts.curobjs--;
+            tcnt->cnts.curbytes -= size;
+            /*********/
+            mb_write();
+            /*********/
+            tcnt->epoch++;
+            /*********/
+            mb_write();
+            /*********/
+        } else {
+            /*
+             * OOM during free() cannot be propagated, so operate
+             * directly on ctx->cnt_merged.
+             */
+            malloc_mutex_lock(&ctx->lock);
+            ctx->cnt_merged.curobjs--;
+            ctx->cnt_merged.curbytes -= size;
+            malloc_mutex_unlock(&ctx->lock);
+        }
+    }
+}
+#endif
+
+#endif /* JEMALLOC_H_INLINES */
+/******************************************************************************/
+#endif /* JEMALLOC_PROF */
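
Note: the prof_thr_cnt_s comment specifies both halves of the epoch protocol, but only the writer side appears in this header (prof_malloc(), prof_realloc(), prof_free()); the reader side lives in prof.c. For reference, a minimal sketch of a conforming reader, modeled on the retry loop the dump code uses. prof_cnt_snapshot() is a hypothetical name, not part of this patch, and the sketch assumes this header's types are in scope:

#include <string.h> /* memcpy() */

/*
 * Hypothetical helper: copy out a consistent snapshot of cnt->cnts,
 * following the two reader rules documented in prof_thr_cnt_s.
 */
static void
prof_cnt_snapshot(prof_thr_cnt_t *cnt, prof_cnt_t *out)
{
    volatile unsigned *epoch = &cnt->epoch;

    for (;;) {
        unsigned epoch0 = *epoch;

        /* Rule 1: only read while the epoch is even (no writer active). */
        if (epoch0 & 1U)
            continue;

        memcpy(out, &cnt->cnts, sizeof(prof_cnt_t));

        /* Rule 2: the epoch must not have changed during the read. */
        if (*epoch == epoch0)
            break;
    }
}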
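prof_sample_threshold_update() is inverse-transform sampling of a geometric distribution: with p = 1/2^opt_lg_prof_sample, the variate ceil(log(u)/log(1-p)) has mean 2^opt_lg_prof_sample bytes between samples. A self-contained illustration of just that arithmetic (plain C, compile with -lm; lg_sample and u are made-up example values, with the fixed u standing in for the 53-bit prn64() draw):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    unsigned lg_sample = 19;  /* Mean of 2^19 bytes between samples. */
    double u = 0.37;          /* Stand-in for a uniform draw in (0, 1). */
    double p = 1.0 / (double)((uint64_t)1U << lg_sample);
    uint64_t threshold = (uint64_t)(log(u) / log(1.0 - p)) + (uint64_t)1U;

    /* Prints a value near 2^19 * -log(0.37), roughly 521000. */
    printf("next sample after %llu bytes\n", (unsigned long long)threshold);
    return (0);
}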
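The sentinel convention threaded through these inlines is easy to miss: prof_alloc_prep() returns NULL on OOM, (prof_thr_cnt_t *)1 for "allocated but not sampled", and a real counter pointer for sampled allocations, and prof_malloc() accepts either of the latter two. A sketch of the call sequence an allocation path is expected to use; do_allocation() is a placeholder, not a jemalloc function:

/*
 * Sketch of the expected caller sequence. usize must already be a
 * usable size, since prof_alloc_prep() asserts size == s2u(size).
 */
static void *
sampled_malloc(size_t usize)
{
    prof_thr_cnt_t *cnt;
    void *ret;

    cnt = prof_alloc_prep(usize); /* May capture a backtrace. */
    if (cnt == NULL)
        return (NULL);            /* prof_tdata_init() failed. */
    ret = do_allocation(usize);   /* Placeholder arena/huge allocation. */
    if (ret == NULL)
        return (NULL);            /* Counters were never updated. */
    prof_malloc(ret, usize, cnt); /* Commits; handles the "1" sentinel. */
    return (ret);
}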