path: root/deps/jemalloc/include/jemalloc/internal/bit_util.h
Diffstat (limited to 'deps/jemalloc/include/jemalloc/internal/bit_util.h')
-rw-r--r--  deps/jemalloc/include/jemalloc/internal/bit_util.h  457
1 file changed, 320 insertions(+), 137 deletions(-)
diff --git a/deps/jemalloc/include/jemalloc/internal/bit_util.h b/deps/jemalloc/include/jemalloc/internal/bit_util.h
index c045eb868..bac59140f 100644
--- a/deps/jemalloc/include/jemalloc/internal/bit_util.h
+++ b/deps/jemalloc/include/jemalloc/internal/bit_util.h
@@ -3,144 +3,383 @@
#include "jemalloc/internal/assert.h"
-#define BIT_UTIL_INLINE static inline
-
/* Sanity check. */
#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
|| !defined(JEMALLOC_INTERNAL_FFS)
# error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
#endif
+/*
+ * Unlike the builtins and posix ffs functions, our ffs requires a non-zero
+ * input, and returns the position of the lowest bit set (as opposed to the
+ * posix versions, which return 1 larger than that position and use a return
+ * value of zero as a sentinel). This tends to simplify logic in callers, and
+ * allows for consistency with the builtins we build fls on top of.
+ */
+static inline unsigned
+ffs_llu(unsigned long long x) {
+ util_assume(x != 0);
+ return JEMALLOC_INTERNAL_FFSLL(x) - 1;
+}
-BIT_UTIL_INLINE unsigned
-ffs_llu(unsigned long long bitmap) {
- return JEMALLOC_INTERNAL_FFSLL(bitmap);
+static inline unsigned
+ffs_lu(unsigned long x) {
+ util_assume(x != 0);
+ return JEMALLOC_INTERNAL_FFSL(x) - 1;
}
-BIT_UTIL_INLINE unsigned
-ffs_lu(unsigned long bitmap) {
- return JEMALLOC_INTERNAL_FFSL(bitmap);
+static inline unsigned
+ffs_u(unsigned x) {
+ util_assume(x != 0);
+ return JEMALLOC_INTERNAL_FFS(x) - 1;
}
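/*
 * Editor's sketch, not part of this patch: the convention change described in
 * the comment above, shown on concrete values. The new wrappers return the
 * 0-based position of the lowest set bit and require a non-zero argument,
 * whereas POSIX-style ffs() returns that position plus one and reserves 0 as
 * a "no bit set" sentinel. assert() comes from the assert.h included above.
 */
static void
ffs_convention_example(void) {
	assert(ffs_u(0x08U) == 3);		/* lowest set bit is bit 3 */
	assert(ffs_lu(0x1UL) == 0);		/* bit 0 -> position 0 */
	assert(ffs_llu(1ULL << 40) == 40);
	/* ffs_u(0) is no longer valid; callers must pass a non-zero value. */
}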
-BIT_UTIL_INLINE unsigned
-ffs_u(unsigned bitmap) {
- return JEMALLOC_INTERNAL_FFS(bitmap);
+#define DO_FLS_SLOW(x, suffix) do { \
+ util_assume(x != 0); \
+ x |= (x >> 1); \
+ x |= (x >> 2); \
+ x |= (x >> 4); \
+ x |= (x >> 8); \
+ x |= (x >> 16); \
+ if (sizeof(x) > 4) { \
+ /* \
+ * If sizeof(x) is 4, then the expression "x >> 32" \
+ * will generate compiler warnings even if the code \
+ * never executes. This circumvents the warning, and \
+ * gets compiled out in optimized builds. \
+ */ \
+ int constant_32 = sizeof(x) * 4; \
+ x |= (x >> constant_32); \
+ } \
+ x++; \
+ if (x == 0) { \
+ return 8 * sizeof(x) - 1; \
+ } \
+ return ffs_##suffix(x) - 1; \
+} while(0)
+
+static inline unsigned
+fls_llu_slow(unsigned long long x) {
+ DO_FLS_SLOW(x, llu);
}
-#ifdef JEMALLOC_INTERNAL_POPCOUNTL
-BIT_UTIL_INLINE unsigned
+static inline unsigned
+fls_lu_slow(unsigned long x) {
+ DO_FLS_SLOW(x, lu);
+}
+
+static inline unsigned
+fls_u_slow(unsigned x) {
+ DO_FLS_SLOW(x, u);
+}
+
+#undef DO_FLS_SLOW
+
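/*
 * Editor's sketch, not part of this patch: the slow fls path on a concrete
 * value. For x = 0x28 the highest set bit is bit 5; the shifts smear that bit
 * into every lower position (0x3f), the increment yields the next power of
 * two (0x40), and ffs of that power of two is one past the original msb.
 * The second case assumes a 32-bit unsigned.
 */
static void
fls_slow_example(void) {
	/* 0x28 -> smear -> 0x3f -> ++ -> 0x40; ffs_u(0x40) == 6, minus 1 == 5. */
	assert(fls_u_slow(0x28U) == 5);
	/* Top bit set: the increment wraps to 0, caught by the early return. */
	assert(fls_u_slow(0xffffffffU) == 31);
}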
+#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
+static inline unsigned
+fls_llu(unsigned long long x) {
+ util_assume(x != 0);
+ /*
+ * Note that the xor here is more naturally written as subtraction; the
+ * last bit set is the number of bits in the type minus the number of
+ * leading zero bits. But GCC implements that as:
+ * bsr edi, edi
+ * mov eax, 31
+ * xor edi, 31
+ * sub eax, edi
+ * If we write it as xor instead, then we get
+ * bsr eax, edi
+ * as desired.
+ */
+ return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+ util_assume(x != 0);
+ return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+ util_assume(x != 0);
+ return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
+}
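/*
 * Editor's note, not part of this patch: a worked instance of the xor trick
 * above. For x = 0x28, __builtin_clzll(x) == 58; since 58 occupies only the
 * low six bits, all of which are set in 63, 63 ^ 58 == 63 - 58 == 5, the
 * index of the highest set bit, and the compiler can use the bsr result
 * directly.
 */
static void
fls_xor_example(void) {
	unsigned long long x = 0x28ULL;
	assert(__builtin_clzll(x) == 58);	/* 64 bits - 6 significant bits */
	assert(fls_llu(x) == 5);
}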
+#elif defined(_MSC_VER)
+
+#if LG_SIZEOF_PTR == 3
+#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
+#else
+/*
+ * This never actually runs; we're just dodging a compiler error for the
+ * never-taken branch where sizeof(void *) == 8.
+ */
+#define DO_BSR64(bit, x) bit = 0; unreachable()
+#endif
+
+#define DO_FLS(x) do { \
+ if (x == 0) { \
+ return 8 * sizeof(x); \
+ } \
+ unsigned long bit; \
+ if (sizeof(x) == 4) { \
+ _BitScanReverse(&bit, (unsigned)x); \
+ return (unsigned)bit; \
+ } \
+ if (sizeof(x) == 8 && sizeof(void *) == 8) { \
+ DO_BSR64(bit, x); \
+ return (unsigned)bit; \
+ } \
+ if (sizeof(x) == 8 && sizeof(void *) == 4) { \
+ /* Dodge a compiler warning, as above. */ \
+ int constant_32 = sizeof(x) * 4; \
+ if (_BitScanReverse(&bit, \
+ (unsigned)(x >> constant_32))) { \
+ return 32 + (unsigned)bit; \
+ } else { \
+ _BitScanReverse(&bit, (unsigned)x); \
+ return (unsigned)bit; \
+ } \
+ } \
+ unreachable(); \
+} while (0)
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+ DO_FLS(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+ DO_FLS(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+ DO_FLS(x);
+}
+
+#undef DO_FLS
+#undef DO_BSR64
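/*
 * Editor's sketch, not part of this patch: the high/low split that the
 * 32-bit-pointer path above performs with _BitScanReverse, written with the
 * portable helpers instead. fls_u64_split is a hypothetical name used only
 * for this illustration; it assumes a non-zero input and a 32-bit unsigned.
 */
static inline unsigned
fls_u64_split(uint64_t x) {
	unsigned hi = (unsigned)(x >> 32);
	if (hi != 0) {
		/* The highest set bit lives in the upper 32 bits. */
		return 32 + fls_u(hi);
	}
	/* x != 0 and the upper word is empty, so the lower word is non-zero. */
	return fls_u((unsigned)x);
}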
+#else
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+ return fls_llu_slow(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+ return fls_lu_slow(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+ return fls_u_slow(x);
+}
+#endif
+
+#if LG_SIZEOF_LONG_LONG > 3
+# error "Haven't implemented popcount for 16-byte ints."
+#endif
+
+#define DO_POPCOUNT(x, type) do { \
+ /* \
+ * Algorithm from an old AMD optimization reference manual. \
+ * We're putting a little bit more work than you might expect \
+ * into the no-intrinsic case, since we only support the \
+ * GCC intrinsics spelling of popcount (for now). Detecting \
+ * whether or not the popcount builtin is actually usable in \
+ * MSVC is nontrivial. \
+ */ \
+ \
+ type bmul = (type)0x0101010101010101ULL; \
+ \
+ /* \
+ * Replace each 2 bits with the sideways sum of the original \
+ * values. 0x5 = 0b0101. \
+ * \
+ * You might expect this to be: \
+ * x = (x & 0x55...) + ((x >> 1) & 0x55...). \
+ * That costs an extra mask relative to this, though. \
+ */ \
+ x = x - ((x >> 1) & (0x55U * bmul)); \
+ /* Replace each 4 bits with their sideways sum. 0x3 = 0b0011. */\
+ x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U)); \
+ /* \
+ * Replace each 8 bits with their sideways sum. Note that we \
+ * can't overflow within each 4-bit sum here, so we can skip \
+ * the initial mask. \
+ */ \
+ x = (x + (x >> 4)) & (bmul * 0x0FU); \
+ /* \
+ * None of the partial sums in this multiplication (viewed in \
+ * base-256) can overflow into the next digit. So the least \
+ * significant byte of the product will be the least \
+ * significant byte of the original value, the second least \
+ * significant byte will be the sum of the two least \
+ * significant bytes of the original value, and so on. \
+ * Importantly, the high byte will be the byte-wise sum of all \
+ * the bytes of the original value. \
+ */ \
+ x = x * bmul; \
+ x >>= ((sizeof(x) - 1) * 8); \
+ return (unsigned)x; \
+} while(0)
+
+static inline unsigned
+popcount_u_slow(unsigned bitmap) {
+ DO_POPCOUNT(bitmap, unsigned);
+}
+
+static inline unsigned
+popcount_lu_slow(unsigned long bitmap) {
+ DO_POPCOUNT(bitmap, unsigned long);
+}
+
+static inline unsigned
+popcount_llu_slow(unsigned long long bitmap) {
+ DO_POPCOUNT(bitmap, unsigned long long);
+}
+
+#undef DO_POPCOUNT
+
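/*
 * Editor's sketch, not part of this patch: the SWAR steps above on a concrete
 * 32-bit value, x = 0xF0F00F0F (16 bits set). Each step widens the field that
 * holds a partial count: 2-bit pair sums, then 4-bit nibble sums, then byte
 * sums, and finally the multiply accumulates every byte into the top byte.
 */
static void
popcount_slow_example(void) {
	assert(popcount_u_slow(0xF0F00F0FU) == 16);
	assert(popcount_llu_slow(0xFFULL << 56) == 8);
}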
+static inline unsigned
+popcount_u(unsigned bitmap) {
+#ifdef JEMALLOC_INTERNAL_POPCOUNT
+ return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
+#else
+ return popcount_u_slow(bitmap);
+#endif
+}
+
+static inline unsigned
popcount_lu(unsigned long bitmap) {
- return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
+#ifdef JEMALLOC_INTERNAL_POPCOUNTL
+ return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
+#else
+ return popcount_lu_slow(bitmap);
+#endif
}
+
+static inline unsigned
+popcount_llu(unsigned long long bitmap) {
+#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
+ return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
+#else
+ return popcount_llu_slow(bitmap);
#endif
+}
/*
* Finds the lowest set bit in *bitmap, clears it, and returns
* that bit's position. bitmap *must not* be 0.
*/
-BIT_UTIL_INLINE size_t
+static inline size_t
cfs_lu(unsigned long* bitmap) {
- size_t bit = ffs_lu(*bitmap) - 1;
+ util_assume(*bitmap != 0);
+ size_t bit = ffs_lu(*bitmap);
*bitmap ^= ZU(1) << bit;
return bit;
}
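/*
 * Editor's sketch, not part of this patch: one common way cfs_lu can be used,
 * draining a word by visiting each set bit exactly once in ascending order.
 */
static void
cfs_lu_example(void) {
	unsigned long bits = 0x15UL;	/* bits 0, 2, and 4 set */
	while (bits != 0) {
		size_t bit = cfs_lu(&bits);
		/* Visits 0, then 2, then 4; bits is 0 afterwards. */
		(void)bit;
	}
}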
-BIT_UTIL_INLINE unsigned
-ffs_zu(size_t bitmap) {
+static inline unsigned
+ffs_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
- return ffs_u(bitmap);
+ return ffs_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
- return ffs_lu(bitmap);
+ return ffs_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
- return ffs_llu(bitmap);
+ return ffs_llu(x);
#else
#error No implementation for size_t ffs()
#endif
}
-BIT_UTIL_INLINE unsigned
-ffs_u64(uint64_t bitmap) {
+static inline unsigned
+fls_zu(size_t x) {
+#if LG_SIZEOF_PTR == LG_SIZEOF_INT
+ return fls_u(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
+ return fls_lu(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
+ return fls_llu(x);
+#else
+#error No implementation for size_t fls()
+#endif
+}
+
+
+static inline unsigned
+ffs_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
- return ffs_lu(bitmap);
+ return ffs_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
- return ffs_llu(bitmap);
+ return ffs_llu(x);
#else
#error No implementation for 64-bit ffs()
#endif
}
-BIT_UTIL_INLINE unsigned
-ffs_u32(uint32_t bitmap) {
+static inline unsigned
+fls_u64(uint64_t x) {
+#if LG_SIZEOF_LONG == 3
+ return fls_lu(x);
+#elif LG_SIZEOF_LONG_LONG == 3
+ return fls_llu(x);
+#else
+#error No implementation for 64-bit fls()
+#endif
+}
+
+static inline unsigned
+ffs_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
- return ffs_u(bitmap);
+ return ffs_u(x);
#else
#error No implementation for 32-bit ffs()
#endif
- return ffs_u(bitmap);
+ return ffs_u(x);
+}
+
+static inline unsigned
+fls_u32(uint32_t x) {
+#if LG_SIZEOF_INT == 2
+ return fls_u(x);
+#else
+#error No implementation for 32-bit fls()
+#endif
+ return fls_u(x);
}
-BIT_UTIL_INLINE uint64_t
+static inline uint64_t
pow2_ceil_u64(uint64_t x) {
-#if (defined(__amd64__) || defined(__x86_64__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ))
- if(unlikely(x <= 1)) {
+ if (unlikely(x <= 1)) {
return x;
}
- size_t msb_on_index;
-#if (defined(__amd64__) || defined(__x86_64__))
- asm ("bsrq %1, %0"
- : "=r"(msb_on_index) // Outputs.
- : "r"(x-1) // Inputs.
- );
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
- msb_on_index = (63 ^ __builtin_clzll(x - 1));
-#endif
+ size_t msb_on_index = fls_u64(x - 1);
+ /*
+ * Range-check; it's on the callers to ensure that the result of this
+ * call won't overflow.
+ */
assert(msb_on_index < 63);
return 1ULL << (msb_on_index + 1);
-#else
- x--;
- x |= x >> 1;
- x |= x >> 2;
- x |= x >> 4;
- x |= x >> 8;
- x |= x >> 16;
- x |= x >> 32;
- x++;
- return x;
-#endif
}
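/*
 * Editor's sketch, not part of this patch: pow2_ceil_u64 rounds up to the
 * next power of two and leaves exact powers of two, 1, and 0 unchanged.
 */
static void
pow2_ceil_example(void) {
	assert(pow2_ceil_u64(5) == 8);
	assert(pow2_ceil_u64(8) == 8);
	assert(pow2_ceil_u64(1) == 1);
	assert(pow2_ceil_u64(0) == 0);
}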
-BIT_UTIL_INLINE uint32_t
+static inline uint32_t
pow2_ceil_u32(uint32_t x) {
-#if ((defined(__i386__) || defined(JEMALLOC_HAVE_BUILTIN_CLZ)) && (!defined(__s390__)))
- if(unlikely(x <= 1)) {
- return x;
+ if (unlikely(x <= 1)) {
+ return x;
}
- size_t msb_on_index;
-#if (defined(__i386__))
- asm ("bsr %1, %0"
- : "=r"(msb_on_index) // Outputs.
- : "r"(x-1) // Inputs.
- );
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
- msb_on_index = (31 ^ __builtin_clz(x - 1));
-#endif
+ size_t msb_on_index = fls_u32(x - 1);
+ /* As above. */
assert(msb_on_index < 31);
return 1U << (msb_on_index + 1);
-#else
- x--;
- x |= x >> 1;
- x |= x >> 2;
- x |= x >> 4;
- x |= x >> 8;
- x |= x >> 16;
- x++;
- return x;
-#endif
}
/* Compute the smallest power of 2 that is >= x. */
-BIT_UTIL_INLINE size_t
+static inline size_t
pow2_ceil_zu(size_t x) {
#if (LG_SIZEOF_PTR == 3)
return pow2_ceil_u64(x);
@@ -149,77 +388,21 @@ pow2_ceil_zu(size_t x) {
#endif
}
-#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
- size_t ret;
- assert(x != 0);
-
- asm ("bsr %1, %0"
- : "=r"(ret) // Outputs.
- : "r"(x) // Inputs.
- );
- assert(ret < UINT_MAX);
- return (unsigned)ret;
-}
-#elif (defined(_MSC_VER))
-BIT_UTIL_INLINE unsigned
+static inline unsigned
lg_floor(size_t x) {
- unsigned long ret;
-
- assert(x != 0);
-
+ util_assume(x != 0);
#if (LG_SIZEOF_PTR == 3)
- _BitScanReverse64(&ret, x);
-#elif (LG_SIZEOF_PTR == 2)
- _BitScanReverse(&ret, x);
+ return fls_u64(x);
#else
-# error "Unsupported type size for lg_floor()"
+ return fls_u32(x);
#endif
- assert(ret < UINT_MAX);
- return (unsigned)ret;
}
-#elif (defined(JEMALLOC_HAVE_BUILTIN_CLZ))
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
- assert(x != 0);
-#if (LG_SIZEOF_PTR == LG_SIZEOF_INT)
- return ((8 << LG_SIZEOF_PTR) - 1) - __builtin_clz(x);
-#elif (LG_SIZEOF_PTR == LG_SIZEOF_LONG)
- return ((8 << LG_SIZEOF_PTR) - 1) - __builtin_clzl(x);
-#else
-# error "Unsupported type size for lg_floor()"
-#endif
-}
-#else
-BIT_UTIL_INLINE unsigned
-lg_floor(size_t x) {
- assert(x != 0);
-
- x |= (x >> 1);
- x |= (x >> 2);
- x |= (x >> 4);
- x |= (x >> 8);
- x |= (x >> 16);
-#if (LG_SIZEOF_PTR == 3)
- x |= (x >> 32);
-#endif
- if (x == SIZE_T_MAX) {
- return (8 << LG_SIZEOF_PTR) - 1;
- }
- x++;
- return ffs_zu(x) - 2;
-}
-#endif
-
-BIT_UTIL_INLINE unsigned
+static inline unsigned
lg_ceil(size_t x) {
return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
}
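/*
 * Editor's sketch, not part of this patch: lg_floor truncates, lg_ceil rounds
 * up, and the two agree exactly on powers of two.
 */
static void
lg_example(void) {
	assert(lg_floor(10) == 3);
	assert(lg_ceil(10) == 4);
	assert(lg_floor(8) == 3);
	assert(lg_ceil(8) == 3);
}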
-#undef BIT_UTIL_INLINE
-
/* A compile-time version of lg_floor and lg_ceil. */
#define LG_FLOOR_1(x) 0
#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))