author    Giovanni Campagna <gcampagn@cs.stanford.edu>  2015-07-17 11:55:49 +0100
committer Simon Marlow <marlowsd@gmail.com>             2015-07-22 17:50:02 +0100
commit    0d1a8d09f452977aadef7897aa12a8d41c7a4af0 (patch)
tree      3e8404c7f37c77b67ca913521e6890d6491f4721 /includes/rts
parent    b949c96b4960168a3b399fe14485b24a2167b982 (diff)
download  haskell-0d1a8d09f452977aadef7897aa12a8d41c7a4af0.tar.gz
Two-step allocator for 64-bit systems
Summary:

The current OS memory allocator conflates the concepts of allocating
address space and allocating memory, which makes the HEAP_ALLOCED()
implementation excessively complicated (the only thing it cares about
is the address-space layout) and slow. Instead, what we want is to
allocate a single insanely large contiguous block of address space up
front (to make HEAP_ALLOCED() checks fast), and then commit
sub-portions of it in 1MB blocks as we did before.

This is currently behind a flag, USE_LARGE_ADDRESS_SPACE, which is
only enabled for certain OSes.

Test Plan: validate

Reviewers: simonmar, ezyang, austin

Subscribers: thomie, carter

Differential Revision: https://phabricator.haskell.org/D524

GHC Trac Issues: #9706
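To make the reserve-then-commit idea concrete, here is a minimal POSIX
sketch of the two-step scheme the summary describes: reserve a huge
range of address space with mmap(PROT_NONE), then commit 1MB
megablocks inside it with mprotect as they are needed. All names,
sizes, and flags below are illustrative assumptions, not code from the
patch.

#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

#define MBLOCK_SIZE   (1UL << 20)   /* 1MB megablocks, as in the RTS     */
#define RESERVE_SIZE  (1UL << 38)   /* 256GB of address space; made up   */

static char  *heap_base;            /* start of the reserved region      */
static size_t committed;            /* bytes committed so far            */

/* Step 1: reserve address space only; no memory is committed yet. */
static int reserve_heap(void)
{
    void *p = mmap(NULL, RESERVE_SIZE, PROT_NONE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);
    if (p == MAP_FAILED) return -1;
    heap_base = p;
    return 0;
}

/* Step 2: commit the next 1MB megablock inside the reservation. */
static void *commit_mblock(void)
{
    char *block = heap_base + committed;
    if (mprotect(block, MBLOCK_SIZE, PROT_READ | PROT_WRITE) != 0)
        return NULL;
    committed += MBLOCK_SIZE;
    return block;
}

/* With one contiguous reservation, "is p in the heap?" is just a
   pair of comparisons: no table, no cache, no miss path. */
static int in_heap(void *p)
{
    return (char *)p >= heap_base && (char *)p < heap_base + RESERVE_SIZE;
}

int main(void)
{
    if (reserve_heap() != 0) return 1;
    char *m = commit_mblock();
    if (m != NULL && in_heap(m)) {
        m[0] = 42;
        printf("committed mblock at %p\n", (void *)m);
    }
    return 0;
}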
Diffstat (limited to 'includes/rts')
-rw-r--r--  includes/rts/storage/MBlock.h | 194
1 file changed, 3 insertions(+), 191 deletions(-)
diff --git a/includes/rts/storage/MBlock.h b/includes/rts/storage/MBlock.h
index 29105cae93..046990eea9 100644
--- a/includes/rts/storage/MBlock.h
+++ b/includes/rts/storage/MBlock.h
@@ -19,203 +19,15 @@ extern void initMBlocks(void);
extern void * getMBlock(void);
extern void * getMBlocks(nat n);
extern void freeMBlocks(void *addr, nat n);
+extern void releaseFreeMemory(void);
extern void freeAllMBlocks(void);
-extern void *getFirstMBlock(void);
-extern void *getNextMBlock(void *mblock);
+extern void *getFirstMBlock(void **state);
+extern void *getNextMBlock(void **state, void *mblock);
#ifdef THREADED_RTS
// needed for HEAP_ALLOCED below
extern SpinLock gc_alloc_block_sync;
#endif
-/* -----------------------------------------------------------------------------
- The HEAP_ALLOCED() test.
-
- HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
- It needs to be FAST.
-
- See wiki commentary at
- http://ghc.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
-
- Implementation of HEAP_ALLOCED
- ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- Since the heap is allocated in chunks of megablocks (MBLOCK_SIZE),
- we can just use a table to record which megablocks in the address
- space belong to the heap. On a 32-bit machine, with 1MB
- megablocks, using 8 bits for each entry in the table, the table
- requires 4KB. Lookups during GC will be fast, because the table
- will be quickly cached (indeed, performance measurements showed no
- measurable difference between doing the table lookup and using a
- constant comparison).
-
- On 64-bit machines, we cache one 12-bit block map that describes
- 4096 megablocks or 4GB of memory. If HEAP_ALLOCED is called for
- an address that is not in the cache, it calls HEAP_ALLOCED_miss
- (see MBlock.c), which will find the block map for the 4GB block in
- question.
- -------------------------------------------------------------------------- */
-
-#if SIZEOF_VOID_P == 4
-extern StgWord8 mblock_map[];
-
-/* On a 32-bit machine a 4KB table is always sufficient */
-# define MBLOCK_MAP_SIZE 4096
-# define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
-# define HEAP_ALLOCED(p) mblock_map[MBLOCK_MAP_ENTRY(p)]
-# define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
-
-/* -----------------------------------------------------------------------------
- HEAP_ALLOCED for 64-bit machines.
-
- Here are some cache layout options:
-
- [1]
- 16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
- mblock size = 20 bits
- entries = 8192 13 bits
- line size = 0 bits (1 bit of value)
- tag size = 15 bits
- = 48 bits
-
- [2]
- 32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
- mblock size = 20 bits
- entries = 16384 14 bits
- line size = 2 bits (4 bits of value)
- tag size = 12 bits
- = 48 bits
-
- [3]
- 16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
- mblock size = 20 bits
- entries = 8192 13 bits
- line size = 1 bits (2 bits of value)
- tag size = 14 bits
- = 48 bits
-
- [4]
- 4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
- mblock size = 20 bits
- entries = 1024 10 bits
- line size = 4 bits (16 bits of value)
- tag size = 14 bits
- = 48 bits
-
- [5]
- 4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
- mblock size = 20 bits
- entries = 512 9 bits
- line size = 5 bits (32 bits of value)
- tag size = 14 bits
- = 48 bits
-
- We actually use none of the above. After much experimentation it was
- found that optimising the lookup is the most important factor,
- followed by reducing the number of misses. To that end, we use a
- variant of [1] in which each cache entry is ((mblock << 1) + value)
- where value is 0 for non-heap and 1 for heap. The cache entries can
- be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
- 1 bit for the value. The cache can be as big as we like, but
- currently we use 8k entries, giving us 8GB capacity.
-
- ---------------------------------------------------------------------------- */
-
-#elif SIZEOF_VOID_P == 8
-
-#define MBC_LINE_BITS 0
-#define MBC_TAG_BITS 15
-
-#if x86_64_HOST_ARCH
-// 32 bits are enough for 'entry', as modern amd64 boxes have
-// only 48-bit virtual addresses.
-typedef StgWord32 MbcCacheLine;
-#else
-// 32 bits are not enough here, as some arches (like ia64) use
-// the upper address bits to distinguish memory areas.
-typedef StgWord64 MbcCacheLine;
-#endif
-
-typedef StgWord8 MBlockMapLine;
-
-#define MBLOCK_MAP_LINE(p) (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
-
-#define MBC_LINE_SIZE (1<<MBC_LINE_BITS)
-#define MBC_SHIFT (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
-#define MBC_ENTRIES (1<<MBC_SHIFT)
-
-extern MbcCacheLine mblock_cache[];
-
-#define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
-
-#define MBLOCK_MAP_ENTRIES (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
-
-typedef struct {
- StgWord32 addrHigh32;
- MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
-} MBlockMap;
-
-extern W_ mpc_misses;
-
-StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
-
-INLINE_HEADER
-StgBool HEAP_ALLOCED(void *p)
-{
- StgWord mblock;
- nat entry_no;
- MbcCacheLine entry, value;
-
- mblock = (StgWord)p >> MBLOCK_SHIFT;
- entry_no = mblock & (MBC_ENTRIES-1);
- entry = mblock_cache[entry_no];
- value = entry ^ (mblock << 1);
- // this formulation coaxes gcc into prioritising the value==1
- // case, which we expect to be the most common.
- // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
- if (value == 1) {
- return 1;
- } else if (value == 0) {
- return 0;
- } else {
- // putting the rest out of line turned out to be a slight
- // performance improvement:
- return HEAP_ALLOCED_miss(mblock,p);
- }
-}
-
-// In the parallel GC, the cache itself is safe to *read*, and can be
-// updated atomically, but we need to place a lock around operations
-// that touch the MBlock map.
-INLINE_HEADER
-StgBool HEAP_ALLOCED_GC(void *p)
-{
- StgWord mblock;
- nat entry_no;
- MbcCacheLine entry, value;
- StgBool b;
-
- mblock = (StgWord)p >> MBLOCK_SHIFT;
- entry_no = mblock & (MBC_ENTRIES-1);
- entry = mblock_cache[entry_no];
- value = entry ^ (mblock << 1);
- if (value == 1) {
- return 1;
- } else if (value == 0) {
- return 0;
- } else {
- // putting the rest out of line turned out to be a slight
- // performance improvement:
- ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
- b = HEAP_ALLOCED_miss(mblock,p);
- RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
- return b;
- }
-}
-
-#else
-# error HEAP_ALLOCED not defined
-#endif
-
#endif /* RTS_STORAGE_MBLOCK_H */
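For contrast with the map-and-cache machinery deleted above, here is a
minimal sketch of the shape HEAP_ALLOCED() can take once the whole
heap lives inside one contiguous reservation. The bounds variables are
hypothetical names, not the actual RTS symbols introduced by the
patch.

INLINE_HEADER StgBool HEAP_ALLOCED (void *p)
{
    /* Assumed bounds of the single reserved region (hypothetical
       names; the real symbols live in the RTS storage manager). */
    extern StgWord heap_space_begin, heap_space_end;
    return (StgWord)p >= heap_space_begin && (StgWord)p < heap_space_end;
}

/* The bounds never change after initialisation, so the same test is
   safe during parallel GC without taking gc_alloc_block_sync:
   there is no longer a miss path to protect. */
#define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)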