author    Simon Marlow <marlowsd@gmail.com>    2009-03-09 12:13:00 +0000
committer Simon Marlow <marlowsd@gmail.com>    2009-03-09 12:13:00 +0000
commit    9fe7b8ea2136a4a07752b2851840c9366706f832 (patch)
tree      cc60d55de58895f8e82432c84711fa0d63544fa2 /rts/sm
parent    1b62aecee4a58f52999cfa53f1c6b7744b29b808 (diff)
download  haskell-9fe7b8ea2136a4a07752b2851840c9366706f832.tar.gz
Redesign 64-bit HEAP_ALLOCED (FIX #2934 at the same time)
After much experimentation, I've found a formulation for HEAP_ALLOCED that (a) improves performance, and (b) doesn't have any race conditions when used concurrently. GC performance on x86_64 should be improved slightly. See extensive comments in MBlock.h for the details.
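The core of the new scheme is visible in the MBlock.h hunks below: each cache entry holds ((mblock << 1) | value), so a single XOR against (mblock << 1) classifies a pointer as heap, non-heap, or cache miss. The following is a minimal standalone model of that encoding, not the RTS code itself; the MODEL_* names and helper functions are invented for illustration, and it assumes 48-bit addresses and 1MB megablocks (MBLOCK_SHIFT = 20), as the comments in the patch do.

#include <stdint.h>
#include <string.h>

#define MODEL_MBLOCK_SHIFT 20                 /* 1MB megablocks            */
#define MODEL_ENTRIES      8192               /* 8k entries => 8GB covered */

static uint32_t model_cache[MODEL_ENTRIES];

/* Mirror initMBlocks(): start with every entry invalid (all-ones). */
static void model_init(void)
{
    memset(model_cache, 0xff, sizeof(model_cache));
}

/* Record the result of a megablock-map lookup for the mblock containing p,
 * as HEAP_ALLOCED_miss() does in the patch. */
static void model_fill(void *p, int is_heap)
{
    uintptr_t mblock = (uintptr_t)p >> MODEL_MBLOCK_SHIFT;
    model_cache[mblock & (MODEL_ENTRIES-1)] =
        (uint32_t)(mblock << 1) | (is_heap ? 1 : 0);
}

/* Fast path: 1 = heap, 0 = non-heap, -1 = miss (consult the mblock map). */
static int model_lookup(void *p)
{
    uintptr_t mblock = (uintptr_t)p >> MODEL_MBLOCK_SHIFT;
    uint32_t  value  = model_cache[mblock & (MODEL_ENTRIES-1)]
                       ^ (uint32_t)(mblock << 1);
    if (value == 1) return 1;   /* tag matched, mblock is heap-allocated */
    if (value == 0) return 0;   /* tag matched, mblock is not heap       */
    return -1;                  /* tag mismatch: fall back to the map    */
}

A miss leaves the slot refilled (model_fill here, HEAP_ALLOCED_miss in the patch), so repeated queries for the same megablock take the branch-free fast path, which is the property the redesign is optimising for.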
Diffstat (limited to 'rts/sm')
-rw-r--r--  rts/sm/Evac.c     7
-rw-r--r--  rts/sm/GCAux.c    2
-rw-r--r--  rts/sm/MBlock.c 150
-rw-r--r--  rts/sm/MBlock.h 146
-rw-r--r--  rts/sm/Scav.c     2
5 files changed, 237 insertions, 70 deletions
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 8d37f2766b..062b73f069 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -29,6 +29,7 @@ StgWord64 whitehole_spin = 0;
#if defined(THREADED_RTS) && !defined(PARALLEL_GC)
#define evacuate(p) evacuate1(p)
+#define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
#endif
#if !defined(PARALLEL_GC)
@@ -364,7 +365,7 @@ loop:
ASSERT(LOOKS_LIKE_CLOSURE_PTR(q));
- if (!HEAP_ALLOCED(q)) {
+ if (!HEAP_ALLOCED_GC(q)) {
if (!major_gc) return;
@@ -780,7 +781,7 @@ unchain_thunk_selectors(StgSelector *p, StgClosure *val)
// invoke eval_thunk_selector(), the recursive calls will not
// evacuate the value (because we want to select on the value,
// not evacuate it), so in this case val is in from-space.
- // ASSERT(!HEAP_ALLOCED(val) || Bdescr((P_)val)->gen_no > N || (Bdescr((P_)val)->flags & BF_EVACUATED));
+ // ASSERT(!HEAP_ALLOCED_GC(val) || Bdescr((P_)val)->gen_no > N || (Bdescr((P_)val)->flags & BF_EVACUATED));
prev = (StgSelector*)((StgClosure *)p)->payload[0];
@@ -834,7 +835,7 @@ eval_thunk_selector (StgClosure **q, StgSelector * p, rtsBool evac)
selector_chain:
bd = Bdescr((StgPtr)p);
- if (HEAP_ALLOCED(p)) {
+ if (HEAP_ALLOCED_GC(p)) {
// If the THUNK_SELECTOR is in to-space or in a generation that we
// are not collecting, then bale out early. We won't be able to
// save any space in any case, and updating with an indirection is
diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c
index 66806f4d3f..c1ff54123d 100644
--- a/rts/sm/GCAux.c
+++ b/rts/sm/GCAux.c
@@ -48,7 +48,7 @@ isAlive(StgClosure *p)
// Problem here is that we sometimes don't set the link field, eg.
// for static closures with an empty SRT or CONSTR_STATIC_NOCAFs.
//
- if (!HEAP_ALLOCED(q)) {
+ if (!HEAP_ALLOCED_GC(q)) {
return p;
}
diff --git a/rts/sm/MBlock.c b/rts/sm/MBlock.c
index 28aa7b0a43..b3fa13b0bc 100644
--- a/rts/sm/MBlock.c
+++ b/rts/sm/MBlock.c
@@ -17,13 +17,10 @@
#include "Trace.h"
#include "OSMem.h"
-lnat mblocks_allocated = 0;
+#include <string.h>
-void
-initMBlocks(void)
-{
- osMemInit();
-}
+lnat mblocks_allocated = 0;
+lnat mpc_misses = 0;
/* -----------------------------------------------------------------------------
The MBlock Map: provides our implementation of HEAP_ALLOCED()
@@ -31,12 +28,21 @@ initMBlocks(void)
#if SIZEOF_VOID_P == 4
StgWord8 mblock_map[MBLOCK_MAP_SIZE]; // initially all zeros
+
+static void
+markHeapAlloced(void *p)
+{
+ mblock_map[MBLOCK_MAP_ENTRY(p)] = 1;
+}
+
#elif SIZEOF_VOID_P == 8
-static MBlockMap dummy_mblock_map;
-MBlockMap *mblock_cache = &dummy_mblock_map;
-nat mblock_map_count = 0;
+
MBlockMap **mblock_maps = NULL;
+nat mblock_map_count = 0;
+
+MbcCacheLine mblock_cache[MBC_ENTRIES];
+
static MBlockMap *
findMBlockMap(void *p)
{
@@ -52,39 +58,56 @@ findMBlockMap(void *p)
return NULL;
}
-StgBool
-slowIsHeapAlloced(void *p)
+StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p)
{
- MBlockMap *map = findMBlockMap(p);
- if(map)
+ MBlockMap *map;
+ MBlockMapLine value;
+ nat entry_no;
+
+ entry_no = mblock & (MBC_ENTRIES-1);
+
+ map = findMBlockMap(p);
+ if (map)
{
- mblock_cache = map;
- return map->mblocks[MBLOCK_MAP_ENTRY(p)];
+ mpc_misses++;
+ value = map->lines[MBLOCK_MAP_LINE(p)];
+ mblock_cache[entry_no] = (mblock<<1) | value;
+ return value;
}
else
- return 0;
+ {
+ mblock_cache[entry_no] = (mblock<<1);
+ return 0;
+ }
}
-#endif
static void
markHeapAlloced(void *p)
{
-#if SIZEOF_VOID_P == 4
- mblock_map[MBLOCK_MAP_ENTRY(p)] = 1;
-#elif SIZEOF_VOID_P == 8
MBlockMap *map = findMBlockMap(p);
if(map == NULL)
{
mblock_map_count++;
mblock_maps = realloc(mblock_maps,
sizeof(MBlockMap*) * mblock_map_count);
- map = mblock_maps[mblock_map_count-1] = calloc(1,sizeof(MBlockMap));
+ map = mblock_maps[mblock_map_count-1] =
+ stgMallocBytes(sizeof(MBlockMap),"markHeapAlloced");
+ memset(map,0,sizeof(MBlockMap));
map->addrHigh32 = (StgWord32) (((StgWord)p) >> 32);
}
- map->mblocks[MBLOCK_MAP_ENTRY(p)] = 1;
- mblock_cache = map;
-#endif
+
+ map->lines[MBLOCK_MAP_LINE(p)] = 1;
+
+ {
+ StgWord mblock;
+ nat entry_no;
+
+ mblock = (StgWord)p >> MBLOCK_SHIFT;
+ entry_no = mblock & (MBC_ENTRIES-1);
+ mblock_cache[entry_no] = (mblock << 1) + 1;
+ }
}
+#endif
/* ----------------------------------------------------------------------------
Debugging code for traversing the allocated MBlocks
@@ -125,47 +148,59 @@ void * getNextMBlock(void *mblock)
#elif SIZEOF_VOID_P == 8
-STATIC_INLINE
-void * mapEntryToMBlock(MBlockMap *map, nat i)
-{
- return (void *)(((StgWord)map->addrHigh32) << 32) +
- ((StgWord)i << MBLOCK_SHIFT);
-}
-
-void * getFirstMBlock(void)
-{
- MBlockMap *map;
- nat i, j;
-
- for (j = 0; j < mblock_map_count; j++) {
- map = mblock_maps[j];
- for (i = 0; i < MBLOCK_MAP_SIZE; i++) {
- if (map->mblocks[i]) return mapEntryToMBlock(map,i);
- }
- }
- return NULL;
-}
-
-void * getNextMBlock(void *mblock)
+void * getNextMBlock(void *p)
{
MBlockMap *map;
- nat i, j;
+ nat off, j;
+ nat line_no;
+ MBlockMapLine line;
for (j = 0; j < mblock_map_count; j++) {
map = mblock_maps[j];
- if (map->addrHigh32 == (StgWord)mblock >> 32) break;
+ if (map->addrHigh32 == (StgWord)p >> 32) break;
}
if (j == mblock_map_count) return NULL;
for (; j < mblock_map_count; j++) {
map = mblock_maps[j];
- if (map->addrHigh32 == (StgWord)mblock >> 32) {
- i = MBLOCK_MAP_ENTRY(mblock) + 1;
+ if (map->addrHigh32 == (StgWord)p >> 32) {
+ line_no = MBLOCK_MAP_LINE(p);
+ off = (((StgWord)p >> MBLOCK_SHIFT) & (MBC_LINE_SIZE-1)) + 1;
+ // + 1 because we want the *next* mblock
} else {
- i = 0;
+ line_no = 0; off = 0;
}
- for (; i < MBLOCK_MAP_SIZE; i++) {
- if (map->mblocks[i]) return mapEntryToMBlock(map,i);
+ for (; line_no < MBLOCK_MAP_ENTRIES; line_no++) {
+ line = map->lines[line_no];
+ for (; off < MBC_LINE_SIZE; off++) {
+ if (line & (1<<off)) {
+ return (void*)(((StgWord)map->addrHigh32 << 32) +
+ line_no * MBC_LINE_SIZE * MBLOCK_SIZE +
+ off * MBLOCK_SIZE);
+ }
+ }
+ off = 0;
+ }
+ }
+ return NULL;
+}
+
+void * getFirstMBlock(void)
+{
+ MBlockMap *map = mblock_maps[0];
+ nat line_no, off;
+ MbcCacheLine line;
+
+ for (line_no = 0; line_no < MBLOCK_MAP_ENTRIES; line_no++) {
+ line = map->lines[line_no];
+ if (line) {
+ for (off = 0; off < MBC_LINE_SIZE; off++) {
+ if (line & (1<<off)) {
+ return (void*)(((StgWord)map->addrHigh32 << 32) +
+ line_no * MBC_LINE_SIZE * MBLOCK_SIZE +
+ off * MBLOCK_SIZE);
+ }
+ }
}
}
return NULL;
@@ -213,3 +248,12 @@ freeAllMBlocks(void)
{
osFreeAllMBlocks();
}
+
+void
+initMBlocks(void)
+{
+ osMemInit();
+#if SIZEOF_VOID_P == 8
+ memset(mblock_cache,0xff,sizeof(mblock_cache));
+#endif
+}
diff --git a/rts/sm/MBlock.h b/rts/sm/MBlock.h
index ef6f8de33a..f9dddc3138 100644
--- a/rts/sm/MBlock.h
+++ b/rts/sm/MBlock.h
@@ -12,6 +12,8 @@
#ifndef MBLOCK_H
#define MBLOCK_H
+#include "GC.h"
+
extern lnat RTS_VAR(mblocks_allocated);
extern void initMBlocks(void);
@@ -59,25 +61,145 @@ extern StgWord8 mblock_map[];
# define MBLOCK_MAP_SIZE 4096
# define MBLOCK_MAP_ENTRY(p) ((StgWord)(p) >> MBLOCK_SHIFT)
# define HEAP_ALLOCED(p) mblock_map[MBLOCK_MAP_ENTRY(p)]
+# define HEAP_ALLOCED_GC(p) HEAP_ALLOCED(p)
+
+/* -----------------------------------------------------------------------------
+ HEAP_ALLOCED for 64-bit machines.
+
+ Here are some cache layout options:
+
+ [1]
+ 16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
+ mblock size = 20 bits
+ entries = 8192 13 bits
+ line size = 0 bits (1 bit of value)
+ tag size = 15 bits
+ = 48 bits
+
+ [2]
+ 32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
+ mblock size = 20 bits
+ entries = 16384 14 bits
+ line size = 2 bits (4 bits of value)
+ tag size = 12 bits
+ = 48 bits
+
+ [3]
+ 16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
+ mblock size = 20 bits
+ entries = 8192 13 bits
+ line size = 1 bits (2 bits of value)
+ tag size = 14 bits
+ = 48 bits
+
+ [4]
+ 4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
+ mblock size = 20 bits
+ entries = 1024 10 bits
+ line size = 4 bits (16 bits of value)
+ tag size = 14 bits
+ = 48 bits
+
+ [5]
+ 4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
+ mblock size = 20 bits
+ entries = 512 9 bits
+ line size = 5 bits (32 bits of value)
+ tag size = 14 bits
+ = 48 bits
+
+ We actually use none of the above. After much experimentation it was
+ found that optimising the lookup is the most important factor,
+ followed by reducing the number of misses. To that end, we use a
+ variant of [1] in which each cache entry is ((mblock << 1) + value)
+ where value is 0 for non-heap and 1 for heap. The cache entries can
+ be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
+ 1 bit for the value. The cache can be as big as we like, but
+ currently we use 8k entries, giving us 8GB capacity.
+
+ ---------------------------------------------------------------------------- */
#elif SIZEOF_VOID_P == 8
-# define MBLOCK_MAP_SIZE 4096
-# define MBLOCK_MAP_ENTRY(p) (((StgWord)(p) & 0xffffffff) >> MBLOCK_SHIFT)
+#define MBC_LINE_BITS 0
+#define MBC_TAG_BITS 15
+typedef StgWord32 MbcCacheLine; // could use 16, but 32 was faster
+typedef StgWord8 MBlockMapLine;
-typedef struct {
- StgWord32 addrHigh32;
- StgWord8 mblocks[MBLOCK_MAP_SIZE];
-} MBlockMap;
+#define MBLOCK_MAP_LINE(p) (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
-extern MBlockMap *mblock_cache;
+#define MBC_LINE_SIZE (1<<MBC_LINE_BITS)
+#define MBC_SHIFT (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
+#define MBC_ENTRIES (1<<MBC_SHIFT)
-StgBool slowIsHeapAlloced(void *p);
+extern MbcCacheLine mblock_cache[];
+
+#define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
+
+#define MBLOCK_MAP_ENTRIES (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
+
+typedef struct {
+ StgWord32 addrHigh32;
+ MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
+} MBlockMap;
-# define HEAP_ALLOCED(p) \
- ( ((((StgWord)(p)) >> 32) == mblock_cache->addrHigh32) \
- ? mblock_cache->mblocks[MBLOCK_MAP_ENTRY(p)] \
- : slowIsHeapAlloced(p) )
+extern lnat mpc_misses;
+
+StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
+
+INLINE_HEADER
+StgBool HEAP_ALLOCED(void *p)
+{
+ StgWord mblock;
+ nat entry_no;
+ MbcCacheLine entry, value;
+
+ mblock = (StgWord)p >> MBLOCK_SHIFT;
+ entry_no = mblock & (MBC_ENTRIES-1);
+ entry = mblock_cache[entry_no];
+ value = entry ^ (mblock << 1);
+ // this formulation coaxes gcc into prioritising the value==1
+ // case, which we expect to be the most common.
+ // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
+ if (value == 1) {
+ return 1;
+ } else if (value == 0) {
+ return 0;
+ } else {
+ // putting the rest out of line turned out to be a slight
+ // performance improvement:
+ return HEAP_ALLOCED_miss(mblock,p);
+ }
+}
+
+// In the parallel GC, the cache itself is safe to *read*, and can be
+// updated atomically, but we need to place a lock around operations
+// that touch the MBlock map.
+INLINE_HEADER
+StgBool HEAP_ALLOCED_GC(void *p)
+{
+ StgWord mblock;
+ nat entry_no;
+ MbcCacheLine entry, value;
+ StgBool b;
+
+ mblock = (StgWord)p >> MBLOCK_SHIFT;
+ entry_no = mblock & (MBC_ENTRIES-1);
+ entry = mblock_cache[entry_no];
+ value = entry ^ (mblock << 1);
+ if (value == 1) {
+ return 1;
+ } else if (value == 0) {
+ return 0;
+ } else {
+ // putting the rest out of line turned out to be a slight
+ // performance improvement:
+ ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+ b = HEAP_ALLOCED_miss(mblock,p);
+ RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+ return b;
+ }
+}
#else
# error HEAP_ALLOCED not defined
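For concreteness, the geometry implied by the constants added above (MBC_LINE_BITS = 0, MBC_TAG_BITS = 15, a 32-bit MbcCacheLine) can be checked with a few lines of arithmetic. This is an illustrative sketch, not part of the patch; it assumes MBLOCK_SHIFT = 20, as the "mblock size = 20 bits" lines in the layout comment indicate.

#include <stdio.h>

int main(void)
{
    const int mblock_shift  = 20;   /* assumed: "mblock size = 20 bits" */
    const int mbc_line_bits = 0;    /* MBC_LINE_BITS                    */
    const int mbc_tag_bits  = 15;   /* MBC_TAG_BITS                     */

    int  mbc_shift = 48 - mblock_shift - mbc_line_bits - mbc_tag_bits;
    long entries   = 1L << mbc_shift;                          /* MBC_ENTRIES        */
    long map_lines = 1L << (32 - mblock_shift - mbc_line_bits);/* MBLOCK_MAP_ENTRIES */

    printf("MBC_SHIFT          = %d\n",     mbc_shift);                       /* 13   */
    printf("MBC_ENTRIES        = %ld\n",    entries);                         /* 8192 */
    printf("cache size         = %ld KB\n", entries * 4 / 1024);              /* 32KB */
    printf("cache capacity     = %ld GB\n", (entries << mblock_shift) >> 30); /* 8GB  */
    printf("MBLOCK_MAP_ENTRIES = %ld\n",    map_lines);                       /* 4096 */
    return 0;
}

Under those assumptions the cache is 8192 four-byte entries (32KB) covering 8GB of address space, matching the "8k entries, giving us 8GB capacity" note, and each MBlockMap carries 4096 one-byte lines covering one 4GB addrHigh32 region.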
diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c
index 732ad15927..34096d489b 100644
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -1372,7 +1372,7 @@ scavenge_one(StgPtr p)
* evacuated, so we perform that check here.
*/
StgClosure *q = ((StgInd *)p)->indirectee;
- if (HEAP_ALLOCED(q) && Bdescr((StgPtr)q)->flags & BF_EVACUATED) {
+ if (HEAP_ALLOCED_GC(q) && Bdescr((StgPtr)q)->flags & BF_EVACUATED) {
break;
}
evacuate(&((StgInd *)p)->indirectee);