summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSimon Marlow <marlowsd@gmail.com>2014-11-21 17:05:58 +0000
committerSimon Marlow <marlowsd@gmail.com>2014-11-25 14:37:26 +0000
commit452eb80f15fce8665df52bc9facebfafb5b6267b (patch)
treea34e76ab765b6d1db49b1c05372f8924e52cacf9
parente22bc0dedb9e9da0176ad7ce4a74acbefedc7207 (diff)
downloadhaskell-452eb80f15fce8665df52bc9facebfafb5b6267b.tar.gz
Add +RTS -n<size>: divide the nursery into chunks
See the documentation for details.
-rw-r--r--docs/users_guide/runtime_control.xml36
-rw-r--r--includes/rts/Flags.h1
-rw-r--r--rts/RtsFlags.c11
-rw-r--r--rts/Schedule.c6
-rw-r--r--rts/sm/GC.c29
-rw-r--r--rts/sm/Sanity.c9
-rw-r--r--rts/sm/Storage.c117
-rw-r--r--rts/sm/Storage.h4
8 files changed, 155 insertions, 58 deletions
diff --git a/docs/users_guide/runtime_control.xml b/docs/users_guide/runtime_control.xml
index cdd9fd4997..612a44122a 100644
--- a/docs/users_guide/runtime_control.xml
+++ b/docs/users_guide/runtime_control.xml
@@ -365,6 +365,42 @@ $ ./prog -f +RTS -H32m -S -RTS -h foo bar
</varlistentry>
<varlistentry>
+ <term>
+ <option>-n</option><replaceable>size</replaceable>
+ <indexterm><primary><option>-n</option></primary><secondary>RTS option</secondary></indexterm>
+ <indexterm><primary>allocation area, chunk size</primary></indexterm>
+ </term>
+ <listitem>
+ <para>&lsqb;Default: 0, Example:
+ <literal>-n4m</literal>&rsqb; When set to a non-zero value,
+ this option divides the allocation area (<option>-A</option>
+ value) into chunks of the specified size. During execution,
+ when a processor exhausts its current chunk, it is given
+ another chunk from the pool until the pool is exhausted, at
+ which point a collection is triggered.</para>
+
+ <para>This option is only useful when running in parallel
+ (<option>-N2</option> or greater). It allows the processor
+ cores to make better use of the available allocation area,
+ even when cores are allocating at different rates. Without
+ <option>-n</option>, each core gets a fixed-size allocation
+ area specified by the <option>-A</option> option, and the first
+ core to exhaust its allocation area triggers a GC across all
+ the cores. This can result in a collection happening when
+ the allocation areas of some cores are only partially full,
+ so the purpose of the <option>-n</option> option is to allow cores
+ that are allocating faster to get more of the allocation
+ area. This means less frequent GC, leading to a lower GC
+ overhead for the same heap size.</para>
+
+ <para>This is particularly useful in conjunction with larger
+ <option>-A</option> values, for example <option>-A64m
+ -n4m</option> is a useful combination on larger core counts
+ (8+).</para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
<term>
<option>-c</option>
<indexterm><primary><option>-c</option></primary><secondary>RTS option</secondary></indexterm>
diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h
index b707a20edc..1440b14a37 100644
--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -41,6 +41,7 @@ typedef struct _GC_FLAGS {
nat maxHeapSize; /* in *blocks* */
nat minAllocAreaSize; /* in *blocks* */
+ nat nurseryChunkSize; /* in *blocks* */
nat minOldGenSize; /* in *blocks* */
nat heapSizeSuggestion; /* in *blocks* */
rtsBool heapSizeSuggestionAuto;
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 82e90e5b0e..68667005f0 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -108,6 +108,7 @@ void initRtsFlagsDefaults(void)
RtsFlags.GcFlags.stkChunkBufferSize = (1 * 1024) / sizeof(W_);
RtsFlags.GcFlags.minAllocAreaSize = (512 * 1024) / BLOCK_SIZE;
+ RtsFlags.GcFlags.nurseryChunkSize = 0;
RtsFlags.GcFlags.minOldGenSize = (1024 * 1024) / BLOCK_SIZE;
RtsFlags.GcFlags.maxHeapSize = 0; /* off by default */
RtsFlags.GcFlags.heapSizeSuggestion = 0; /* none */
@@ -735,8 +736,16 @@ error = rtsTrue;
break;
case 'A':
OPTION_UNSAFE;
+ // minimum two blocks in the nursery, so that we have one to
+ // grab for allocate().
RtsFlags.GcFlags.minAllocAreaSize
- = decodeSize(rts_argv[arg], 2, BLOCK_SIZE, HS_INT_MAX)
+ = decodeSize(rts_argv[arg], 2, 2*BLOCK_SIZE, HS_INT_MAX)
+ / BLOCK_SIZE;
+ break;
+ case 'n':
+ OPTION_UNSAFE;
+ RtsFlags.GcFlags.nurseryChunkSize
+ = decodeSize(rts_argv[arg], 2, 2*BLOCK_SIZE, HS_INT_MAX)
/ BLOCK_SIZE;
break;
diff --git a/rts/Schedule.c b/rts/Schedule.c
index 447b70ef52..f25b37288d 100644
--- a/rts/Schedule.c
+++ b/rts/Schedule.c
@@ -1169,6 +1169,12 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t )
}
}
+ if (getNewNursery(cap)) {
+ debugTrace(DEBUG_sched, "thread %ld got a new nursery", t->id);
+ pushOnRunQueue(cap,t);
+ return rtsFalse;
+ }
+
if (cap->r.rHpLim == NULL || cap->context_switch) {
// Sometimes we miss a context switch, e.g. when calling
// primitives in a tight loop, MAYBE_GC() doesn't check the
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 748dc0d6de..9777f32a3b 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -408,10 +408,6 @@ GarbageCollect (nat collect_gen,
break;
}
- if (!DEBUG_IS_ON && n_gc_threads != 1) {
- clearNursery(cap);
- }
-
shutdown_gc_threads(gct->thread_index);
// Now see which stable names are still alive.
@@ -646,21 +642,6 @@ GarbageCollect (nat collect_gen,
}
}
- // Reset the nursery: make the blocks empty
- if (DEBUG_IS_ON || n_gc_threads == 1) {
- for (n = 0; n < n_capabilities; n++) {
- clearNursery(capabilities[n]);
- }
- } else {
- // When doing parallel GC, clearNursery() is called by the
- // worker threads
- for (n = 0; n < n_capabilities; n++) {
- if (gc_threads[n]->idle) {
- clearNursery(capabilities[n]);
- }
- }
- }
-
resize_nursery();
resetNurseries();
@@ -1072,10 +1053,6 @@ gcWorkerThread (Capability *cap)
scavenge_until_all_done();
- if (!DEBUG_IS_ON) {
- clearNursery(cap);
- }
-
#ifdef THREADED_RTS
// Now that the whole heap is marked, we discard any sparks that
// were found to be unreachable. The main GC thread is currently
@@ -1716,9 +1693,9 @@ resize_nursery (void)
}
else
{
- // we might have added extra large blocks to the nursery, so
- // resize back to minAllocAreaSize again.
- resizeNurseriesFixed(RtsFlags.GcFlags.minAllocAreaSize);
+ // we might have added extra blocks to the nursery, so
+ // resize back to the original size again.
+ resizeNurseriesFixed();
}
}
}
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index dd50ded063..c4a699e59a 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -765,8 +765,11 @@ findMemoryLeak (void)
markBlocks(generations[g].large_objects);
}
- for (i = 0; i < n_capabilities; i++) {
+ for (i = 0; i < n_nurseries; i++) {
markBlocks(nurseries[i].blocks);
+ }
+
+ for (i = 0; i < n_capabilities; i++) {
markBlocks(capabilities[i]->pinned_object_block);
}
@@ -856,9 +859,11 @@ memInventory (rtsBool show)
}
nursery_blocks = 0;
- for (i = 0; i < n_capabilities; i++) {
+ for (i = 0; i < n_nurseries; i++) {
ASSERT(countBlocks(nurseries[i].blocks) == nurseries[i].n_blocks);
nursery_blocks += nurseries[i].n_blocks;
+ }
+ for (i = 0; i < n_capabilities; i++) {
if (capabilities[i]->pinned_object_block != NULL) {
nursery_blocks += capabilities[i]->pinned_object_block->blocks;
}
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index e4a6984c40..f02c00591c 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -55,6 +55,8 @@ generation *g0 = NULL; /* generation 0, for convenience */
generation *oldest_gen = NULL; /* oldest generation, for convenience */
nursery *nurseries = NULL; /* array of nurseries, size == n_capabilities */
+nat n_nurseries;
+volatile StgWord next_nursery = 0;
#ifdef THREADED_RTS
/*
@@ -65,6 +67,7 @@ Mutex sm_mutex;
#endif
static void allocNurseries (nat from, nat to);
+static void assignNurseriesToCapabilities (nat from, nat to);
static void
initGeneration (generation *gen, int g)
@@ -190,6 +193,7 @@ initStorage (void)
N = 0;
+ next_nursery = 0;
storageAddCapabilities(0, n_capabilities);
IF_DEBUG(gc, statDescribeGens());
@@ -206,13 +210,22 @@ initStorage (void)
void storageAddCapabilities (nat from, nat to)
{
- nat n, g, i;
+ nat n, g, i, new_n_nurseries;
+
+ if (RtsFlags.GcFlags.nurseryChunkSize == 0) {
+ new_n_nurseries = to;
+ } else {
+ memcount total_alloc = to * RtsFlags.GcFlags.minAllocAreaSize;
+ new_n_nurseries =
+ stg_max(to, total_alloc / RtsFlags.GcFlags.nurseryChunkSize);
+ }
if (from > 0) {
- nurseries = stgReallocBytes(nurseries, to * sizeof(struct nursery_),
+ nurseries = stgReallocBytes(nurseries,
+ new_n_nurseries * sizeof(struct nursery_),
"storageAddCapabilities");
} else {
- nurseries = stgMallocBytes(to * sizeof(struct nursery_),
+ nurseries = stgMallocBytes(new_n_nurseries * sizeof(struct nursery_),
"storageAddCapabilities");
}
@@ -228,7 +241,15 @@ void storageAddCapabilities (nat from, nat to)
* don't want it to be a big one. This vague idea is borne out by
* rigorous experimental evidence.
*/
- allocNurseries(from, to);
+ allocNurseries(n_nurseries, new_n_nurseries);
+ n_nurseries = new_n_nurseries;
+
+ /*
+ * Assign each of the new capabilities a nursery. Remember to start from
+ * next_nursery, because we may have already consumed some of the earlier
+ * nurseries.
+ */
+ assignNurseriesToCapabilities(from,to);
// allocate a block for each mut list
for (n = from; n < to; n++) {
@@ -525,15 +546,26 @@ allocNursery (bdescr *tail, W_ blocks)
return &bd[0];
}
+STATIC_INLINE void
+assignNurseryToCapability (Capability *cap, nat n)
+{
+ cap->r.rNursery = &nurseries[n];
+ cap->r.rCurrentNursery = nurseries[n].blocks;
+ newNurseryBlock(nurseries[n].blocks);
+ cap->r.rCurrentAlloc = NULL;
+}
+
+/*
+ * Give each Capability a nursery from the pool. No need to do atomic increments
+ * here, everything must be stopped to call this function.
+ */
static void
assignNurseriesToCapabilities (nat from, nat to)
{
nat i;
for (i = from; i < to; i++) {
- capabilities[i]->r.rCurrentNursery = nurseries[i].blocks;
- newNurseryBlock(nurseries[i].blocks);
- capabilities[i]->r.rCurrentAlloc = NULL;
+ assignNurseryToCapability(capabilities[i], next_nursery++);
}
}
@@ -541,42 +573,46 @@ static void
allocNurseries (nat from, nat to)
{
nat i;
+ memcount n_blocks;
+
+ if (RtsFlags.GcFlags.nurseryChunkSize) {
+ n_blocks = RtsFlags.GcFlags.nurseryChunkSize;
+ } else {
+ n_blocks = RtsFlags.GcFlags.minAllocAreaSize;
+ }
for (i = from; i < to; i++) {
- nurseries[i].blocks =
- allocNursery(NULL, RtsFlags.GcFlags.minAllocAreaSize);
- nurseries[i].n_blocks =
- RtsFlags.GcFlags.minAllocAreaSize;
+ nurseries[i].blocks = allocNursery(NULL, n_blocks);
+ nurseries[i].n_blocks = n_blocks;
}
- assignNurseriesToCapabilities(from, to);
}
void
-clearNursery (Capability *cap USED_IF_DEBUG)
+resetNurseries (void)
{
+ next_nursery = 0;
+ assignNurseriesToCapabilities(0, n_capabilities);
+
#ifdef DEBUG
bdescr *bd;
- for (bd = nurseries[cap->no].blocks; bd; bd = bd->link) {
- ASSERT(bd->gen_no == 0);
- ASSERT(bd->gen == g0);
- IF_DEBUG(sanity, memset(bd->start, 0xaa, BLOCK_SIZE));
+ nat n;
+ for (n = 0; n < n_nurseries; n++) {
+ for (bd = nurseries[n].blocks; bd; bd = bd->link) {
+ ASSERT(bd->gen_no == 0);
+ ASSERT(bd->gen == g0);
+ IF_DEBUG(sanity, memset(bd->start, 0xaa, BLOCK_SIZE));
+ }
}
#endif
}
-void
-resetNurseries (void)
-{
- assignNurseriesToCapabilities(0, n_capabilities);
-}
-
W_
countNurseryBlocks (void)
{
nat i;
W_ blocks = 0;
- for (i = 0; i < n_capabilities; i++) {
+ for (i = 0; i < n_nurseries; i++) {
blocks += nurseries[i].n_blocks;
}
return blocks;
@@ -625,15 +661,30 @@ resizeNursery (nursery *nursery, W_ blocks)
//
// Resize each of the nurseries to the specified size.
//
-void
-resizeNurseriesFixed (W_ blocks)
+static void
+resizeNurseriesEach (W_ blocks)
{
nat i;
- for (i = 0; i < n_capabilities; i++) {
+
+ for (i = 0; i < n_nurseries; i++) {
resizeNursery(&nurseries[i], blocks);
}
}
+void
+resizeNurseriesFixed (void)
+{
+ nat blocks;
+
+ if (RtsFlags.GcFlags.nurseryChunkSize) {
+ blocks = RtsFlags.GcFlags.nurseryChunkSize;
+ } else {
+ blocks = RtsFlags.GcFlags.minAllocAreaSize;
+ }
+
+ resizeNurseriesEach(blocks);
+}
+
//
// Resize the nurseries to the total specified size.
//
@@ -642,9 +693,19 @@ resizeNurseries (W_ blocks)
{
// If there are multiple nurseries, then we just divide the number
// of available blocks between them.
- resizeNurseriesFixed(blocks / n_capabilities);
+ resizeNurseriesEach(blocks / n_nurseries);
}
+rtsBool
+getNewNursery (Capability *cap)
+{
+ StgWord i = atomic_inc(&next_nursery, 1) - 1;
+ if (i >= n_nurseries) {
+ return rtsFalse;
+ }
+ assignNurseryToCapability(cap, i);
+ return rtsTrue;
+}
/* -----------------------------------------------------------------------------
move_STACK is called to update the TSO structure after it has been
diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h
index 943c3e39b7..a4421db3f2 100644
--- a/rts/sm/Storage.h
+++ b/rts/sm/Storage.h
@@ -80,12 +80,14 @@ void dirty_TVAR(Capability *cap, StgTVar *p);
-------------------------------------------------------------------------- */
extern nursery *nurseries;
+extern nat n_nurseries;
void resetNurseries ( void );
void clearNursery ( Capability *cap );
void resizeNurseries ( W_ blocks );
-void resizeNurseriesFixed ( W_ blocks );
+void resizeNurseriesFixed ( void );
W_ countNurseryBlocks ( void );
+rtsBool getNewNursery ( Capability *cap );
/* -----------------------------------------------------------------------------
Allocation accounting