summaryrefslogtreecommitdiff
path: root/includes
diff options
context:
space:
mode:
author Simon Marlow <marlowsd@gmail.com> 2010-12-15 12:08:43 +0000
committer Simon Marlow <marlowsd@gmail.com> 2010-12-15 12:08:43 +0000
commit f30d527344db528618f64a25250a3be557d9f287 (patch)
tree 5b827afed254139a197cbdcdd37bebe8fa859d67 /includes
parent 99b6e6ac44c6c610b0d60e3b70a2341c83d23106 (diff)
download haskell-f30d527344db528618f64a25250a3be557d9f287.tar.gz
Implement stack chunks and separate TSO/STACK objects
This patch makes two changes to the way stacks are managed:

1. The stack is now stored in a separate object from the TSO.

   This means that it is easier to replace the stack object for a thread when the stack overflows or underflows; we don't have to leave behind the old TSO as an indirection any more. Consequently, we can remove ThreadRelocated and deRefTSO(), which were a pain.

   This is obviously the right thing, but the last time I tried to do it, it made performance worse. This time I seem to have cracked it.

2. Stacks are now represented as a chain of chunks, rather than a single monolithic object.

   The big advantage here is that individual chunks are marked clean or dirty according to whether they contain pointers to the young generation, and the GC can avoid traversing clean stack chunks during a young-generation collection. This means that programs with deep stacks will see a big saving in GC overhead when using the default GC settings.

   A secondary advantage is that there is much less copying involved as the stack grows. Programs that quickly grow a deep stack will see big improvements.

In some ways the implementation is simpler, as nothing special needs to be done to reclaim stack as the stack shrinks (the GC just recovers the dead stack chunks). On the other hand, we have to manage stack underflow between chunks, so there's a new stack frame (UNDERFLOW_FRAME), and we now have separate TSO and STACK objects. The total amount of code is probably about the same as before.

There are new RTS flags:

   -ki<size>  Sets the initial thread stack size (default 1k), e.g. -ki4k -ki2m
   -kc<size>  Sets the stack chunk size (default 32k)
   -kb<size>  Sets the stack chunk buffer size (default 1k)

-ki was previously called just -k, and the old name is still accepted for backwards compatibility. These new options are documented.
Diffstat (limited to 'includes')
-rw-r--r-- includes/Cmm.h 6
-rw-r--r-- includes/mkDerivedConstants.c 9
-rw-r--r-- includes/rts/Constants.h 8
-rw-r--r-- includes/rts/Flags.h 2
-rw-r--r-- includes/rts/prof/LDV.h 9
-rw-r--r-- includes/rts/storage/ClosureMacros.h 72
-rw-r--r-- includes/rts/storage/ClosureTypes.h 46
-rw-r--r-- includes/rts/storage/Closures.h 5
-rw-r--r-- includes/rts/storage/TSO.h 82
-rw-r--r-- includes/stg/MiscClosures.h 3
-rw-r--r-- includes/stg/Ticky.h 3
11 files changed, 156 insertions, 89 deletions
diff --git a/includes/Cmm.h b/includes/Cmm.h
index 0088c1aa05..6abe760be5 100644
--- a/includes/Cmm.h
+++ b/includes/Cmm.h
@@ -467,6 +467,12 @@
#define mutArrPtrsCardWords(n) \
ROUNDUP_BYTES_TO_WDS(((n) + (1 << MUT_ARR_PTRS_CARD_BITS) - 1) >> MUT_ARR_PTRS_CARD_BITS)
+#if defined(PROFILING) || (!defined(THREADED_RTS) && defined(DEBUG))
+#define OVERWRITING_CLOSURE(c) foreign "C" overwritingClosure(c "ptr")
+#else
+#define OVERWRITING_CLOSURE(c) /* nothing */
+#endif
+
/* -----------------------------------------------------------------------------
Voluntary Yields/Blocks
diff --git a/includes/mkDerivedConstants.c b/includes/mkDerivedConstants.c
index ade104a4be..0ed7ec67da 100644
--- a/includes/mkDerivedConstants.c
+++ b/includes/mkDerivedConstants.c
@@ -296,9 +296,12 @@ main(int argc, char *argv[])
closure_field(StgTSO, dirty);
closure_field(StgTSO, bq);
closure_field_("StgTSO_CCCS", StgTSO, prof.CCCS);
- tso_field(StgTSO, sp);
- tso_field_offset(StgTSO, stack);
- tso_field(StgTSO, stack_size);
+ closure_field(StgTSO, stackobj);
+
+ closure_field(StgStack, sp);
+ closure_field_offset(StgStack, stack);
+ closure_field(StgStack, stack_size);
+ closure_field(StgStack, dirty);
struct_size(StgTSOProfInfo);
diff --git a/includes/rts/Constants.h b/includes/rts/Constants.h
index e21a893bbc..a4114ab999 100644
--- a/includes/rts/Constants.h
+++ b/includes/rts/Constants.h
@@ -198,8 +198,7 @@
#define ThreadRunGHC 1 /* return to address on top of stack */
#define ThreadInterpret 2 /* interpret this thread */
#define ThreadKilled 3 /* thread has died, don't run it */
-#define ThreadRelocated 4 /* thread has moved, link points to new locn */
-#define ThreadComplete 5 /* thread has finished */
+#define ThreadComplete 4 /* thread has finished */
/*
* Constants for the why_blocked field of a TSO
@@ -266,11 +265,6 @@
#define TSO_STOPPED_ON_BREAKPOINT 16
/*
- * TSO_LINK_DIRTY is set when a TSO's link field is modified
- */
-#define TSO_LINK_DIRTY 32
-
-/*
* Used by the sanity checker to check whether TSOs are on the correct
* mutable list.
*/
diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h
index 8bfadaa0cd..75525d8984 100644
--- a/includes/rts/Flags.h
+++ b/includes/rts/Flags.h
@@ -29,6 +29,8 @@ struct GC_FLAGS {
nat maxStkSize; /* in *words* */
nat initialStkSize; /* in *words* */
+ nat stkChunkSize; /* in *words* */
+ nat stkChunkBufferSize; /* in *words* */
nat maxHeapSize; /* in *blocks* */
nat minAllocAreaSize; /* in *blocks* */
diff --git a/includes/rts/prof/LDV.h b/includes/rts/prof/LDV.h
index 77d873cceb..64266911bd 100644
--- a/includes/rts/prof/LDV.h
+++ b/includes/rts/prof/LDV.h
@@ -31,25 +31,16 @@
#ifdef CMINUSMINUS
-#define LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(c) \
- foreign "C" LDV_recordDead_FILL_SLOP_DYNAMIC(c "ptr")
-
#else
#define LDV_RECORD_CREATE(c) \
LDVW((c)) = ((StgWord)RTS_DEREF(era) << LDV_SHIFT) | LDV_STATE_CREATE
-void LDV_recordDead_FILL_SLOP_DYNAMIC( StgClosure *p );
-
-#define LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(c) \
- LDV_recordDead_FILL_SLOP_DYNAMIC(c)
-
#endif
#else /* !PROFILING */
#define LDV_RECORD_CREATE(c) /* nothing */
-#define LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(c) /* nothing */
#endif /* PROFILING */
diff --git a/includes/rts/storage/ClosureMacros.h b/includes/rts/storage/ClosureMacros.h
index aead2edd04..7123c20587 100644
--- a/includes/rts/storage/ClosureMacros.h
+++ b/includes/rts/storage/ClosureMacros.h
@@ -131,9 +131,9 @@
// Use when changing a closure from one kind to another
#define OVERWRITE_INFO(c, new_info) \
- LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC((StgClosure *)(c)); \
- SET_INFO((c), (new_info)); \
- LDV_RECORD_CREATE(c);
+ OVERWRITING_CLOSURE((StgClosure *)(c)); \
+ SET_INFO((c), (new_info)); \
+ LDV_RECORD_CREATE(c);
/* -----------------------------------------------------------------------------
How to get hold of the static link field for a static closure.
@@ -289,8 +289,8 @@ INLINE_HEADER StgOffset arr_words_sizeW( StgArrWords* x )
INLINE_HEADER StgOffset mut_arr_ptrs_sizeW( StgMutArrPtrs* x )
{ return sizeofW(StgMutArrPtrs) + x->size; }
-INLINE_HEADER StgWord tso_sizeW ( StgTSO *tso )
-{ return TSO_STRUCT_SIZEW + tso->stack_size; }
+INLINE_HEADER StgWord stack_sizeW ( StgStack *stack )
+{ return sizeofW(StgStack) + stack->stack_size; }
INLINE_HEADER StgWord bco_sizeW ( StgBCO *bco )
{ return bco->size; }
@@ -339,7 +339,9 @@ closure_sizeW_ (StgClosure *p, StgInfoTable *info)
case MUT_ARR_PTRS_FROZEN0:
return mut_arr_ptrs_sizeW((StgMutArrPtrs*)p);
case TSO:
- return tso_sizeW((StgTSO *)p);
+ return sizeofW(StgTSO);
+ case STACK:
+ return stack_sizeW((StgStack*)p);
case BCO:
return bco_sizeW((StgBCO *)p);
case TREC_CHUNK:
@@ -417,4 +419,62 @@ INLINE_HEADER StgWord8 *mutArrPtrsCard (StgMutArrPtrs *a, lnat n)
return ((StgWord8 *)&(a->payload[a->ptrs]) + n);
}
+/* -----------------------------------------------------------------------------
+ Replacing a closure with a different one. We must call
+ OVERWRITING_CLOSURE(p) on the old closure that is about to be
+ overwritten.
+
+ In PROFILING mode, LDV profiling requires that we fill the slop
+ with zeroes, and record the old closure as dead (LDV_recordDead()).
+
+ In DEBUG mode, we must overwrite the slop with zeroes, because the
+ sanity checker wants to walk through the heap checking all the
+ pointers.
+
+ In multicore mode, we *cannot* overwrite slop with zeroes, because
+ another thread might be reading it. So,
+
+ PROFILING is not compatible with +RTS -N<n> (for n > 1)
+
+ THREADED_RTS can be used with DEBUG, but full heap sanity
+ checking is disabled.
+
+ -------------------------------------------------------------------------- */
+
+#if defined(PROFILING) || (!defined(THREADED_RTS) && defined(DEBUG))
+#define OVERWRITING_CLOSURE(c) overwritingClosure(c)
+#else
+#define OVERWRITING_CLOSURE(c) /* nothing */
+#endif
+
+#ifdef PROFILING
+void LDV_recordDead (StgClosure *c, nat size);
+#endif
+
+#ifdef KEEP_INLINES
+void overwritingClosure (StgClosure *p);
+#else
+INLINE_HEADER
+#endif
+void
+overwritingClosure (StgClosure *p)
+{
+ nat size, i;
+
+#if defined(PROFILING)
+ if (era <= 0) return;
+#endif
+
+ size = closure_sizeW(p);
+
+ // For LDV profiling, we need to record the closure as dead
+#if defined(PROFILING)
+ LDV_recordDead((StgClosure *)(p), size);
+#endif
+
+ for (i = 0; i < size - sizeofW(StgThunkHeader); i++) {
+ ((StgThunk *)(p))->payload[i] = 0;
+ }
+}
+
#endif /* RTS_STORAGE_CLOSUREMACROS_H */
diff --git a/includes/rts/storage/ClosureTypes.h b/includes/rts/storage/ClosureTypes.h
index b7489c90c1..75ec08bf18 100644
--- a/includes/rts/storage/ClosureTypes.h
+++ b/includes/rts/storage/ClosureTypes.h
@@ -56,27 +56,29 @@
#define RET_FUN 35
#define UPDATE_FRAME 36
#define CATCH_FRAME 37
-#define STOP_FRAME 38
-#define BLOCKING_QUEUE 39
-#define BLACKHOLE 40
-#define MVAR_CLEAN 41
-#define MVAR_DIRTY 42
-#define ARR_WORDS 43
-#define MUT_ARR_PTRS_CLEAN 44
-#define MUT_ARR_PTRS_DIRTY 45
-#define MUT_ARR_PTRS_FROZEN0 46
-#define MUT_ARR_PTRS_FROZEN 47
-#define MUT_VAR_CLEAN 48
-#define MUT_VAR_DIRTY 49
-#define WEAK 50
-#define PRIM 51
-#define MUT_PRIM 52
-#define TSO 53
-#define TREC_CHUNK 54
-#define ATOMICALLY_FRAME 55
-#define CATCH_RETRY_FRAME 56
-#define CATCH_STM_FRAME 57
-#define WHITEHOLE 58
-#define N_CLOSURE_TYPES 59
+#define UNDERFLOW_FRAME 38
+#define STOP_FRAME 39
+#define BLOCKING_QUEUE 40
+#define BLACKHOLE 41
+#define MVAR_CLEAN 42
+#define MVAR_DIRTY 43
+#define ARR_WORDS 44
+#define MUT_ARR_PTRS_CLEAN 45
+#define MUT_ARR_PTRS_DIRTY 46
+#define MUT_ARR_PTRS_FROZEN0 47
+#define MUT_ARR_PTRS_FROZEN 48
+#define MUT_VAR_CLEAN 49
+#define MUT_VAR_DIRTY 50
+#define WEAK 51
+#define PRIM 52
+#define MUT_PRIM 53
+#define TSO 54
+#define STACK 55
+#define TREC_CHUNK 56
+#define ATOMICALLY_FRAME 57
+#define CATCH_RETRY_FRAME 58
+#define CATCH_STM_FRAME 59
+#define WHITEHOLE 60
+#define N_CLOSURE_TYPES 61
#endif /* RTS_STORAGE_CLOSURETYPES_H */
diff --git a/includes/rts/storage/Closures.h b/includes/rts/storage/Closures.h
index 2683ce7d49..f3929ee36f 100644
--- a/includes/rts/storage/Closures.h
+++ b/includes/rts/storage/Closures.h
@@ -166,6 +166,11 @@ typedef struct {
} StgCatchFrame;
typedef struct {
+ const StgInfoTable* info;
+ struct StgStack_ *next_chunk;
+} StgUnderflowFrame;
+
+typedef struct {
StgHeader header;
} StgStopFrame;
diff --git a/includes/rts/storage/TSO.h b/includes/rts/storage/TSO.h
index 0e9883f1a6..04e673fb12 100644
--- a/includes/rts/storage/TSO.h
+++ b/includes/rts/storage/TSO.h
@@ -83,7 +83,7 @@ typedef struct StgTSO_ {
Currently used for linking TSOs on:
* cap->run_queue_{hd,tl}
* (non-THREADED_RTS); the blocked_queue
- * and pointing to the relocated version of a ThreadRelocated
+ * and pointing to the next chunk for a ThreadOldStack
NOTE!!! do not modify _link directly, it is subject to
a write barrier for generational GC. Instead use the
@@ -97,7 +97,11 @@ typedef struct StgTSO_ {
struct StgTSO_* global_link; // Links threads on the
// generation->threads lists
- StgWord dirty; /* non-zero => dirty */
+ /*
+ * The thread's stack
+ */
+ struct StgStack_ *stackobj;
+
/*
* The tso->dirty flag indicates that this TSO's stack should be
* scanned during garbage collection. It also indicates that this
@@ -110,10 +114,6 @@ typedef struct StgTSO_ {
*
* tso->dirty is set by dirty_TSO(), and unset by the garbage
* collector (only).
- *
- * The link field has a separate dirty bit of its own, namely the
- * bit TSO_LINK_DIRTY in the tso->flags field, set by
- * setTSOLink().
*/
StgWord16 what_next; // Values defined in Constants.h
@@ -121,21 +121,21 @@ typedef struct StgTSO_ {
StgWord32 flags; // Values defined in Constants.h
StgTSOBlockInfo block_info;
StgThreadID id;
- int saved_errno;
+ StgWord32 saved_errno;
+ StgWord32 dirty; /* non-zero => dirty */
struct InCall_* bound;
struct Capability_* cap;
+
struct StgTRecHeader_ * trec; /* STM transaction record */
/*
- A list of threads blocked on this TSO waiting to throw
- exceptions. In order to access this field, the TSO must be
- locked using lockClosure/unlockClosure (see SMP.h).
+ * A list of threads blocked on this TSO waiting to throw exceptions.
*/
struct MessageThrowTo_ * blocked_exceptions;
/*
- A list of StgBlockingQueue objects, representing threads blocked
- on thunks that are under evaluation by this thread.
+ * A list of StgBlockingQueue objects, representing threads
+ * blocked on thunks that are under evaluation by this thread.
*/
struct StgBlockingQueue_ *bq;
@@ -149,14 +149,36 @@ typedef struct StgTSO_ {
StgWord32 saved_winerror;
#endif
- /* The thread stack... */
- StgWord32 stack_size; /* stack size in *words* */
- StgWord32 max_stack_size; /* maximum stack size in *words* */
- StgPtr sp;
-
- StgWord stack[FLEXIBLE_ARRAY];
+ /*
+ * sum of the sizes of all stack chunks (in words), used to decide
+ * whether to throw the StackOverflow exception when the stack
+ * overflows, or whether to just chain on another stack chunk.
+ *
+ * Note that this overestimates the real stack size, because each
+ * chunk will have a gap at the end, of +RTS -kb<size> words.
+ * This means stack overflows are not entirely accurate, because
+ * the more gaps there are, the sooner the stack will run into the
+ * hard +RTS -K<size> limit.
+ */
+ StgWord32 tot_stack_size;
+
} *StgTSOPtr;
+typedef struct StgStack_ {
+ StgHeader header;
+ StgWord32 stack_size; // stack size in *words*
+ StgWord32 dirty; // non-zero => dirty
+ StgPtr sp; // current stack pointer
+ StgWord stack[FLEXIBLE_ARRAY];
+} StgStack;
+
+// Calculate SpLim from a TSO (reads tso->stackobj, but no fields from
+// the stackobj itself).
+INLINE_HEADER StgPtr tso_SpLim (StgTSO* tso)
+{
+ return tso->stackobj->stack + RESERVED_STACK_WORDS;
+}
+
/* -----------------------------------------------------------------------------
functions
-------------------------------------------------------------------------- */
@@ -165,17 +187,7 @@ void dirty_TSO (Capability *cap, StgTSO *tso);
void setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target);
void setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target);
-// Apply to a TSO before looking at it if you are not sure whether it
-// might be ThreadRelocated or not (basically, that's most of the time
-// unless the TSO is the current TSO).
-//
-INLINE_HEADER StgTSO * deRefTSO(StgTSO *tso)
-{
- while (tso->what_next == ThreadRelocated) {
- tso = tso->_link;
- }
- return tso;
-}
+void dirty_STACK (Capability *cap, StgStack *stack);
/* -----------------------------------------------------------------------------
Invariants:
@@ -232,18 +244,6 @@ INLINE_HEADER StgTSO * deRefTSO(StgTSO *tso)
---------------------------------------------------------------------------- */
-/* Workaround for a bug/quirk in gcc on certain architectures.
- * symptom is that (&tso->stack - &tso->header) /= sizeof(StgTSO)
- * in other words, gcc pads the structure at the end.
- */
-
-extern StgTSO dummy_tso;
-
-#define TSO_STRUCT_SIZE \
- ((char *)&dummy_tso.stack - (char *)&dummy_tso.header)
-
-#define TSO_STRUCT_SIZEW (TSO_STRUCT_SIZE / sizeof(W_))
-
/* this is the NIL ptr for a TSO queue (e.g. runnable queue) */
#define END_TSO_QUEUE ((StgTSO *)(void*)&stg_END_TSO_QUEUE_closure)
diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h
index 8a1b84a5cc..c52a3c9702 100644
--- a/includes/stg/MiscClosures.h
+++ b/includes/stg/MiscClosures.h
@@ -61,6 +61,7 @@ RTS_RET(stg_catch_stm_frame);
RTS_RET(stg_unmaskAsyncExceptionszh_ret);
RTS_RET(stg_maskUninterruptiblezh_ret);
RTS_RET(stg_maskAsyncExceptionszh_ret);
+RTS_RET(stg_stack_underflow_frame);
// RTS_FUN(stg_interp_constr_entry);
//
@@ -100,6 +101,7 @@ RTS_ENTRY(stg_STABLE_NAME);
RTS_ENTRY(stg_MVAR_CLEAN);
RTS_ENTRY(stg_MVAR_DIRTY);
RTS_ENTRY(stg_TSO);
+RTS_ENTRY(stg_STACK);
RTS_ENTRY(stg_ARR_WORDS);
RTS_ENTRY(stg_MUT_ARR_WORDS);
RTS_ENTRY(stg_MUT_ARR_PTRS_CLEAN);
@@ -119,6 +121,7 @@ RTS_ENTRY(stg_PAP);
RTS_ENTRY(stg_AP);
RTS_ENTRY(stg_AP_NOUPD);
RTS_ENTRY(stg_AP_STACK);
+RTS_ENTRY(stg_AP_STACK_NOUPD);
RTS_ENTRY(stg_dummy_ret);
RTS_ENTRY(stg_raise);
RTS_ENTRY(stg_raise_ret);
diff --git a/includes/stg/Ticky.h b/includes/stg/Ticky.h
index 2ede8ebdf9..a811aec4eb 100644
--- a/includes/stg/Ticky.h
+++ b/includes/stg/Ticky.h
@@ -190,7 +190,8 @@ EXTERN StgInt RET_SEMI_loads_avoided INIT(0);
#define TICK_UPD_SQUEEZED()
#define TICK_ALLOC_HEAP_NOCTR(x)
#define TICK_GC_FAILED_PROMOTION()
-#define TICK_ALLOC_TSO(g,s)
+#define TICK_ALLOC_TSO()
+#define TICK_ALLOC_STACK(g)
#define TICK_ALLOC_UP_THK(g,s)
#define TICK_ALLOC_SE_THK(g,s)