path: root/rts/Threads.c
author     Simon Marlow <marlowsd@gmail.com>  2010-12-15 12:08:43 +0000
committer  Simon Marlow <marlowsd@gmail.com>  2010-12-15 12:08:43 +0000
commit     f30d527344db528618f64a25250a3be557d9f287 (patch)
tree       5b827afed254139a197cbdcdd37bebe8fa859d67 /rts/Threads.c
parent     99b6e6ac44c6c610b0d60e3b70a2341c83d23106 (diff)
download   haskell-f30d527344db528618f64a25250a3be557d9f287.tar.gz
Implement stack chunks and separate TSO/STACK objects
This patch makes two changes to the way stacks are managed:

1. The stack is now stored in a separate object from the TSO.

   This means that it is easier to replace the stack object for a thread
   when the stack overflows or underflows; we don't have to leave behind
   the old TSO as an indirection any more.  Consequently, we can remove
   ThreadRelocated and deRefTSO(), which were a pain.

   This is obviously the right thing, but the last time I tried to do it
   it made performance worse.  This time I seem to have cracked it.

2. Stacks are now represented as a chain of chunks, rather than a
   single monolithic object.

   The big advantage here is that individual chunks are marked clean or
   dirty according to whether they contain pointers to the young
   generation, and the GC can avoid traversing clean stack chunks during
   a young-generation collection.  This means that programs with deep
   stacks will see a big saving in GC overhead when using the default GC
   settings.

   A secondary advantage is that there is much less copying involved as
   the stack grows.  Programs that quickly grow a deep stack will see
   big improvements.

   In some ways the implementation is simpler, as nothing special needs
   to be done to reclaim stack as the stack shrinks (the GC just
   recovers the dead stack chunks).  On the other hand, we have to
   manage stack underflow between chunks, so there's a new stack frame
   (UNDERFLOW_FRAME), and we now have separate TSO and STACK objects.
   The total amount of code is probably about the same as before.

There are new RTS flags:

   -ki<size>  Sets the initial thread stack size (default 1k).
              E.g.: -ki4k, -ki2m.
   -kc<size>  Sets the stack chunk size (default 32k).
   -kb<size>  Sets the stack chunk buffer size (default 1k).

-ki was previously called just -k, and the old name is still accepted
for backwards compatibility.  These new options are documented.
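For orientation before reading the diff, here is a rough sketch of the
two new heap objects as this file uses them.  The field order and exact
types are assumptions reconstructed from the accessors in the patch
(stack->stack_size, stack->sp, stack->dirty, frame->next_chunk); the
authoritative definitions live in the RTS headers, not in rts/Threads.c:

    /* Sketch only -- not copied from the RTS headers. */
    typedef struct StgStack_ {
        StgHeader  header;      /* info table: stg_STACK_info */
        StgWord32  stack_size;  /* usable stack area, in words */
        StgWord32  dirty;       /* non-zero => may point at the young gen */
        StgPtr     sp;          /* current stack pointer in this chunk */
        StgWord    stack[];     /* the stack itself; grows downwards */
    } StgStack;

    /* Chunks are chained through an UNDERFLOW_FRAME at the bottom of
     * each chunk; returning past it continues in the next chunk. */
    typedef struct {
        const StgInfoTable *info;       /* stg_stack_underflow_frame_info */
        struct StgStack_   *next_chunk; /* the next (older) stack chunk */
    } StgUnderflowFrame;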
Diffstat (limited to 'rts/Threads.c')
-rw-r--r--  rts/Threads.c  288
1 file changed, 242 insertions(+), 46 deletions(-)
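As a usage note, the new flags combine like any other RTS options; for
example (program name hypothetical):

    ./deepstack +RTS -ki4k -kc64k -kb2k -RTS

This requests a 4k initial stack per thread, 64k stack chunks, and a 2k
chunk buffer, in place of the -ki1k -kc32k -kb1k defaults quoted above.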
diff --git a/rts/Threads.c b/rts/Threads.c
index f6b1bac1a7..d6fe0e7697 100644
--- a/rts/Threads.c
+++ b/rts/Threads.c
@@ -18,8 +18,14 @@
#include "ThreadLabels.h"
#include "Updates.h"
#include "Messages.h"
+#include "RaiseAsync.h"
+#include "Prelude.h"
+#include "Printer.h"
+#include "sm/Sanity.h"
#include "sm/Storage.h"
+#include <string.h>
+
/* Next thread ID to allocate.
* LOCK: sched_mutex
*/
@@ -54,57 +60,67 @@ StgTSO *
createThread(Capability *cap, nat size)
{
StgTSO *tso;
+ StgStack *stack;
nat stack_size;
/* sched_mutex is *not* required */
- /* First check whether we should create a thread at all */
-
- // ToDo: check whether size = stack_size - TSO_STRUCT_SIZEW
-
/* catch ridiculously small stack sizes */
- if (size < MIN_STACK_WORDS + TSO_STRUCT_SIZEW) {
- size = MIN_STACK_WORDS + TSO_STRUCT_SIZEW;
+ if (size < MIN_STACK_WORDS + sizeofW(StgStack)) {
+ size = MIN_STACK_WORDS + sizeofW(StgStack);
}
- size = round_to_mblocks(size);
- tso = (StgTSO *)allocate(cap, size);
-
- stack_size = size - TSO_STRUCT_SIZEW;
- TICK_ALLOC_TSO(stack_size, 0);
-
+ /* The size argument we are given includes all the per-thread
+ * overheads:
+ *
+ * - The TSO structure
+ * - The STACK header
+ *
+ * This is so that we can use a nice round power of 2 for the
+ * default stack size (e.g. 1k), and if we're allocating lots of
+ * threads back-to-back they'll fit nicely in a block. It's a bit
+ * of a benchmark hack, but it doesn't do any harm.
+ */
+ stack_size = round_to_mblocks(size - sizeofW(StgTSO));
+ stack = (StgStack *)allocate(cap, stack_size);
+ TICK_ALLOC_STACK(stack_size);
+ SET_HDR(stack, &stg_STACK_info, CCS_SYSTEM);
+ stack->stack_size = stack_size - sizeofW(StgStack);
+ stack->sp = stack->stack + stack->stack_size;
+ stack->dirty = 1;
+
+ tso = (StgTSO *)allocate(cap, sizeofW(StgTSO));
+ TICK_ALLOC_TSO();
SET_HDR(tso, &stg_TSO_info, CCS_SYSTEM);
// Always start with the compiled code evaluator
tso->what_next = ThreadRunGHC;
-
tso->why_blocked = NotBlocked;
tso->block_info.closure = (StgClosure *)END_TSO_QUEUE;
tso->blocked_exceptions = END_BLOCKED_EXCEPTIONS_QUEUE;
tso->bq = (StgBlockingQueue *)END_TSO_QUEUE;
tso->flags = 0;
tso->dirty = 1;
-
+ tso->_link = END_TSO_QUEUE;
+
tso->saved_errno = 0;
tso->bound = NULL;
tso->cap = cap;
- tso->stack_size = stack_size;
- tso->max_stack_size = round_to_mblocks(RtsFlags.GcFlags.maxStkSize)
- - TSO_STRUCT_SIZEW;
- tso->sp = (P_)&(tso->stack) + stack_size;
+ tso->stackobj = stack;
+ tso->tot_stack_size = stack->stack_size;
tso->trec = NO_TREC;
-
+
#ifdef PROFILING
tso->prof.CCCS = CCS_MAIN;
#endif
- /* put a stop frame on the stack */
- tso->sp -= sizeofW(StgStopFrame);
- SET_HDR((StgClosure*)tso->sp,(StgInfoTable *)&stg_stop_thread_info,CCS_SYSTEM);
- tso->_link = END_TSO_QUEUE;
-
+ // put a stop frame on the stack
+ stack->sp -= sizeofW(StgStopFrame);
+ SET_HDR((StgClosure*)stack->sp,
+ (StgInfoTable *)&stg_stop_thread_info,CCS_SYSTEM);
+
/* Link the new thread on the global thread list.
*/
ACQUIRE_LOCK(&sched_mutex);
@@ -220,12 +236,6 @@ removeThreadFromDeQueue (Capability *cap,
void
tryWakeupThread (Capability *cap, StgTSO *tso)
{
- tryWakeupThread_(cap, deRefTSO(tso));
-}
-
-void
-tryWakeupThread_ (Capability *cap, StgTSO *tso)
-{
traceEventThreadWakeup (cap, tso, tso->cap->no);
#ifdef THREADED_RTS
@@ -267,8 +277,8 @@ tryWakeupThread_ (Capability *cap, StgTSO *tso)
}
// remove the block frame from the stack
- ASSERT(tso->sp[0] == (StgWord)&stg_block_throwto_info);
- tso->sp += 3;
+ ASSERT(tso->stackobj->sp[0] == (StgWord)&stg_block_throwto_info);
+ tso->stackobj->sp += 3;
goto unblock;
}
@@ -416,7 +426,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val)
i = v->header.info;
if (i == &stg_TSO_info) {
- owner = deRefTSO((StgTSO*)v);
+ owner = (StgTSO*)v;
if (owner != tso) {
checkBlockingQueues(cap, tso);
}
@@ -429,7 +439,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val)
return;
}
- owner = deRefTSO(((StgBlockingQueue*)v)->owner);
+ owner = ((StgBlockingQueue*)v)->owner;
if (owner != tso) {
checkBlockingQueues(cap, tso);
@@ -466,6 +476,202 @@ isThreadBound(StgTSO* tso USED_IF_THREADS)
return rtsFalse;
}
+/* -----------------------------------------------------------------------------
+ Stack overflow
+
+ If the thread has reached its maximum stack size, then raise the
+ StackOverflow exception in the offending thread. Otherwise
+ allocate a new stack chunk, copy part of the old stack into it,
+ link the chunks with an UNDERFLOW_FRAME, and make the new chunk
+ the thread's current stack.
+ -------------------------------------------------------------------------- */
+
+void
+threadStackOverflow (Capability *cap, StgTSO *tso)
+{
+ StgStack *new_stack, *old_stack;
+ StgUnderflowFrame *frame;
+
+ IF_DEBUG(sanity,checkTSO(tso));
+
+ if (tso->tot_stack_size >= RtsFlags.GcFlags.maxStkSize
+ && !(tso->flags & TSO_BLOCKEX)) {
+ // NB. never raise a StackOverflow exception if the thread is
+ // inside Control.Exception.block. It is impractical to protect
+ // against stack overflow exceptions, since virtually anything
+ // can raise one (even 'catch'), so this is the only sensible
+ // thing to do here. See bug #767.
+ //
+
+ // #3677: In a stack overflow situation, stack squeezing may
+ // reduce the stack size, but we don't know whether it has been
+ // reduced enough for the stack check to succeed if we try
+ // again. Fortunately stack squeezing is idempotent, so all we
+ // need to do is record whether *any* squeezing happened. If we
+ // are at the stack's absolute -K limit, and stack squeezing
+ // happened, then we try running the thread again. The
+ // TSO_SQUEEZED flag is set by threadPaused() to tell us whether
+ // squeezing happened or not.
+ if (tso->flags & TSO_SQUEEZED) {
+ return;
+ }
+
+ debugTrace(DEBUG_gc,
+ "threadStackOverflow of TSO %ld (%p): stack too large (now %ld; max is %ld)",
+ (long)tso->id, tso, (long)tso->stackobj->stack_size,
+ (long)RtsFlags.GcFlags.maxStkSize);
+ IF_DEBUG(gc,
+ /* If we're debugging, just print out the top of the stack */
+ printStackChunk(tso->stackobj->sp,
+ stg_min(tso->stackobj->stack + tso->stackobj->stack_size,
+ tso->stackobj->sp+64)));
+
+ // Send this thread the StackOverflow exception
+ throwToSingleThreaded(cap, tso, (StgClosure *)stackOverflow_closure);
+ // the exception unwinds the stack, so don't enlarge it as well
+ return;
+ }
+
+
+ // We also want to avoid enlarging the stack if squeezing has
+ // already released some of it. However, we don't want to get into
+ // a pathological situation where a thread has a nearly full stack
+ // (near its current limit, but not near the absolute -K limit),
+ // keeps allocating a little bit, squeezing removes a little bit,
+ // and then it runs again. So to avoid this, if we squeezed *and*
+ // there is still less than BLOCK_SIZE_W words free, then we enlarge
+ // the stack anyway.
+ if ((tso->flags & TSO_SQUEEZED) &&
+ ((W_)(tso->stackobj->sp - tso->stackobj->stack) >= BLOCK_SIZE_W)) {
+ return;
+ }
+
+ debugTraceCap(DEBUG_sched, cap,
+ "allocating new stack chunk of size %d bytes",
+ RtsFlags.GcFlags.stkChunkSize * sizeof(W_));
+
+ old_stack = tso->stackobj;
+
+ new_stack = (StgStack*) allocate(cap, RtsFlags.GcFlags.stkChunkSize);
+ SET_HDR(new_stack, &stg_STACK_info, CCS_SYSTEM);
+ TICK_ALLOC_STACK(RtsFlags.GcFlags.stkChunkSize);
+
+ new_stack->dirty = 0; // begin clean, we'll mark it dirty below
+ new_stack->stack_size = RtsFlags.GcFlags.stkChunkSize - sizeofW(StgStack);
+ new_stack->sp = new_stack->stack + new_stack->stack_size;
+
+ tso->tot_stack_size += new_stack->stack_size;
+
+ new_stack->sp -= sizeofW(StgUnderflowFrame);
+ frame = (StgUnderflowFrame*)new_stack->sp;
+ frame->info = &stg_stack_underflow_frame_info;
+ frame->next_chunk = old_stack;
+
+ {
+ StgWord *sp;
+ nat chunk_words, size;
+
+ // find the boundary of the chunk of old stack we're going to
+ // copy to the new stack. We skip over stack frames until we
+ // reach the smaller of
+ //
+ // * the chunk buffer size (+RTS -kb)
+ // * the end of the old stack
+ //
+ for (sp = old_stack->sp;
+ sp < stg_min(old_stack->sp + RtsFlags.GcFlags.stkChunkBufferSize,
+ old_stack->stack + old_stack->stack_size); )
+ {
+ size = stack_frame_sizeW((StgClosure*)sp);
+
+ // if including this frame would exceed the size of the
+ // new stack (taking into account the underflow frame),
+ // then stop at the previous frame.
+ if (sp + size > old_stack->sp + (new_stack->stack_size -
+ sizeofW(StgUnderflowFrame))) {
+ break;
+ }
+ sp += size;
+ }
+
+ // copy the stack frames between old_stack->sp and sp to the top
+ // of the new stack: new_stack->sp - (sp - old_stack->sp)
+ chunk_words = sp - old_stack->sp;
+
+ memcpy(/* dest */ new_stack->sp - chunk_words,
+ /* source */ old_stack->sp,
+ /* size */ chunk_words * sizeof(W_));
+
+ old_stack->sp += chunk_words;
+ new_stack->sp -= chunk_words;
+ }
+
+ // if the old stack chunk is now empty, discard it. With the
+ // default settings, -ki1k -kb1k, this means the first stack chunk
+ // will be discarded after the first overflow, being replaced by a
+ // non-moving 32k chunk.
+ if (old_stack->sp == old_stack->stack + old_stack->stack_size) {
+ frame->next_chunk = new_stack;
+ }
+
+ tso->stackobj = new_stack;
+
+ // we're about to run it, better mark it dirty
+ dirty_STACK(cap, new_stack);
+
+ IF_DEBUG(sanity,checkTSO(tso));
+ // IF_DEBUG(scheduler,printTSO(tso));
+}
+
+
+/* ---------------------------------------------------------------------------
+ Stack underflow - called from the stg_stack_underflow_frame
+ ------------------------------------------------------------------------ */
+
+nat // returns offset to the return address
+threadStackUnderflow (Capability *cap, StgTSO *tso)
+{
+ StgStack *new_stack, *old_stack;
+ StgUnderflowFrame *frame;
+ nat retvals;
+
+ debugTraceCap(DEBUG_sched, cap, "stack underflow");
+
+ old_stack = tso->stackobj;
+
+ frame = (StgUnderflowFrame*)(old_stack->stack + old_stack->stack_size
+ - sizeofW(StgUnderflowFrame));
+ ASSERT(frame->info == &stg_stack_underflow_frame_info);
+
+ new_stack = (StgStack*)frame->next_chunk;
+ tso->stackobj = new_stack;
+
+ retvals = (P_)frame - old_stack->sp;
+ if (retvals != 0)
+ {
+ // we have some return values to copy down to the previous chunk
+ if ((new_stack->sp - new_stack->stack) < retvals)
+ {
+ barf("threadStackUnderflow: not enough space for return values");
+ }
+
+ new_stack->sp -= retvals;
+
+ memcpy(/* dest */ new_stack->sp,
+ /* src */ old_stack->sp,
+ /* size */ retvals * sizeof(W_));
+ }
+
+ // empty the old stack. The GC may still visit this object
+ // because it is on the mutable list.
+ old_stack->sp = old_stack->stack + old_stack->stack_size;
+
+ // update tot_stack_size: the old chunk is no longer part of the stack
+ tso->tot_stack_size -= old_stack->stack_size;
+
+ // we're about to run it, better mark it dirty
+ dirty_STACK(cap, new_stack);
+
+ return retvals;
+}
+
/* ----------------------------------------------------------------------------
* Debugging: why is a thread blocked
* ------------------------------------------------------------------------- */
@@ -529,10 +735,7 @@ printThreadStatus(StgTSO *t)
void *label = lookupThreadLabel(t->id);
if (label) debugBelch("[\"%s\"] ",(char *)label);
}
- if (t->what_next == ThreadRelocated) {
- debugBelch("has been relocated...\n");
- } else {
- switch (t->what_next) {
+ switch (t->what_next) {
case ThreadKilled:
debugBelch("has been killed");
break;
@@ -544,11 +747,8 @@ printThreadStatus(StgTSO *t)
}
if (t->dirty) {
debugBelch(" (TSO_DIRTY)");
- } else if (t->flags & TSO_LINK_DIRTY) {
- debugBelch(" (TSO_LINK_DIRTY)");
}
debugBelch("\n");
- }
}
void
@@ -574,11 +774,7 @@ printAllThreads(void)
if (t->why_blocked != NotBlocked) {
printThreadStatus(t);
}
- if (t->what_next == ThreadRelocated) {
- next = t->_link;
- } else {
- next = t->global_link;
- }
+ next = t->global_link;
}
}
}
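For context, here is a hedged sketch of how an RTS-internal caller might
drive createThread under the new size convention described above.  The
helpers pushClosure and scheduleThread, and the initialStkSize field,
come from elsewhere in the RTS and are assumptions here, not part of
this diff:

    /* Sketch only, modelled on the createGenThread-style helpers in
     * rts/RtsAPI.c; not part of this patch. */
    StgTSO *runClosureInNewThread (Capability *cap, StgClosure *closure)
    {
        // The size passed to createThread includes the TSO and STACK
        // header overheads, so the -ki default (a round 1k) still
        // packs freshly created threads neatly into blocks.
        StgTSO *t = createThread(cap, RtsFlags.GcFlags.initialStkSize);
        pushClosure(t, (W_)closure);          // argument, pushed first
        pushClosure(t, (W_)&stg_enter_info);  // entered when t runs
        scheduleThread(cap, t);
        return t;
    }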