49 files changed, 1043 insertions, 961 deletions
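This commit replaces the old scheme, in which each TSO carried a single contiguous stack and was copied to a larger block on overflow, with a chain of separately allocated STACK objects. As a condensed sketch of the new layout (abbreviated from the includes/rts/storage/TSO.h and Closures.h hunks below; not a verbatim copy):

    /* A thread's stack is now a heap object of its own (closure type STACK): */
    typedef struct StgStack_ {
        StgHeader header;
        StgWord32 stack_size;             // size of this chunk, in words
        StgWord32 dirty;                  // non-zero => scan this chunk at GC
        StgPtr    sp;                     // current stack pointer
        StgWord   stack[FLEXIBLE_ARRAY];  // the stack itself
    } StgStack;

    struct StgTSO_ {
        /* ... */
        struct StgStack_ *stackobj;       // the thread's current (top) chunk
        StgWord32 tot_stack_size;         // sum of all chunk sizes, in words
        /* ... */
    };

    /* Chunks are chained by an UNDERFLOW_FRAME at the bottom of each chunk: */
    typedef struct {
        const StgInfoTable *info;         // stg_stack_underflow_frame
        struct StgStack_   *next_chunk;
    } StgUnderflowFrame;

When a chunk overflows, a new one is chained on (up to the -K limit); when execution reaches the UNDERFLOW_FRAME at the bottom of a chunk, the thread continues in next_chunk. Because TSOs no longer move when their stacks grow, ThreadRelocated, deRefTSO() and the loops that followed relocation links all disappear in the hunks below.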
diff --git a/compiler/cmm/CmmCPSGen.hs b/compiler/cmm/CmmCPSGen.hs index 924ce9d4ab..45d0aebe3c 100644 --- a/compiler/cmm/CmmCPSGen.hs +++ b/compiler/cmm/CmmCPSGen.hs @@ -331,8 +331,8 @@ nursery_bdescr_start = cmmOffset stgCurrentNursery oFFSET_bdescr_start nursery_bdescr_blocks = cmmOffset stgCurrentNursery oFFSET_bdescr_blocks tso_SP, tso_STACK, tso_CCCS :: ByteOff -tso_SP = tsoFieldB oFFSET_StgTSO_sp -tso_STACK = tsoFieldB oFFSET_StgTSO_stack +tso_SP = tsoFieldB undefined --oFFSET_StgTSO_sp +tso_STACK = tsoFieldB undefined --oFFSET_StgTSO_stack tso_CCCS = tsoProfFieldB oFFSET_StgTSO_CCCS -- The TSO struct has a variable header, and an optional StgTSOProfInfo in diff --git a/compiler/codeGen/CgForeignCall.hs b/compiler/codeGen/CgForeignCall.hs index 8e8e34d77b..cdaccc98a8 100644 --- a/compiler/codeGen/CgForeignCall.hs +++ b/compiler/codeGen/CgForeignCall.hs @@ -202,8 +202,9 @@ maybe_assign_temp e emitSaveThreadState :: Code emitSaveThreadState = do - -- CurrentTSO->sp = Sp; - stmtC $ CmmStore (cmmOffset stgCurrentTSO tso_SP) stgSp + -- CurrentTSO->stackobj->sp = Sp; + stmtC $ CmmStore (cmmOffset (CmmLoad (cmmOffset stgCurrentTSO tso_stackobj) bWord) + stack_SP) stgSp emitCloseNursery -- and save the current cost centre stack in the TSO when profiling: when opt_SccProfilingOn $ @@ -216,14 +217,17 @@ emitCloseNursery = stmtC $ CmmStore nursery_bdescr_free (cmmOffsetW stgHp 1) emitLoadThreadState :: Code emitLoadThreadState = do tso <- newTemp bWord -- TODO FIXME NOW + stack <- newTemp bWord -- TODO FIXME NOW stmtsC [ - -- tso = CurrentTSO; - CmmAssign (CmmLocal tso) stgCurrentTSO, - -- Sp = tso->sp; - CmmAssign sp (CmmLoad (cmmOffset (CmmReg (CmmLocal tso)) tso_SP) - bWord), - -- SpLim = tso->stack + RESERVED_STACK_WORDS; - CmmAssign spLim (cmmOffsetW (cmmOffset (CmmReg (CmmLocal tso)) tso_STACK) + -- tso = CurrentTSO + CmmAssign (CmmLocal tso) stgCurrentTSO, + -- stack = tso->stackobj + CmmAssign (CmmLocal stack) (CmmLoad (cmmOffset (CmmReg (CmmLocal tso)) tso_stackobj) bWord), + -- Sp = stack->sp; + CmmAssign sp (CmmLoad (cmmOffset (CmmReg (CmmLocal stack)) stack_SP) + bWord), + -- SpLim = stack->stack + RESERVED_STACK_WORDS; + CmmAssign spLim (cmmOffsetW (cmmOffset (CmmReg (CmmLocal stack)) stack_STACK) rESERVED_STACK_WORDS), -- HpAlloc = 0; -- HpAlloc is assumed to be set to non-zero only by a failed @@ -234,7 +238,7 @@ emitLoadThreadState = do -- and load the current cost centre stack from the TSO when profiling: when opt_SccProfilingOn $ stmtC (CmmStore curCCSAddr - (CmmLoad (cmmOffset (CmmReg (CmmLocal tso)) tso_CCCS) bWord)) + (CmmLoad (cmmOffset (CmmReg (CmmLocal tso)) tso_CCCS) bWord)) emitOpenNursery :: Code emitOpenNursery = stmtsC [ @@ -262,20 +266,14 @@ nursery_bdescr_free = cmmOffset stgCurrentNursery oFFSET_bdescr_free nursery_bdescr_start = cmmOffset stgCurrentNursery oFFSET_bdescr_start nursery_bdescr_blocks = cmmOffset stgCurrentNursery oFFSET_bdescr_blocks -tso_SP, tso_STACK, tso_CCCS :: ByteOff -tso_SP = tsoFieldB oFFSET_StgTSO_sp -tso_STACK = tsoFieldB oFFSET_StgTSO_stack -tso_CCCS = tsoProfFieldB oFFSET_StgTSO_CCCS +tso_stackobj, tso_CCCS, stack_STACK, stack_SP :: ByteOff +tso_stackobj = closureField oFFSET_StgTSO_stackobj +tso_CCCS = closureField oFFSET_StgTSO_CCCS +stack_STACK = closureField oFFSET_StgStack_stack +stack_SP = closureField oFFSET_StgStack_sp --- The TSO struct has a variable header, and an optional StgTSOProfInfo in --- the middle. The fields we're interested in are after the StgTSOProfInfo. 
-tsoFieldB :: ByteOff -> ByteOff -tsoFieldB off - | opt_SccProfilingOn = off + sIZEOF_StgTSOProfInfo + fixedHdrSize * wORD_SIZE - | otherwise = off + fixedHdrSize * wORD_SIZE - -tsoProfFieldB :: ByteOff -> ByteOff -tsoProfFieldB off = off + fixedHdrSize * wORD_SIZE +closureField :: ByteOff -> ByteOff +closureField off = off + fixedHdrSize * wORD_SIZE stgSp, stgHp, stgCurrentTSO, stgCurrentNursery :: CmmExpr stgSp = CmmReg sp diff --git a/compiler/codeGen/StgCmmForeign.hs b/compiler/codeGen/StgCmmForeign.hs index 83c430143e..7ddf597f40 100644 --- a/compiler/codeGen/StgCmmForeign.hs +++ b/compiler/codeGen/StgCmmForeign.hs @@ -243,10 +243,12 @@ nursery_bdescr_start = cmmOffset stgCurrentNursery oFFSET_bdescr_start nursery_bdescr_blocks = cmmOffset stgCurrentNursery oFFSET_bdescr_blocks tso_SP, tso_STACK, tso_CCCS :: ByteOff -tso_SP = tsoFieldB oFFSET_StgTSO_sp -tso_STACK = tsoFieldB oFFSET_StgTSO_stack tso_CCCS = tsoProfFieldB oFFSET_StgTSO_CCCS + --ToDo: needs merging with changes to CgForeign +tso_STACK = tsoFieldB undefined +tso_SP = tsoFieldB undefined + -- The TSO struct has a variable header, and an optional StgTSOProfInfo in -- the middle. The fields we're interested in are after the StgTSOProfInfo. tsoFieldB :: ByteOff -> ByteOff diff --git a/docs/users_guide/runtime_control.xml b/docs/users_guide/runtime_control.xml index 40409131cc..0e13f0e933 100644 --- a/docs/users_guide/runtime_control.xml +++ b/docs/users_guide/runtime_control.xml @@ -424,22 +424,88 @@ <varlistentry> <term> - <option>-k</option><replaceable>size</replaceable> + <option>-ki</option><replaceable>size</replaceable> <indexterm><primary><option>-k</option></primary><secondary>RTS option</secondary></indexterm> - <indexterm><primary>stack, minimum size</primary></indexterm> + <indexterm><primary>stack, initial size</primary></indexterm> </term> <listitem> - <para>[Default: 1k] Set the initial stack size for - new threads. Thread stacks (including the main thread's - stack) live on the heap, and grow as required. The default - value is good for concurrent applications with lots of small - threads; if your program doesn't fit this model then - increasing this option may help performance.</para> - - <para>The main thread is normally started with a slightly - larger heap to cut down on unnecessary stack growth while - the program is starting up.</para> - </listitem> + <para> + [Default: 1k] Set the initial stack size for new + threads. (Note: this flag used to be + simply <option>-k</option>, but was renamed + to <option>-ki</option> in GHC 7.2.1. The old name is + still accepted for backwards compatibility, but that may + be removed in a future version). + </para> + + <para> + Thread stacks (including the main thread's stack) live on + the heap. As the stack grows, new stack chunks are added + as required; if the stack shrinks again, these extra stack + chunks are reclaimed by the garbage collector. The + default initial stack size is deliberately small, in order + to keep the time and space overhead for thread creation to + a minimum, and to make it practical to spawn threads for + even tiny pieces of work. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term> + <option>-kc</option><replaceable>size</replaceable> + <indexterm><primary><option>-kc</option></primary><secondary>RTS + option</secondary></indexterm> + <indexterm><primary>stack</primary><secondary>chunk size</secondary></indexterm> + </term> + <listitem> + <para> + [Default: 32k] Set the size of “stack + chunks”. 
When a thread's current stack overflows, a + new stack chunk is created and added to the thread's + stack, until the limit set by <option>-K</option> is + reached. + </para> + + <para> + The advantage of smaller stack chunks is that the garbage + collector can avoid traversing stack chunks if they are + known to be unmodified since the last collection, so + reducing the chunk size means that the garbage collector + can identify more stack as unmodified, and the GC overhead + might be reduced. On the other hand, making stack chunks + too small adds some overhead as there will be more + overflow/underflow between chunks. The default setting of + 32k appears to be a reasonable compromise in most cases. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term> + <option>-kb</option><replaceable>size</replaceable> + <indexterm><primary><option>-kc</option></primary><secondary>RTS + option</secondary></indexterm> + <indexterm><primary>stack</primary><secondary>chunk buffer size</secondary></indexterm> + </term> + <listitem> + <para> + [Default: 1k] Sets the stack chunk buffer size. + When a stack chunk overflows and a new stack chunk is + created, some of the data from the previous stack chunk is + moved into the new chunk, to avoid an immediate underflow + and repeated overflow/underflow at the boundary. The + amount of stack moved is set by the <option>-kb</option> + option. + </para> + <para> + Note that to avoid wasting space, this value should + typically be less than 10% of the size of a stack + chunk (<option>-kc</option>), because in a chain of stack + chunks, each chunk will have a gap of unused space of this + size. + </para> + </listitem> </varlistentry> <varlistentry> @@ -451,9 +517,14 @@ <listitem> <para>[Default: 8M] Set the maximum stack size for an individual thread to <replaceable>size</replaceable> - bytes. This option is there purely to stop the program - eating up all the available memory in the machine if it gets - into an infinite loop.</para> + bytes. If the thread attempts to exceed this limit, it will + be send the <literal>StackOverflow</literal> exception. + </para> + <para> + This option is there mainly to stop the program eating up + all the available memory in the machine if it gets into an + infinite loop. 
+ </para> </listitem> </varlistentry> diff --git a/includes/Cmm.h b/includes/Cmm.h index 0088c1aa05..6abe760be5 100644 --- a/includes/Cmm.h +++ b/includes/Cmm.h @@ -467,6 +467,12 @@ #define mutArrPtrsCardWords(n) \ ROUNDUP_BYTES_TO_WDS(((n) + (1 << MUT_ARR_PTRS_CARD_BITS) - 1) >> MUT_ARR_PTRS_CARD_BITS) +#if defined(PROFILING) || (!defined(THREADED_RTS) && defined(DEBUG)) +#define OVERWRITING_CLOSURE(c) foreign "C" overwritingClosure(c "ptr") +#else +#define OVERWRITING_CLOSURE(c) /* nothing */ +#endif + /* ----------------------------------------------------------------------------- Voluntary Yields/Blocks diff --git a/includes/mkDerivedConstants.c b/includes/mkDerivedConstants.c index ade104a4be..0ed7ec67da 100644 --- a/includes/mkDerivedConstants.c +++ b/includes/mkDerivedConstants.c @@ -296,9 +296,12 @@ main(int argc, char *argv[]) closure_field(StgTSO, dirty); closure_field(StgTSO, bq); closure_field_("StgTSO_CCCS", StgTSO, prof.CCCS); - tso_field(StgTSO, sp); - tso_field_offset(StgTSO, stack); - tso_field(StgTSO, stack_size); + closure_field(StgTSO, stackobj); + + closure_field(StgStack, sp); + closure_field_offset(StgStack, stack); + closure_field(StgStack, stack_size); + closure_field(StgStack, dirty); struct_size(StgTSOProfInfo); diff --git a/includes/rts/Constants.h b/includes/rts/Constants.h index e21a893bbc..a4114ab999 100644 --- a/includes/rts/Constants.h +++ b/includes/rts/Constants.h @@ -198,8 +198,7 @@ #define ThreadRunGHC 1 /* return to address on top of stack */ #define ThreadInterpret 2 /* interpret this thread */ #define ThreadKilled 3 /* thread has died, don't run it */ -#define ThreadRelocated 4 /* thread has moved, link points to new locn */ -#define ThreadComplete 5 /* thread has finished */ +#define ThreadComplete 4 /* thread has finished */ /* * Constants for the why_blocked field of a TSO @@ -266,11 +265,6 @@ #define TSO_STOPPED_ON_BREAKPOINT 16 /* - * TSO_LINK_DIRTY is set when a TSO's link field is modified - */ -#define TSO_LINK_DIRTY 32 - -/* * Used by the sanity checker to check whether TSOs are on the correct * mutable list. 
*/ diff --git a/includes/rts/Flags.h b/includes/rts/Flags.h index 8bfadaa0cd..75525d8984 100644 --- a/includes/rts/Flags.h +++ b/includes/rts/Flags.h @@ -29,6 +29,8 @@ struct GC_FLAGS { nat maxStkSize; /* in *words* */ nat initialStkSize; /* in *words* */ + nat stkChunkSize; /* in *words* */ + nat stkChunkBufferSize; /* in *words* */ nat maxHeapSize; /* in *blocks* */ nat minAllocAreaSize; /* in *blocks* */ diff --git a/includes/rts/prof/LDV.h b/includes/rts/prof/LDV.h index 77d873cceb..64266911bd 100644 --- a/includes/rts/prof/LDV.h +++ b/includes/rts/prof/LDV.h @@ -31,25 +31,16 @@ #ifdef CMINUSMINUS -#define LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(c) \ - foreign "C" LDV_recordDead_FILL_SLOP_DYNAMIC(c "ptr") - #else #define LDV_RECORD_CREATE(c) \ LDVW((c)) = ((StgWord)RTS_DEREF(era) << LDV_SHIFT) | LDV_STATE_CREATE -void LDV_recordDead_FILL_SLOP_DYNAMIC( StgClosure *p ); - -#define LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(c) \ - LDV_recordDead_FILL_SLOP_DYNAMIC(c) - #endif #else /* !PROFILING */ #define LDV_RECORD_CREATE(c) /* nothing */ -#define LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(c) /* nothing */ #endif /* PROFILING */ diff --git a/includes/rts/storage/ClosureMacros.h b/includes/rts/storage/ClosureMacros.h index aead2edd04..7123c20587 100644 --- a/includes/rts/storage/ClosureMacros.h +++ b/includes/rts/storage/ClosureMacros.h @@ -131,9 +131,9 @@ // Use when changing a closure from one kind to another #define OVERWRITE_INFO(c, new_info) \ - LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC((StgClosure *)(c)); \ - SET_INFO((c), (new_info)); \ - LDV_RECORD_CREATE(c); + OVERWRITING_CLOSURE((StgClosure *)(c)); \ + SET_INFO((c), (new_info)); \ + LDV_RECORD_CREATE(c); /* ----------------------------------------------------------------------------- How to get hold of the static link field for a static closure. @@ -289,8 +289,8 @@ INLINE_HEADER StgOffset arr_words_sizeW( StgArrWords* x ) INLINE_HEADER StgOffset mut_arr_ptrs_sizeW( StgMutArrPtrs* x ) { return sizeofW(StgMutArrPtrs) + x->size; } -INLINE_HEADER StgWord tso_sizeW ( StgTSO *tso ) -{ return TSO_STRUCT_SIZEW + tso->stack_size; } +INLINE_HEADER StgWord stack_sizeW ( StgStack *stack ) +{ return sizeofW(StgStack) + stack->stack_size; } INLINE_HEADER StgWord bco_sizeW ( StgBCO *bco ) { return bco->size; } @@ -339,7 +339,9 @@ closure_sizeW_ (StgClosure *p, StgInfoTable *info) case MUT_ARR_PTRS_FROZEN0: return mut_arr_ptrs_sizeW((StgMutArrPtrs*)p); case TSO: - return tso_sizeW((StgTSO *)p); + return sizeofW(StgTSO); + case STACK: + return stack_sizeW((StgStack*)p); case BCO: return bco_sizeW((StgBCO *)p); case TREC_CHUNK: @@ -417,4 +419,62 @@ INLINE_HEADER StgWord8 *mutArrPtrsCard (StgMutArrPtrs *a, lnat n) return ((StgWord8 *)&(a->payload[a->ptrs]) + n); } +/* ----------------------------------------------------------------------------- + Replacing a closure with a different one. We must call + OVERWRITING_CLOSURE(p) on the old closure that is about to be + overwritten. + + In PROFILING mode, LDV profiling requires that we fill the slop + with zeroes, and record the old closure as dead (LDV_recordDead()). + + In DEBUG mode, we must overwrite the slop with zeroes, because the + sanity checker wants to walk through the heap checking all the + pointers. + + In multicore mode, we *cannot* overwrite slop with zeroes, because + another thread might be reading it. So, + + PROFILING is not compatible with +RTS -N<n> (for n > 1) + + THREADED_RTS can be used with DEBUG, but full heap sanity + checking is disabled. 
+ + -------------------------------------------------------------------------- */ + +#if defined(PROFILING) || (!defined(THREADED_RTS) && defined(DEBUG)) +#define OVERWRITING_CLOSURE(c) overwritingClosure(c) +#else +#define OVERWRITING_CLOSURE(c) /* nothing */ +#endif + +#ifdef PROFILING +void LDV_recordDead (StgClosure *c, nat size); +#endif + +#ifdef KEEP_INLINES +void overwritingClosure (StgClosure *p); +#else +INLINE_HEADER +#endif +void +overwritingClosure (StgClosure *p) +{ + nat size, i; + +#if defined(PROFILING) + if (era <= 0) return; +#endif + + size = closure_sizeW(p); + + // For LDV profiling, we need to record the closure as dead +#if defined(PROFILING) + LDV_recordDead((StgClosure *)(p), size); +#endif + + for (i = 0; i < size - sizeofW(StgThunkHeader); i++) { + ((StgThunk *)(p))->payload[i] = 0; + } +} + #endif /* RTS_STORAGE_CLOSUREMACROS_H */ diff --git a/includes/rts/storage/ClosureTypes.h b/includes/rts/storage/ClosureTypes.h index b7489c90c1..75ec08bf18 100644 --- a/includes/rts/storage/ClosureTypes.h +++ b/includes/rts/storage/ClosureTypes.h @@ -56,27 +56,29 @@ #define RET_FUN 35 #define UPDATE_FRAME 36 #define CATCH_FRAME 37 -#define STOP_FRAME 38 -#define BLOCKING_QUEUE 39 -#define BLACKHOLE 40 -#define MVAR_CLEAN 41 -#define MVAR_DIRTY 42 -#define ARR_WORDS 43 -#define MUT_ARR_PTRS_CLEAN 44 -#define MUT_ARR_PTRS_DIRTY 45 -#define MUT_ARR_PTRS_FROZEN0 46 -#define MUT_ARR_PTRS_FROZEN 47 -#define MUT_VAR_CLEAN 48 -#define MUT_VAR_DIRTY 49 -#define WEAK 50 -#define PRIM 51 -#define MUT_PRIM 52 -#define TSO 53 -#define TREC_CHUNK 54 -#define ATOMICALLY_FRAME 55 -#define CATCH_RETRY_FRAME 56 -#define CATCH_STM_FRAME 57 -#define WHITEHOLE 58 -#define N_CLOSURE_TYPES 59 +#define UNDERFLOW_FRAME 38 +#define STOP_FRAME 39 +#define BLOCKING_QUEUE 40 +#define BLACKHOLE 41 +#define MVAR_CLEAN 42 +#define MVAR_DIRTY 43 +#define ARR_WORDS 44 +#define MUT_ARR_PTRS_CLEAN 45 +#define MUT_ARR_PTRS_DIRTY 46 +#define MUT_ARR_PTRS_FROZEN0 47 +#define MUT_ARR_PTRS_FROZEN 48 +#define MUT_VAR_CLEAN 49 +#define MUT_VAR_DIRTY 50 +#define WEAK 51 +#define PRIM 52 +#define MUT_PRIM 53 +#define TSO 54 +#define STACK 55 +#define TREC_CHUNK 56 +#define ATOMICALLY_FRAME 57 +#define CATCH_RETRY_FRAME 58 +#define CATCH_STM_FRAME 59 +#define WHITEHOLE 60 +#define N_CLOSURE_TYPES 61 #endif /* RTS_STORAGE_CLOSURETYPES_H */ diff --git a/includes/rts/storage/Closures.h b/includes/rts/storage/Closures.h index 2683ce7d49..f3929ee36f 100644 --- a/includes/rts/storage/Closures.h +++ b/includes/rts/storage/Closures.h @@ -166,6 +166,11 @@ typedef struct { } StgCatchFrame; typedef struct { + const StgInfoTable* info; + struct StgStack_ *next_chunk; +} StgUnderflowFrame; + +typedef struct { StgHeader header; } StgStopFrame; diff --git a/includes/rts/storage/TSO.h b/includes/rts/storage/TSO.h index 0e9883f1a6..04e673fb12 100644 --- a/includes/rts/storage/TSO.h +++ b/includes/rts/storage/TSO.h @@ -83,7 +83,7 @@ typedef struct StgTSO_ { Currently used for linking TSOs on: * cap->run_queue_{hd,tl} * (non-THREADED_RTS); the blocked_queue - * and pointing to the relocated version of a ThreadRelocated + * and pointing to the next chunk for a ThreadOldStack NOTE!!! do not modify _link directly, it is subject to a write barrier for generational GC. 
Instead use the @@ -97,7 +97,11 @@ typedef struct StgTSO_ { struct StgTSO_* global_link; // Links threads on the // generation->threads lists - StgWord dirty; /* non-zero => dirty */ + /* + * The thread's stack + */ + struct StgStack_ *stackobj; + /* * The tso->dirty flag indicates that this TSO's stack should be * scanned during garbage collection. It also indicates that this @@ -110,10 +114,6 @@ typedef struct StgTSO_ { * * tso->dirty is set by dirty_TSO(), and unset by the garbage * collector (only). - * - * The link field has a separate dirty bit of its own, namely the - * bit TSO_LINK_DIRTY in the tso->flags field, set by - * setTSOLink(). */ StgWord16 what_next; // Values defined in Constants.h @@ -121,21 +121,21 @@ typedef struct StgTSO_ { StgWord32 flags; // Values defined in Constants.h StgTSOBlockInfo block_info; StgThreadID id; - int saved_errno; + StgWord32 saved_errno; + StgWord32 dirty; /* non-zero => dirty */ struct InCall_* bound; struct Capability_* cap; + struct StgTRecHeader_ * trec; /* STM transaction record */ /* - A list of threads blocked on this TSO waiting to throw - exceptions. In order to access this field, the TSO must be - locked using lockClosure/unlockClosure (see SMP.h). + * A list of threads blocked on this TSO waiting to throw exceptions. */ struct MessageThrowTo_ * blocked_exceptions; /* - A list of StgBlockingQueue objects, representing threads blocked - on thunks that are under evaluation by this thread. + * A list of StgBlockingQueue objects, representing threads + * blocked on thunks that are under evaluation by this thread. */ struct StgBlockingQueue_ *bq; @@ -149,14 +149,36 @@ typedef struct StgTSO_ { StgWord32 saved_winerror; #endif - /* The thread stack... */ - StgWord32 stack_size; /* stack size in *words* */ - StgWord32 max_stack_size; /* maximum stack size in *words* */ - StgPtr sp; - - StgWord stack[FLEXIBLE_ARRAY]; + /* + * sum of the sizes of all stack chunks (in words), used to decide + * whether to throw the StackOverflow exception when the stack + * overflows, or whether to just chain on another stack chunk. + * + * Note that this overestimates the real stack size, because each + * chunk will have a gap at the end, of +RTS -kb<size> words. + * This means stack overflows are not entirely accurate, because + * the more gaps there are, the sooner the stack will run into the + * hard +RTS -K<size> limit. + */ + StgWord32 tot_stack_size; + } *StgTSOPtr; +typedef struct StgStack_ { + StgHeader header; + StgWord32 stack_size; // stack size in *words* + StgWord32 dirty; // non-zero => dirty + StgPtr sp; // current stack pointer + StgWord stack[FLEXIBLE_ARRAY]; +} StgStack; + +// Calculate SpLim from a TSO (reads tso->stackobj, but no fields from +// the stackobj itself). +INLINE_HEADER StgPtr tso_SpLim (StgTSO* tso) +{ + return tso->stackobj->stack + RESERVED_STACK_WORDS; +} + /* ----------------------------------------------------------------------------- functions -------------------------------------------------------------------------- */ @@ -165,17 +187,7 @@ void dirty_TSO (Capability *cap, StgTSO *tso); void setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target); void setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target); -// Apply to a TSO before looking at it if you are not sure whether it -// might be ThreadRelocated or not (basically, that's most of the time -// unless the TSO is the current TSO). 
-// -INLINE_HEADER StgTSO * deRefTSO(StgTSO *tso) -{ - while (tso->what_next == ThreadRelocated) { - tso = tso->_link; - } - return tso; -} +void dirty_STACK (Capability *cap, StgStack *stack); /* ----------------------------------------------------------------------------- Invariants: @@ -232,18 +244,6 @@ INLINE_HEADER StgTSO * deRefTSO(StgTSO *tso) ---------------------------------------------------------------------------- */ -/* Workaround for a bug/quirk in gcc on certain architectures. - * symptom is that (&tso->stack - &tso->header) /= sizeof(StgTSO) - * in other words, gcc pads the structure at the end. - */ - -extern StgTSO dummy_tso; - -#define TSO_STRUCT_SIZE \ - ((char *)&dummy_tso.stack - (char *)&dummy_tso.header) - -#define TSO_STRUCT_SIZEW (TSO_STRUCT_SIZE / sizeof(W_)) - /* this is the NIL ptr for a TSO queue (e.g. runnable queue) */ #define END_TSO_QUEUE ((StgTSO *)(void*)&stg_END_TSO_QUEUE_closure) diff --git a/includes/stg/MiscClosures.h b/includes/stg/MiscClosures.h index 8a1b84a5cc..c52a3c9702 100644 --- a/includes/stg/MiscClosures.h +++ b/includes/stg/MiscClosures.h @@ -61,6 +61,7 @@ RTS_RET(stg_catch_stm_frame); RTS_RET(stg_unmaskAsyncExceptionszh_ret); RTS_RET(stg_maskUninterruptiblezh_ret); RTS_RET(stg_maskAsyncExceptionszh_ret); +RTS_RET(stg_stack_underflow_frame); // RTS_FUN(stg_interp_constr_entry); // @@ -100,6 +101,7 @@ RTS_ENTRY(stg_STABLE_NAME); RTS_ENTRY(stg_MVAR_CLEAN); RTS_ENTRY(stg_MVAR_DIRTY); RTS_ENTRY(stg_TSO); +RTS_ENTRY(stg_STACK); RTS_ENTRY(stg_ARR_WORDS); RTS_ENTRY(stg_MUT_ARR_WORDS); RTS_ENTRY(stg_MUT_ARR_PTRS_CLEAN); @@ -119,6 +121,7 @@ RTS_ENTRY(stg_PAP); RTS_ENTRY(stg_AP); RTS_ENTRY(stg_AP_NOUPD); RTS_ENTRY(stg_AP_STACK); +RTS_ENTRY(stg_AP_STACK_NOUPD); RTS_ENTRY(stg_dummy_ret); RTS_ENTRY(stg_raise); RTS_ENTRY(stg_raise_ret); diff --git a/includes/stg/Ticky.h b/includes/stg/Ticky.h index 2ede8ebdf9..a811aec4eb 100644 --- a/includes/stg/Ticky.h +++ b/includes/stg/Ticky.h @@ -190,7 +190,8 @@ EXTERN StgInt RET_SEMI_loads_avoided INIT(0); #define TICK_UPD_SQUEEZED() #define TICK_ALLOC_HEAP_NOCTR(x) #define TICK_GC_FAILED_PROMOTION() -#define TICK_ALLOC_TSO(g,s) +#define TICK_ALLOC_TSO() +#define TICK_ALLOC_STACK(g) #define TICK_ALLOC_UP_THK(g,s) #define TICK_ALLOC_SE_THK(g,s) diff --git a/rts/Apply.cmm b/rts/Apply.cmm index 9af9b11a97..f9ac3b353c 100644 --- a/rts/Apply.cmm +++ b/rts/Apply.cmm @@ -350,3 +350,56 @@ for: ENTER(); } + +/* ----------------------------------------------------------------------------- + AP_STACK_NOUPD - exactly like AP_STACK, but doesn't push an update frame. + -------------------------------------------------------------------------- */ + +INFO_TABLE(stg_AP_STACK_NOUPD,/*special layout*/0,0,AP_STACK, + "AP_STACK_NOUPD","AP_STACK_NOUPD") +{ + W_ Words; + W_ ap; + + ap = R1; + + Words = StgAP_STACK_size(ap); + + /* + * Check for stack overflow. IMPORTANT: use a _NP check here, + * because if the check fails, we might end up blackholing this very + * closure, in which case we must enter the blackhole on return rather + * than continuing to evaluate the now-defunct closure. + */ + STK_CHK_NP(WDS(Words) + WDS(AP_STACK_SPLIM)); + /* ensure there is at least AP_STACK_SPLIM words of headroom available + * after unpacking the AP_STACK. 
See bug #1466 */ + + Sp = Sp - WDS(Words); + + TICK_ENT_AP(); + LDV_ENTER(ap); + + // Enter PAP cost centre + ENTER_CCS_PAP_CL(ap); // ToDo: ENTER_CC_AP_CL + + // Reload the stack + W_ i; + W_ p; + p = ap + SIZEOF_StgHeader + OFFSET_StgAP_STACK_payload; + i = 0; +for: + if (i < Words) { + Sp(i) = W_[p]; + p = p + WDS(1); + i = i + 1; + goto for; + } + + // Off we go! + TICK_ENT_VIA_NODE(); + + R1 = StgAP_STACK_fun(ap); + + ENTER(); +} diff --git a/rts/ClosureFlags.c b/rts/ClosureFlags.c index d5181cae22..41810f4025 100644 --- a/rts/ClosureFlags.c +++ b/rts/ClosureFlags.c @@ -59,8 +59,9 @@ StgWord16 closure_flags[] = { [RET_FUN] = ( 0 ), [UPDATE_FRAME] = ( _BTM ), [CATCH_FRAME] = ( _BTM ), - [STOP_FRAME] = ( _BTM ), - [BLACKHOLE] = ( _NS| _UPT ), + [UNDERFLOW_FRAME] = ( _BTM ), + [STOP_FRAME] = ( _BTM ), + [BLACKHOLE] = ( _NS| _UPT ), [BLOCKING_QUEUE] = ( _NS| _MUT|_UPT ), [MVAR_CLEAN] = (_HNF| _NS| _MUT|_UPT ), [MVAR_DIRTY] = (_HNF| _NS| _MUT|_UPT ), @@ -74,7 +75,8 @@ StgWord16 closure_flags[] = { [WEAK] = (_HNF| _NS| _UPT ), [PRIM] = (_HNF| _NS| _UPT ), [MUT_PRIM] = (_HNF| _NS| _MUT|_UPT ), - [TSO] = (_HNF| _NS| _MUT|_UPT ), + [TSO] = (_HNF| _NS| _MUT|_UPT ), + [STACK] = (_HNF| _NS| _MUT|_UPT ), [TREC_CHUNK] = ( _NS| _MUT|_UPT ), [ATOMICALLY_FRAME] = ( _BTM ), [CATCH_RETRY_FRAME] = ( _BTM ), @@ -82,6 +84,6 @@ StgWord16 closure_flags[] = { [WHITEHOLE] = ( 0 ) }; -#if N_CLOSURE_TYPES != 59 +#if N_CLOSURE_TYPES != 61 #error Closure types changed: update ClosureFlags.c! #endif diff --git a/rts/Exception.cmm b/rts/Exception.cmm index 581dafdf4f..24da1c690e 100644 --- a/rts/Exception.cmm +++ b/rts/Exception.cmm @@ -283,11 +283,6 @@ stg_killThreadzh * If the exception went to a catch frame, we'll just continue from * the handler. */ - loop: - if (StgTSO_what_next(target) == ThreadRelocated::I16) { - target = StgTSO__link(target); - goto loop; - } if (target == CurrentTSO) { /* * So what should happen if a thread calls "throwTo self" inside @@ -436,9 +431,9 @@ stg_raisezh #endif retry_pop_stack: - StgTSO_sp(CurrentTSO) = Sp; + SAVE_THREAD_STATE(); (frame_type) = foreign "C" raiseExceptionHelper(BaseReg "ptr", CurrentTSO "ptr", exception "ptr") []; - Sp = StgTSO_sp(CurrentTSO); + LOAD_THREAD_STATE(); if (frame_type == ATOMICALLY_FRAME) { /* The exception has reached the edge of a memory transaction. Check that * the transaction is valid. If not then perhaps the exception should @@ -511,8 +506,10 @@ retry_pop_stack: * We will leave the stack in a GC'able state, see the stg_stop_thread * entry code in StgStartup.cmm. */ - Sp = CurrentTSO + TSO_OFFSET_StgTSO_stack - + WDS(TO_W_(StgTSO_stack_size(CurrentTSO))) - WDS(2); + W_ stack; + stack = StgTSO_stackobj(CurrentTSO); + Sp = stack + OFFSET_StgStack_stack + + WDS(TO_W_(StgStack_stack_size(stack))) - WDS(2); Sp(1) = exception; /* save the exception */ Sp(0) = stg_enter_info; /* so that GC can traverse this stack */ StgTSO_what_next(CurrentTSO) = ThreadKilled::I16; diff --git a/rts/Interpreter.c b/rts/Interpreter.c index fa4a46fd12..ade4ad18ed 100644 --- a/rts/Interpreter.c +++ b/rts/Interpreter.c @@ -65,13 +65,13 @@ #define BCO_LIT(n) literals[n] #define LOAD_STACK_POINTERS \ - Sp = cap->r.rCurrentTSO->sp; \ + Sp = cap->r.rCurrentTSO->stackobj->sp; \ /* We don't change this ... 
*/ \ - SpLim = cap->r.rCurrentTSO->stack + RESERVED_STACK_WORDS; + SpLim = tso_SpLim(cap->r.rCurrentTSO); #define SAVE_STACK_POINTERS \ ASSERT(Sp > SpLim); \ - cap->r.rCurrentTSO->sp = Sp + cap->r.rCurrentTSO->stackobj->sp = Sp #define RETURN_TO_SCHEDULER(todo,retcode) \ SAVE_STACK_POINTERS; \ @@ -266,7 +266,7 @@ eval_obj: debugBelch("Sp = %p\n", Sp); debugBelch("\n" ); - printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size); + printStackChunk(Sp,cap->r.rCurrentTSO->stackobj->stack+cap->r.rCurrentTSO->stackobj->stack_size); debugBelch("\n\n"); ); @@ -381,11 +381,11 @@ do_return: debugBelch("Returning: "); printObj(obj); debugBelch("Sp = %p\n", Sp); debugBelch("\n" ); - printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size); + printStackChunk(Sp,cap->r.rCurrentTSO->stackobj->stack+cap->r.rCurrentTSO->stackobj->stack_size); debugBelch("\n\n"); ); - IF_DEBUG(sanity,checkStackChunk(Sp, cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size)); + IF_DEBUG(sanity,checkStackChunk(Sp, cap->r.rCurrentTSO->stackobj->stack+cap->r.rCurrentTSO->stackobj->stack_size)); switch (get_itbl((StgClosure *)Sp)->type) { @@ -466,7 +466,7 @@ do_return: INTERP_TICK(it_retto_other); IF_DEBUG(interpreter, debugBelch("returning to unknown frame -- yielding to sched\n"); - printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size); + printStackChunk(Sp,cap->r.rCurrentTSO->stackobj->stack+cap->r.rCurrentTSO->stackobj->stack_size); ); Sp -= 2; Sp[1] = (W_)tagged_obj; @@ -529,8 +529,8 @@ do_return_unboxed: INTERP_TICK(it_retto_other); IF_DEBUG(interpreter, debugBelch("returning to unknown frame -- yielding to sched\n"); - printStackChunk(Sp,cap->r.rCurrentTSO->stack+cap->r.rCurrentTSO->stack_size); - ); + printStackChunk(Sp,cap->r.rCurrentTSO->stackobj->stack+cap->r.rCurrentTSO->stackobj->stack_size); + ); RETURN_TO_SCHEDULER_NO_PAUSE(ThreadRunGHC, ThreadYielding); } } diff --git a/rts/LdvProfile.c b/rts/LdvProfile.c index 021ecf0846..acec0572d7 100644 --- a/rts/LdvProfile.c +++ b/rts/LdvProfile.c @@ -168,6 +168,7 @@ processHeapClosureForDead( StgClosure *c ) // stack objects case UPDATE_FRAME: case CATCH_FRAME: + case UNDERFLOW_FRAME: case STOP_FRAME: case RET_DYN: case RET_BCO: diff --git a/rts/Messages.c b/rts/Messages.c index 5e0fa2544f..1730278930 100644 --- a/rts/Messages.c +++ b/rts/Messages.c @@ -98,11 +98,13 @@ loop: r = throwToMsg(cap, t); switch (r) { - case THROWTO_SUCCESS: + case THROWTO_SUCCESS: { // this message is done - unlockClosure((StgClosure*)m, &stg_MSG_NULL_info); - tryWakeupThread(cap, t->source); + StgTSO *source = t->source; + doneWithMsgThrowTo(t); + tryWakeupThread(cap, source); break; + } case THROWTO_BLOCKED: // unlock the message unlockClosure((StgClosure*)m, &stg_MSG_THROWTO_info); @@ -203,7 +205,7 @@ loop: else if (info == &stg_TSO_info) { - owner = deRefTSO((StgTSO *)p); + owner = (StgTSO*)p; #ifdef THREADED_RTS if (owner->cap != cap) { @@ -265,7 +267,7 @@ loop: ASSERT(bq->bh == bh); - owner = deRefTSO(bq->owner); + owner = bq->owner; ASSERT(owner != END_TSO_QUEUE); diff --git a/rts/Messages.h b/rts/Messages.h index 54650fd018..febb839ee9 100644 --- a/rts/Messages.h +++ b/rts/Messages.h @@ -15,4 +15,15 @@ void executeMessage (Capability *cap, Message *m); void sendMessage (Capability *from_cap, Capability *to_cap, Message *msg); #endif +#include "Capability.h" +#include "Updates.h" // for DEBUG_FILL_SLOP + +INLINE_HEADER void +doneWithMsgThrowTo (MessageThrowTo *m) +{ + OVERWRITING_CLOSURE((StgClosure*)m); + 
unlockClosure((StgClosure*)m, &stg_MSG_NULL_info); + LDV_RECORD_CREATE(m); +} + #include "EndPrivate.h" diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index 0cf26b2b7f..4f6c2526d4 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -634,11 +634,6 @@ stg_threadStatuszh W_ ret; tso = R1; - loop: - if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) { - tso = StgTSO__link(tso); - goto loop; - } what_next = TO_W_(StgTSO_what_next(tso)); why_blocked = TO_W_(StgTSO_why_blocked(tso)); @@ -939,9 +934,9 @@ stg_retryzh // Find the enclosing ATOMICALLY_FRAME or CATCH_RETRY_FRAME retry_pop_stack: - StgTSO_sp(CurrentTSO) = Sp; - (frame_type) = foreign "C" findRetryFrameHelper(CurrentTSO "ptr") []; - Sp = StgTSO_sp(CurrentTSO); + SAVE_THREAD_STATE(); + (frame_type) = foreign "C" findRetryFrameHelper(MyCapability(), CurrentTSO "ptr") []; + LOAD_THREAD_STATE(); frame = Sp; trec = StgTSO_trec(CurrentTSO); outer = StgTRecHeader_enclosing_trec(trec); @@ -1138,13 +1133,13 @@ stg_newMVarzh } -#define PerformTake(tso, value) \ - W_[StgTSO_sp(tso) + WDS(1)] = value; \ - W_[StgTSO_sp(tso) + WDS(0)] = stg_gc_unpt_r1_info; +#define PerformTake(stack, value) \ + W_[StgStack_sp(stack) + WDS(1)] = value; \ + W_[StgStack_sp(stack) + WDS(0)] = stg_gc_unpt_r1_info; -#define PerformPut(tso,lval) \ - StgTSO_sp(tso) = StgTSO_sp(tso) + WDS(3); \ - lval = W_[StgTSO_sp(tso) - WDS(1)]; +#define PerformPut(stack,lval) \ + StgStack_sp(stack) = StgStack_sp(stack) + WDS(3); \ + lval = W_[StgStack_sp(stack) - WDS(1)]; stg_takeMVarzh { @@ -1224,24 +1219,20 @@ loop: StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure; } -loop2: - if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) { - tso = StgTSO__link(tso); - goto loop2; - } - ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16); ASSERT(StgTSO_block_info(tso) == mvar); // actually perform the putMVar for the thread that we just woke up - PerformPut(tso,StgMVar_value(mvar)); + W_ stack; + stack = StgTSO_stackobj(tso); + PerformPut(stack, StgMVar_value(mvar)); // indicate that the MVar operation has now completed. StgTSO__link(tso) = stg_END_TSO_QUEUE_closure; // no need to mark the TSO dirty, we have only written END_TSO_QUEUE. - foreign "C" tryWakeupThread_(MyCapability() "ptr", tso) []; + foreign "C" tryWakeupThread(MyCapability() "ptr", tso) []; unlockClosure(mvar, stg_MVAR_DIRTY_info); RET_P(val); @@ -1303,24 +1294,20 @@ loop: StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure; } -loop2: - if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) { - tso = StgTSO__link(tso); - goto loop2; - } - ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16); ASSERT(StgTSO_block_info(tso) == mvar); // actually perform the putMVar for the thread that we just woke up - PerformPut(tso,StgMVar_value(mvar)); + W_ stack; + stack = StgTSO_stackobj(tso); + PerformPut(stack, StgMVar_value(mvar)); // indicate that the MVar operation has now completed. StgTSO__link(tso) = stg_END_TSO_QUEUE_closure; // no need to mark the TSO dirty, we have only written END_TSO_QUEUE. 
- foreign "C" tryWakeupThread_(MyCapability() "ptr", tso) []; + foreign "C" tryWakeupThread(MyCapability() "ptr", tso) []; unlockClosure(mvar, stg_MVAR_DIRTY_info); RET_NP(1,val); @@ -1395,26 +1382,22 @@ loop: StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure; } -loop2: - if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) { - tso = StgTSO__link(tso); - goto loop2; - } - ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16); ASSERT(StgTSO_block_info(tso) == mvar); // actually perform the takeMVar - PerformTake(tso, val); + W_ stack; + stack = StgTSO_stackobj(tso); + PerformTake(stack, val); // indicate that the MVar operation has now completed. StgTSO__link(tso) = stg_END_TSO_QUEUE_closure; - - if (TO_W_(StgTSO_dirty(tso)) == 0) { - foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") []; + + if (TO_W_(StgStack_dirty(stack)) == 0) { + foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") []; } - foreign "C" tryWakeupThread_(MyCapability() "ptr", tso) []; + foreign "C" tryWakeupThread(MyCapability() "ptr", tso) []; unlockClosure(mvar, stg_MVAR_DIRTY_info); jump %ENTRY_CODE(Sp(0)); @@ -1468,26 +1451,22 @@ loop: StgMVar_tail(mvar) = stg_END_TSO_QUEUE_closure; } -loop2: - if (TO_W_(StgTSO_what_next(tso)) == ThreadRelocated) { - tso = StgTSO__link(tso); - goto loop2; - } - ASSERT(StgTSO_why_blocked(tso) == BlockedOnMVar::I16); ASSERT(StgTSO_block_info(tso) == mvar); // actually perform the takeMVar - PerformTake(tso, val); + W_ stack; + stack = StgTSO_stackobj(tso); + PerformTake(stack, val); // indicate that the MVar operation has now completed. StgTSO__link(tso) = stg_END_TSO_QUEUE_closure; - if (TO_W_(StgTSO_dirty(tso)) == 0) { - foreign "C" dirty_TSO(MyCapability() "ptr", tso "ptr") []; + if (TO_W_(StgStack_dirty(stack)) == 0) { + foreign "C" dirty_STACK(MyCapability() "ptr", stack "ptr") []; } - foreign "C" tryWakeupThread_(MyCapability() "ptr", tso) []; + foreign "C" tryWakeupThread(MyCapability() "ptr", tso) []; unlockClosure(mvar, stg_MVAR_DIRTY_info); RET_N(1); diff --git a/rts/Printer.c b/rts/Printer.c index 565a11ed77..fcc483dce6 100644 --- a/rts/Printer.c +++ b/rts/Printer.c @@ -276,6 +276,15 @@ printClosure( StgClosure *obj ) break; } + case UNDERFLOW_FRAME: + { + StgUnderflowFrame* u = (StgUnderflowFrame*)obj; + debugBelch("UNDERFLOW_FRAME("); + printPtr((StgPtr)u->next_chunk); + debugBelch(")\n"); + break; + } + case STOP_FRAME: { StgStopFrame* u = (StgStopFrame*)obj; @@ -461,13 +470,11 @@ printStackChunk( StgPtr sp, StgPtr spBottom ) case UPDATE_FRAME: case CATCH_FRAME: - printObj((StgClosure*)sp); + case UNDERFLOW_FRAME: + case STOP_FRAME: + printObj((StgClosure*)sp); continue; - case STOP_FRAME: - printObj((StgClosure*)sp); - return; - case RET_DYN: { StgRetDyn* r; @@ -559,7 +566,8 @@ printStackChunk( StgPtr sp, StgPtr spBottom ) void printTSO( StgTSO *tso ) { - printStackChunk( tso->sp, tso->stack+tso->stack_size); + printStackChunk( tso->stackobj->sp, + tso->stackobj->stack+tso->stackobj->stack_size); } /* -------------------------------------------------------------------------- @@ -1039,7 +1047,6 @@ char *what_next_strs[] = { [ThreadRunGHC] = "ThreadRunGHC", [ThreadInterpret] = "ThreadInterpret", [ThreadKilled] = "ThreadKilled", - [ThreadRelocated] = "ThreadRelocated", [ThreadComplete] = "ThreadComplete" }; @@ -1102,6 +1109,7 @@ char *closure_type_names[] = { [RET_FUN] = "RET_FUN", [UPDATE_FRAME] = "UPDATE_FRAME", [CATCH_FRAME] = "CATCH_FRAME", + [UNDERFLOW_FRAME] = "UNDERFLOW_FRAME", [STOP_FRAME] = "STOP_FRAME", [BLACKHOLE] = "BLACKHOLE", [BLOCKING_QUEUE] = 
"BLOCKING_QUEUE", @@ -1118,6 +1126,7 @@ char *closure_type_names[] = { [PRIM] = "PRIM", [MUT_PRIM] = "MUT_PRIM", [TSO] = "TSO", + [STACK] = "STACK", [TREC_CHUNK] = "TREC_CHUNK", [ATOMICALLY_FRAME] = "ATOMICALLY_FRAME", [CATCH_RETRY_FRAME] = "CATCH_RETRY_FRAME", diff --git a/rts/ProfHeap.c b/rts/ProfHeap.c index d398afd66f..39b64d4c51 100644 --- a/rts/ProfHeap.c +++ b/rts/ProfHeap.c @@ -947,19 +947,35 @@ heapCensusChain( Census *census, bdescr *bd ) prim = rtsTrue; #ifdef PROFILING if (RtsFlags.ProfFlags.includeTSOs) { - size = tso_sizeW((StgTSO *)p); + size = sizeofW(StgTSO); break; } else { // Skip this TSO and move on to the next object - p += tso_sizeW((StgTSO *)p); + p += sizeofW(StgTSO); continue; } #else - size = tso_sizeW((StgTSO *)p); + size = sizeofW(StgTSO); break; #endif - case TREC_CHUNK: + case STACK: + prim = rtsTrue; +#ifdef PROFILING + if (RtsFlags.ProfFlags.includeTSOs) { + size = stack_sizeW((StgStack*)p); + break; + } else { + // Skip this TSO and move on to the next object + p += stack_sizeW((StgStack*)p); + continue; + } +#else + size = stack_sizeW((StgStack*)p); + break; +#endif + + case TREC_CHUNK: prim = rtsTrue; size = sizeofW(StgTRecChunk); break; diff --git a/rts/ProfHeap.h b/rts/ProfHeap.h index 48b5bafa48..c4a92e200b 100644 --- a/rts/ProfHeap.h +++ b/rts/ProfHeap.h @@ -14,7 +14,6 @@ void heapCensus (void); nat initHeapProfiling (void); void endHeapProfiling (void); -void LDV_recordDead (StgClosure *c, nat size); rtsBool strMatchesSelector (char* str, char* sel); #include "EndPrivate.h" diff --git a/rts/RaiseAsync.c b/rts/RaiseAsync.c index 7abccde0a5..550f703e8d 100644 --- a/rts/RaiseAsync.c +++ b/rts/RaiseAsync.c @@ -23,11 +23,11 @@ #include "win32/IOManager.h" #endif -static void raiseAsync (Capability *cap, - StgTSO *tso, - StgClosure *exception, - rtsBool stop_at_atomically, - StgUpdateFrame *stop_here); +static StgTSO* raiseAsync (Capability *cap, + StgTSO *tso, + StgClosure *exception, + rtsBool stop_at_atomically, + StgUpdateFrame *stop_here); static void removeFromQueues(Capability *cap, StgTSO *tso); @@ -61,11 +61,9 @@ static void throwToSingleThreaded__ (Capability *cap, StgTSO *tso, StgClosure *exception, rtsBool stop_at_atomically, StgUpdateFrame *stop_here) { - tso = deRefTSO(tso); - // Thread already dead? if (tso->what_next == ThreadComplete || tso->what_next == ThreadKilled) { - return; + return; } // Remove it from any blocking queues @@ -81,13 +79,13 @@ throwToSingleThreaded (Capability *cap, StgTSO *tso, StgClosure *exception) } void -throwToSingleThreaded_ (Capability *cap, StgTSO *tso, StgClosure *exception, +throwToSingleThreaded_ (Capability *cap, StgTSO *tso, StgClosure *exception, rtsBool stop_at_atomically) { throwToSingleThreaded__ (cap, tso, exception, stop_at_atomically, NULL); } -void +void // cannot return a different TSO suspendComputation (Capability *cap, StgTSO *tso, StgUpdateFrame *stop_here) { throwToSingleThreaded__ (cap, tso, NULL, rtsFalse, stop_here); @@ -192,9 +190,6 @@ retry: check_target: ASSERT(target != END_TSO_QUEUE); - // follow ThreadRelocated links in the target first - target = deRefTSO(target); - // Thread already dead? if (target->what_next == ThreadComplete || target->what_next == ThreadKilled) { @@ -268,7 +263,7 @@ check_target: // might as well just do it now. The message will // be a no-op when it arrives. 
unlockClosure((StgClosure*)m, i); - tryWakeupThread_(cap, target); + tryWakeupThread(cap, target); goto retry; } @@ -286,7 +281,7 @@ check_target: } // nobody else can wake up this TSO after we claim the message - unlockClosure((StgClosure*)m, &stg_MSG_NULL_info); + doneWithMsgThrowTo(m); raiseAsync(cap, target, msg->exception, rtsFalse, NULL); return THROWTO_SUCCESS; @@ -315,12 +310,7 @@ check_target: info = lockClosure((StgClosure *)mvar); - if (target->what_next == ThreadRelocated) { - target = target->_link; - unlockClosure((StgClosure *)mvar,info); - goto retry; - } - // we have the MVar, let's check whether the thread + // we have the MVar, let's check whether the thread // is still blocked on the same MVar. if (target->why_blocked != BlockedOnMVar || (StgMVar *)target->block_info.closure != mvar) { @@ -334,7 +324,7 @@ check_target: // thread now anyway and ignore the message when it // arrives. unlockClosure((StgClosure *)mvar, info); - tryWakeupThread_(cap, target); + tryWakeupThread(cap, target); goto retry; } @@ -505,7 +495,8 @@ maybePerformBlockedException (Capability *cap, StgTSO *tso) { MessageThrowTo *msg; const StgInfoTable *i; - + StgTSO *source; + if (tso->what_next == ThreadComplete || tso->what_next == ThreadFinished) { if (tso->blocked_exceptions != END_BLOCKED_EXCEPTIONS_QUEUE) { awakenBlockedExceptionQueue(cap,tso); @@ -537,8 +528,9 @@ maybePerformBlockedException (Capability *cap, StgTSO *tso) } throwToSingleThreaded(cap, msg->target, msg->exception); - unlockClosure((StgClosure*)msg,&stg_MSG_NULL_info); - tryWakeupThread(cap, msg->source); + source = msg->source; + doneWithMsgThrowTo(msg); + tryWakeupThread(cap, source); return 1; } return 0; @@ -552,13 +544,15 @@ awakenBlockedExceptionQueue (Capability *cap, StgTSO *tso) { MessageThrowTo *msg; const StgInfoTable *i; + StgTSO *source; for (msg = tso->blocked_exceptions; msg != END_BLOCKED_EXCEPTIONS_QUEUE; msg = (MessageThrowTo*)msg->link) { i = lockClosure((StgClosure *)msg); if (i != &stg_MSG_NULL_info) { - unlockClosure((StgClosure *)msg,&stg_MSG_NULL_info); - tryWakeupThread(cap, msg->source); + source = msg->source; + doneWithMsgThrowTo(msg); + tryWakeupThread(cap, source); } else { unlockClosure((StgClosure *)msg,i); } @@ -653,7 +647,7 @@ removeFromQueues(Capability *cap, StgTSO *tso) // ASSERT(m->header.info == &stg_WHITEHOLE_info); // unlock and revoke it at the same time - unlockClosure((StgClosure*)m,&stg_MSG_NULL_info); + doneWithMsgThrowTo(m); break; } @@ -724,7 +718,7 @@ removeFromQueues(Capability *cap, StgTSO *tso) * * -------------------------------------------------------------------------- */ -static void +static StgTSO * raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, rtsBool stop_at_atomically, StgUpdateFrame *stop_here) { @@ -732,6 +726,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, StgPtr sp, frame; StgClosure *updatee; nat i; + StgStack *stack; debugTraceCap(DEBUG_sched, cap, "raising exception in thread %ld.", (long)tso->id); @@ -747,25 +742,21 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, fprintCCS_stderr(tso->prof.CCCS); } #endif - // ASSUMES: the thread is not already complete or dead, or - // ThreadRelocated. Upper layers should deal with that. + // ASSUMES: the thread is not already complete or dead + // Upper layers should deal with that. 
ASSERT(tso->what_next != ThreadComplete && - tso->what_next != ThreadKilled && - tso->what_next != ThreadRelocated); + tso->what_next != ThreadKilled); // only if we own this TSO (except that deleteThread() calls this ASSERT(tso->cap == cap); - // wake it up - if (tso->why_blocked != NotBlocked) { - tso->why_blocked = NotBlocked; - appendToRunQueue(cap,tso); - } + stack = tso->stackobj; // mark it dirty; we're about to change its stack. dirty_TSO(cap, tso); + dirty_STACK(cap, stack); - sp = tso->sp; + sp = stack->sp; if (stop_here != NULL) { updatee = stop_here->updatee; @@ -801,10 +792,13 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, // // 5. If it's a STOP_FRAME, then kill the thread. // - // NB: if we pass an ATOMICALLY_FRAME then abort the associated + // 6. If it's an UNDERFLOW_FRAME, then continue with the next + // stack chunk. + // + // NB: if we pass an ATOMICALLY_FRAME then abort the associated // transaction - info = get_ret_itbl((StgClosure *)frame); + info = get_ret_itbl((StgClosure *)frame); switch (info->i.type) { @@ -859,12 +853,46 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, continue; //no need to bump frame } - case STOP_FRAME: + case UNDERFLOW_FRAME: + { + StgAP_STACK * ap; + nat words; + + // First build an AP_STACK consisting of the stack chunk above the + // current update frame, with the top word on the stack as the + // fun field. + // + words = frame - sp - 1; + ap = (StgAP_STACK *)allocate(cap,AP_STACK_sizeW(words)); + + ap->size = words; + ap->fun = (StgClosure *)sp[0]; + sp++; + for(i=0; i < (nat)words; ++i) { + ap->payload[i] = (StgClosure *)*sp++; + } + + SET_HDR(ap,&stg_AP_STACK_NOUPD_info, + ((StgClosure *)frame)->header.prof.ccs /* ToDo */); + TICK_ALLOC_SE_THK(words+1,0); + + stack->sp = sp; + threadStackUnderflow(cap,tso); + stack = tso->stackobj; + sp = stack->sp; + + sp--; + sp[0] = (W_)ap; + frame = sp + 1; + continue; + } + + case STOP_FRAME: { // We've stripped the entire stack, the thread is now dead. tso->what_next = ThreadKilled; - tso->sp = frame + sizeofW(StgStopFrame); - return; + stack->sp = frame + sizeofW(StgStopFrame); + goto done; } case CATCH_FRAME: @@ -906,17 +934,16 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, */ sp[0] = (W_)raise; sp[-1] = (W_)&stg_enter_info; - tso->sp = sp-1; + stack->sp = sp-1; tso->what_next = ThreadRunGHC; - IF_DEBUG(sanity, checkTSO(tso)); - return; + goto done; } case ATOMICALLY_FRAME: if (stop_at_atomically) { ASSERT(tso->trec->enclosing_trec == NO_TREC); stmCondemnTransaction(cap, tso -> trec); - tso->sp = frame - 2; + stack->sp = frame - 2; // The ATOMICALLY_FRAME expects to be returned a // result from the transaction, which it stores in the // stack frame. Hence we arrange to return a dummy @@ -925,10 +952,10 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, // ATOMICALLY_FRAME instance for condemned // transactions, but I don't fully understand the // interaction with STM invariants. - tso->sp[1] = (W_)&stg_NO_TREC_closure; - tso->sp[0] = (W_)&stg_gc_unpt_r1_info; - tso->what_next = ThreadRunGHC; - return; + stack->sp[1] = (W_)&stg_NO_TREC_closure; + stack->sp[0] = (W_)&stg_gc_unpt_r1_info; + tso->what_next = ThreadRunGHC; + goto done; } // Not stop_at_atomically... fall through and abort the // transaction. 
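The UNDERFLOW_FRAME case above shows the pattern that every stack traversal in the RTS must now follow: a walk no longer ends at tso->stack + tso->stack_size but hops to next_chunk until it reaches a STOP_FRAME. A minimal sketch of a chunk-aware walk, using the existing helpers seen above (visitFrame is a hypothetical callback, not part of this commit):

    static void walkThreadStack (StgTSO *tso)
    {
        StgStack *stack = tso->stackobj;
        StgPtr frame = stack->sp;

        for (;;) {
            const StgRetInfoTable *info = get_ret_itbl((StgClosure *)frame);
            switch (info->i.type) {
            case UNDERFLOW_FRAME:
                // continue the walk in the next chunk
                stack = ((StgUnderflowFrame *)frame)->next_chunk;
                frame = stack->sp;
                break;
            case STOP_FRAME:
                return;                    // bottom of the whole stack
            default:
                visitFrame((StgClosure *)frame);
                frame += stack_frame_sizeW((StgClosure *)frame);
                break;
            }
        }
    }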
@@ -950,7 +977,7 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, stmAbortTransaction(cap, trec); stmFreeAbortedTRec(cap, trec); tso -> trec = outer; - break; + break; }; default: @@ -961,8 +988,16 @@ raiseAsync(Capability *cap, StgTSO *tso, StgClosure *exception, frame += stack_frame_sizeW((StgClosure *)frame); } - // if we got here, then we stopped at stop_here - ASSERT(stop_here != NULL); +done: + IF_DEBUG(sanity, checkTSO(tso)); + + // wake it up + if (tso->why_blocked != NotBlocked) { + tso->why_blocked = NotBlocked; + appendToRunQueue(cap,tso); + } + + return tso; } diff --git a/rts/RetainerProfile.c b/rts/RetainerProfile.c index c5a7bf7897..48473d2480 100644 --- a/rts/RetainerProfile.c +++ b/rts/RetainerProfile.c @@ -597,11 +597,13 @@ push( StgClosure *c, retainer c_child_r, StgClosure **first_child ) case AP: case AP_STACK: case TSO: + case STACK: case IND_STATIC: case CONSTR_NOCAF_STATIC: // stack objects case UPDATE_FRAME: case CATCH_FRAME: + case UNDERFLOW_FRAME: case STOP_FRAME: case RET_DYN: case RET_BCO: @@ -925,13 +927,15 @@ pop( StgClosure **c, StgClosure **cp, retainer *r ) case AP: case AP_STACK: case TSO: - case IND_STATIC: + case STACK: + case IND_STATIC: case CONSTR_NOCAF_STATIC: // stack objects case RET_DYN: case UPDATE_FRAME: case CATCH_FRAME: - case STOP_FRAME: + case UNDERFLOW_FRAME: + case STOP_FRAME: case RET_BCO: case RET_SMALL: case RET_BIG: @@ -1001,6 +1005,7 @@ isRetainer( StgClosure *c ) // // TSOs MUST be retainers: they constitute the set of roots. case TSO: + case STACK: // mutable objects case MUT_PRIM: @@ -1080,6 +1085,7 @@ isRetainer( StgClosure *c ) // legal objects during retainer profiling. case UPDATE_FRAME: case CATCH_FRAME: + case UNDERFLOW_FRAME: case STOP_FRAME: case RET_DYN: case RET_BCO: @@ -1257,8 +1263,8 @@ retainSRT (StgClosure **srt, nat srt_bitmap, StgClosure *c, retainer c_child_r) * RSET(c) and RSET(c_child_r) are valid, i.e., their * interpretation conforms to the current value of flip (even when they * are interpreted to be NULL). - * If *c is TSO, its state is not any of ThreadRelocated, ThreadComplete, - * or ThreadKilled, which means that its stack is ready to process. + * If *c is TSO, its state is not ThreadComplete,or ThreadKilled, + * which means that its stack is ready to process. * Note: * This code was almost plagiarzied from GC.c! For each pointer, * retainClosure() is invoked instead of evacuate(). @@ -1291,11 +1297,8 @@ retainStack( StgClosure *c, retainer c_child_r, // debugBelch("retainStack() called: oldStackBoundary = 0x%x, currentStackBoundary = 0x%x\n", oldStackBoundary, currentStackBoundary); #endif - ASSERT(get_itbl(c)->type != TSO || - (((StgTSO *)c)->what_next != ThreadRelocated && - ((StgTSO *)c)->what_next != ThreadComplete && - ((StgTSO *)c)->what_next != ThreadKilled)); - + ASSERT(get_itbl(c)->type == STACK); + p = stackStart; while (p < stackEnd) { info = get_ret_itbl((StgClosure *)p); @@ -1307,7 +1310,8 @@ retainStack( StgClosure *c, retainer c_child_r, p += sizeofW(StgUpdateFrame); continue; - case STOP_FRAME: + case UNDERFLOW_FRAME: + case STOP_FRAME: case CATCH_FRAME: case CATCH_STM_FRAME: case CATCH_RETRY_FRAME: @@ -1560,14 +1564,7 @@ inner_loop: #endif goto loop; } - if (((StgTSO *)c)->what_next == ThreadRelocated) { -#ifdef DEBUG_RETAINER - debugBelch("ThreadRelocated encountered in retainClosure()\n"); -#endif - c = (StgClosure *)((StgTSO *)c)->_link; - goto inner_loop; - } - break; + break; case IND_STATIC: // We just skip IND_STATIC, so its retainer set is never computed. 
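The same mechanical migration repeats through the profiler and API code in the remaining hunks: every direct use of tso->sp, tso->stack and tso->stack_size becomes an access through tso->stackobj. The idiom, as a sketch (checkStackChunk is the existing sanity helper already used in Interpreter.c above):

    StgStack *stack  = tso->stackobj;
    StgPtr    sp     = stack->sp;
    StgPtr    bottom = stack->stack + stack->stack_size;
    checkStackChunk(sp, bottom);   // note: bounds cover one chunk, not the
                                   // thread's whole (chained) stack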
@@ -1681,10 +1678,10 @@ inner_loop: // than attempting to save the current position, because doing so // would be hard. switch (typeOfc) { - case TSO: + case STACK: retainStack(c, c_child_r, - ((StgTSO *)c)->sp, - ((StgTSO *)c)->stack + ((StgTSO *)c)->stack_size); + ((StgStack *)c)->sp, + ((StgStack *)c)->stack + ((StgStack *)c)->stack_size); goto loop; case PAP: diff --git a/rts/RtsAPI.c b/rts/RtsAPI.c index 53628dc61a..8fcf8ce812 100644 --- a/rts/RtsAPI.c +++ b/rts/RtsAPI.c @@ -375,8 +375,8 @@ rts_getBool (HaskellObj p) -------------------------------------------------------------------------- */ INLINE_HEADER void pushClosure (StgTSO *tso, StgWord c) { - tso->sp--; - tso->sp[0] = (W_) c; + tso->stackobj->sp--; + tso->stackobj->sp[0] = (W_) c; } StgTSO * diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c index c11cc3e925..b0dd42b38c 100644 --- a/rts/RtsFlags.c +++ b/rts/RtsFlags.c @@ -69,6 +69,8 @@ void initRtsFlagsDefaults(void) RtsFlags.GcFlags.maxStkSize = (8 * 1024 * 1024) / sizeof(W_); RtsFlags.GcFlags.initialStkSize = 1024 / sizeof(W_); + RtsFlags.GcFlags.stkChunkSize = (32 * 1024) / sizeof(W_); + RtsFlags.GcFlags.stkChunkBufferSize = (1 * 1024) / sizeof(W_); RtsFlags.GcFlags.minAllocAreaSize = (512 * 1024) / BLOCK_SIZE; RtsFlags.GcFlags.minOldGenSize = (1024 * 1024) / BLOCK_SIZE; @@ -194,7 +196,9 @@ usage_text[] = { " --info Print information about the RTS used by this program", "", " -K<size> Sets the maximum stack size (default 8M) Egs: -K32k -K512k", -" -k<size> Sets the initial thread stack size (default 1k) Egs: -k4k -k2m", +" -ki<size> Sets the initial thread stack size (default 1k) Egs: -ki4k -ki2m", +" -kc<size> Sets the stack chunk size (default 32k)", +" -kb<size> Sets the stack chunk buffer size (default 1k)", "", " -A<size> Sets the minimum allocation area size (default 512k) Egs: -A1m -A10k", " -M<size> Sets the maximum heap size (default unlimited) Egs: -M256k -M1G", @@ -693,15 +697,31 @@ error = rtsTrue; case 'K': RtsFlags.GcFlags.maxStkSize = - decodeSize(rts_argv[arg], 2, 1, HS_WORD_MAX) / sizeof(W_); + decodeSize(rts_argv[arg], 2, sizeof(W_), HS_WORD_MAX) / sizeof(W_); break; case 'k': + switch(rts_argv[arg][2]) { + case 'c': + RtsFlags.GcFlags.stkChunkSize = + decodeSize(rts_argv[arg], 3, sizeof(W_), HS_WORD_MAX) / sizeof(W_); + break; + case 'b': + RtsFlags.GcFlags.stkChunkBufferSize = + decodeSize(rts_argv[arg], 3, sizeof(W_), HS_WORD_MAX) / sizeof(W_); + break; + case 'i': + RtsFlags.GcFlags.initialStkSize = + decodeSize(rts_argv[arg], 3, sizeof(W_), HS_WORD_MAX) / sizeof(W_); + break; + default: RtsFlags.GcFlags.initialStkSize = - decodeSize(rts_argv[arg], 2, 1, HS_WORD_MAX) / sizeof(W_); + decodeSize(rts_argv[arg], 2, sizeof(W_), HS_WORD_MAX) / sizeof(W_); break; + } + break; - case 'M': + case 'M': RtsFlags.GcFlags.maxHeapSize = decodeSize(rts_argv[arg], 2, BLOCK_SIZE, HS_WORD_MAX) / BLOCK_SIZE; /* user give size in *bytes* but "maxHeapSize" is in *blocks* */ @@ -1203,6 +1223,12 @@ error = rtsTrue; RtsFlags.ProfFlags.profileIntervalTicks = 0; } + if (RtsFlags.GcFlags.stkChunkBufferSize > + RtsFlags.GcFlags.stkChunkSize / 2) { + errorBelch("stack chunk buffer size (-kb) must be less than 50%% of the stack chunk size (-kc)"); + error = rtsTrue; + } + if (error) { const char **p; diff --git a/rts/Schedule.c b/rts/Schedule.c index 0b1dec4085..c115d2bde6 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -140,9 +140,7 @@ static void scheduleActivateSpark(Capability *cap); #endif static void schedulePostRunThread(Capability *cap, StgTSO *t); static rtsBool 
scheduleHandleHeapOverflow( Capability *cap, StgTSO *t ); -static void scheduleHandleStackOverflow( Capability *cap, Task *task, - StgTSO *t); -static rtsBool scheduleHandleYield( Capability *cap, StgTSO *t, +static rtsBool scheduleHandleYield( Capability *cap, StgTSO *t, nat prev_what_next ); static void scheduleHandleThreadBlocked( StgTSO *t ); static rtsBool scheduleHandleThreadFinished( Capability *cap, Task *task, @@ -151,9 +149,6 @@ static rtsBool scheduleNeedHeapProfile(rtsBool ready_to_gc); static Capability *scheduleDoGC(Capability *cap, Task *task, rtsBool force_major); -static StgTSO *threadStackOverflow(Capability *cap, StgTSO *tso); -static StgTSO *threadStackUnderflow(Capability *cap, Task *task, StgTSO *tso); - static void deleteThread (Capability *cap, StgTSO *tso); static void deleteAllThreads (Capability *cap); @@ -426,6 +421,7 @@ run_thread: cap->in_haskell = rtsTrue; dirty_TSO(cap,t); + dirty_STACK(cap,t->stackobj); #if defined(THREADED_RTS) if (recent_activity == ACTIVITY_DONE_GC) { @@ -503,10 +499,6 @@ run_thread: schedulePostRunThread(cap,t); - if (ret != StackOverflow) { - t = threadStackUnderflow(cap,task,t); - } - ready_to_gc = rtsFalse; switch (ret) { @@ -515,8 +507,11 @@ run_thread: break; case StackOverflow: - scheduleHandleStackOverflow(cap,task,t); - break; + // just adjust the stack for this thread, then pop it back + // on the run queue. + threadStackOverflow(cap, t); + pushOnRunQueue(cap,t); + break; case ThreadYielding: if (scheduleHandleYield(cap, t, prev_what_next)) { @@ -729,8 +724,7 @@ schedulePushWork(Capability *cap USED_IF_THREADS, for (; t != END_TSO_QUEUE; t = next) { next = t->_link; t->_link = END_TSO_QUEUE; - if (t->what_next == ThreadRelocated - || t->bound == task->incall // don't move my bound thread + if (t->bound == task->incall // don't move my bound thread || tsoLocked(t)) { // don't move a locked thread setTSOLink(cap, prev, t); setTSOPrev(cap, t, prev); @@ -1098,30 +1092,6 @@ scheduleHandleHeapOverflow( Capability *cap, StgTSO *t ) } /* ----------------------------------------------------------------------------- - * Handle a thread that returned to the scheduler with ThreadStackOverflow - * -------------------------------------------------------------------------- */ - -static void -scheduleHandleStackOverflow (Capability *cap, Task *task, StgTSO *t) -{ - /* just adjust the stack for this thread, then pop it back - * on the run queue. - */ - { - /* enlarge the stack */ - StgTSO *new_t = threadStackOverflow(cap, t); - - /* The TSO attached to this Task may have moved, so update the - * pointer to it. 
- */ - if (task->incall->tso == t) { - task->incall->tso = new_t; - } - pushOnRunQueue(cap,new_t); - } -} - -/* ----------------------------------------------------------------------------- * Handle a thread that returned to the scheduler with ThreadYielding * -------------------------------------------------------------------------- */ @@ -1241,8 +1211,8 @@ scheduleHandleThreadFinished (Capability *cap STG_UNUSED, Task *task, StgTSO *t) if (t->what_next == ThreadComplete) { if (task->incall->ret) { - // NOTE: return val is tso->sp[1] (see StgStartup.hc) - *(task->incall->ret) = (StgClosure *)task->incall->tso->sp[1]; + // NOTE: return val is stack->sp[1] (see StgStartup.hc) + *(task->incall->ret) = (StgClosure *)task->incall->tso->stackobj->sp[1]; } task->incall->stat = Success; } else { @@ -1578,10 +1548,7 @@ forkProcess(HsStablePtr *entry for (g = 0; g < RtsFlags.GcFlags.generations; g++) { for (t = generations[g].threads; t != END_TSO_QUEUE; t = next) { - if (t->what_next == ThreadRelocated) { - next = t->_link; - } else { - next = t->global_link; + next = t->global_link; // don't allow threads to catch the ThreadKilled // exception, but we do want to raiseAsync() because these // threads may be evaluating thunks that we need later. @@ -1593,7 +1560,6 @@ forkProcess(HsStablePtr *entry // won't get a chance to exit in the usual way (see // also scheduleHandleThreadFinished). t->bound = NULL; - } } } @@ -1661,12 +1627,8 @@ deleteAllThreads ( Capability *cap ) debugTrace(DEBUG_sched,"deleting all threads"); for (g = 0; g < RtsFlags.GcFlags.generations; g++) { for (t = generations[g].threads; t != END_TSO_QUEUE; t = next) { - if (t->what_next == ThreadRelocated) { - next = t->_link; - } else { next = t->global_link; deleteThread(cap,t); - } } } @@ -1850,6 +1812,7 @@ resumeThread (void *task_) /* We might have GC'd, mark the TSO dirty again */ dirty_TSO(cap,tso); + dirty_STACK(cap,tso->stackobj); IF_DEBUG(sanity, checkTSO(tso)); @@ -2108,189 +2071,6 @@ performMajorGC(void) performGC_(rtsTrue); } -/* ----------------------------------------------------------------------------- - Stack overflow - - If the thread has reached its maximum stack size, then raise the - StackOverflow exception in the offending thread. Otherwise - relocate the TSO into a larger chunk of memory and adjust its stack - size appropriately. - -------------------------------------------------------------------------- */ - -static StgTSO * -threadStackOverflow(Capability *cap, StgTSO *tso) -{ - nat new_stack_size, stack_words; - lnat new_tso_size; - StgPtr new_sp; - StgTSO *dest; - - IF_DEBUG(sanity,checkTSO(tso)); - - if (tso->stack_size >= tso->max_stack_size - && !(tso->flags & TSO_BLOCKEX)) { - // NB. never raise a StackOverflow exception if the thread is - // inside Control.Exceptino.block. It is impractical to protect - // against stack overflow exceptions, since virtually anything - // can raise one (even 'catch'), so this is the only sensible - // thing to do here. See bug #767. - // - - if (tso->flags & TSO_SQUEEZED) { - return tso; - } - // #3677: In a stack overflow situation, stack squeezing may - // reduce the stack size, but we don't know whether it has been - // reduced enough for the stack check to succeed if we try - // again. Fortunately stack squeezing is idempotent, so all we - // need to do is record whether *any* squeezing happened. If we - // are at the stack's absolute -K limit, and stack squeezing - // happened, then we try running the thread again. 
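A note on the new flag syntax: the -k handling added in rts/RtsFlags.c above dispatches on the character that follows "-k", and startup now rejects a chunk buffer (-kb) of half a chunk (-kc) or more. The following self-contained toy reproduces that dispatch and check; decode_size is a simplified stand-in for the RTS's decodeSize(), and nothing here is real RTS code:

    #include <stdio.h>
    #include <stdlib.h>

    /* simplified stand-in for the RTS's decodeSize(): parse "4k", "2m", ...
       and convert bytes to words */
    static size_t decode_size(const char *s)
    {
        char *end;
        double n = strtod(s, &end);
        if (*end == 'k' || *end == 'K') n *= 1024;
        if (*end == 'm' || *end == 'M') n *= 1024 * 1024;
        if (*end == 'g' || *end == 'G') n *= 1024.0 * 1024 * 1024;
        return (size_t)n / sizeof(size_t);
    }

    int main(void)
    {
        size_t initial = 1024 / sizeof(size_t);          /* -ki, default 1k  */
        size_t chunk   = (32 * 1024) / sizeof(size_t);   /* -kc, default 32k */
        size_t buffer  = (1 * 1024) / sizeof(size_t);    /* -kb, default 1k  */

        const char *args[] = { "-ki4k", "-kc64k", "-kb2k" };
        for (int i = 0; i < 3; i++) {
            const char *arg = args[i];
            switch (arg[2]) {             /* the character after "-k" */
            case 'c': chunk   = decode_size(arg + 3); break;
            case 'b': buffer  = decode_size(arg + 3); break;
            case 'i': initial = decode_size(arg + 3); break;
            default:  initial = decode_size(arg + 2); break;  /* plain -k<size> */
            }
        }

        /* same consistency check as the patch: a buffer of half a chunk or
           more could make overflow/underflow ping-pong */
        if (buffer > chunk / 2) {
            fprintf(stderr, "-kb must be less than 50%% of -kc\n");
            return 1;
        }
        printf("initial=%zu chunk=%zu buffer=%zu (words)\n", initial, chunk, buffer);
        return 0;
    }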
The - // TSO_SQUEEZED flag is set by threadPaused() to tell us whether - // squeezing happened or not. - - debugTrace(DEBUG_gc, - "threadStackOverflow of TSO %ld (%p): stack too large (now %ld; max is %ld)", - (long)tso->id, tso, (long)tso->stack_size, (long)tso->max_stack_size); - IF_DEBUG(gc, - /* If we're debugging, just print out the top of the stack */ - printStackChunk(tso->sp, stg_min(tso->stack+tso->stack_size, - tso->sp+64))); - - // Send this thread the StackOverflow exception - throwToSingleThreaded(cap, tso, (StgClosure *)stackOverflow_closure); - return tso; - } - - - // We also want to avoid enlarging the stack if squeezing has - // already released some of it. However, we don't want to get into - // a pathalogical situation where a thread has a nearly full stack - // (near its current limit, but not near the absolute -K limit), - // keeps allocating a little bit, squeezing removes a little bit, - // and then it runs again. So to avoid this, if we squeezed *and* - // there is still less than BLOCK_SIZE_W words free, then we enlarge - // the stack anyway. - if ((tso->flags & TSO_SQUEEZED) && - ((W_)(tso->sp - tso->stack) >= BLOCK_SIZE_W)) { - return tso; - } - - /* Try to double the current stack size. If that takes us over the - * maximum stack size for this thread, then use the maximum instead - * (that is, unless we're already at or over the max size and we - * can't raise the StackOverflow exception (see above), in which - * case just double the size). Finally round up so the TSO ends up as - * a whole number of blocks. - */ - if (tso->stack_size >= tso->max_stack_size) { - new_stack_size = tso->stack_size * 2; - } else { - new_stack_size = stg_min(tso->stack_size * 2, tso->max_stack_size); - } - new_tso_size = (lnat)BLOCK_ROUND_UP(new_stack_size * sizeof(W_) + - TSO_STRUCT_SIZE)/sizeof(W_); - new_tso_size = round_to_mblocks(new_tso_size); /* Be MBLOCK-friendly */ - new_stack_size = new_tso_size - TSO_STRUCT_SIZEW; - - debugTrace(DEBUG_sched, - "increasing stack size from %ld words to %d.", - (long)tso->stack_size, new_stack_size); - - dest = (StgTSO *)allocate(cap,new_tso_size); - TICK_ALLOC_TSO(new_stack_size,0); - - /* copy the TSO block and the old stack into the new area */ - memcpy(dest,tso,TSO_STRUCT_SIZE); - stack_words = tso->stack + tso->stack_size - tso->sp; - new_sp = (P_)dest + new_tso_size - stack_words; - memcpy(new_sp, tso->sp, stack_words * sizeof(W_)); - - /* relocate the stack pointers... */ - dest->sp = new_sp; - dest->stack_size = new_stack_size; - - /* Mark the old TSO as relocated. We have to check for relocated - * TSOs in the garbage collector and any primops that deal with TSOs. - * - * It's important to set the sp value to just beyond the end - * of the stack, so we don't attempt to scavenge any part of the - * dead TSO's stack. 
- */ - setTSOLink(cap,tso,dest); - write_barrier(); // other threads seeing ThreadRelocated will look at _link - tso->what_next = ThreadRelocated; - tso->sp = (P_)&(tso->stack[tso->stack_size]); - tso->why_blocked = NotBlocked; - - IF_DEBUG(sanity,checkTSO(dest)); -#if 0 - IF_DEBUG(scheduler,printTSO(dest)); -#endif - - return dest; -} - -static StgTSO * -threadStackUnderflow (Capability *cap, Task *task, StgTSO *tso) -{ - bdescr *bd, *new_bd; - lnat free_w, tso_size_w; - StgTSO *new_tso; - - tso_size_w = tso_sizeW(tso); - - if (tso_size_w < MBLOCK_SIZE_W || - // TSO is less than 2 mblocks (since the first mblock is - // shorter than MBLOCK_SIZE_W) - (tso_size_w - BLOCKS_PER_MBLOCK*BLOCK_SIZE_W) % MBLOCK_SIZE_W != 0 || - // or TSO is not a whole number of megablocks (ensuring - // precondition of splitLargeBlock() below) - (tso_size_w <= round_up_to_mblocks(RtsFlags.GcFlags.initialStkSize)) || - // or TSO is smaller than the minimum stack size (rounded up) - (nat)(tso->stack + tso->stack_size - tso->sp) > tso->stack_size / 4) - // or stack is using more than 1/4 of the available space - { - // then do nothing - return tso; - } - - // this is the number of words we'll free - free_w = round_to_mblocks(tso_size_w/2); - - bd = Bdescr((StgPtr)tso); - new_bd = splitLargeBlock(bd, free_w / BLOCK_SIZE_W); - bd->free = bd->start + TSO_STRUCT_SIZEW; - - new_tso = (StgTSO *)new_bd->start; - memcpy(new_tso,tso,TSO_STRUCT_SIZE); - new_tso->stack_size = new_bd->free - new_tso->stack; - - // The original TSO was dirty and probably on the mutable - // list. The new TSO is not yet on the mutable list, so we better - // put it there. - new_tso->dirty = 0; - new_tso->flags &= ~TSO_LINK_DIRTY; - dirty_TSO(cap, new_tso); - - debugTrace(DEBUG_sched, "thread %ld: reducing TSO size from %lu words to %lu", - (long)tso->id, tso_size_w, tso_sizeW(new_tso)); - - tso->_link = new_tso; // no write barrier reqd: same generation - write_barrier(); // other threads seeing ThreadRelocated will look at _link - tso->what_next = ThreadRelocated; - - // The TSO attached to this Task may have moved, so update the - // pointer to it. - if (task->incall->tso == tso) { - task->incall->tso = new_tso; - } - - IF_DEBUG(sanity,checkTSO(new_tso)); - - return new_tso; -} - /* --------------------------------------------------------------------------- Interrupt execution - usually called inside a signal handler so it mustn't do anything fancy. @@ -2337,7 +2117,7 @@ void wakeUpRts(void) exception. -------------------------------------------------------------------------- */ -static void +static void deleteThread (Capability *cap STG_UNUSED, StgTSO *tso) { // NOTE: must only be called on a TSO that we have exclusive @@ -2347,12 +2127,12 @@ deleteThread (Capability *cap STG_UNUSED, StgTSO *tso) if (tso->why_blocked != BlockedOnCCall && tso->why_blocked != BlockedOnCCall_Interruptible) { - throwToSingleThreaded(tso->cap,tso,NULL); + throwToSingleThreaded(tso->cap,tso,NULL); } } #ifdef FORKPROCESS_PRIMOP_SUPPORTED -static void +static void deleteThread_(Capability *cap, StgTSO *tso) { // for forkProcess only: // like deleteThread(), but we delete threads in foreign calls, too. @@ -2406,7 +2186,7 @@ raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception) // we update any closures pointed to from update frames with the // raise closure that we just built. 
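For contrast with the chunked scheme this patch introduces, the threadStackOverflow deleted above grew a thread by relocating the whole TSO, roughly doubling the stack each time. A toy model of that sizing policy; the constants are invented, and the real code also rounded to whole megablocks:

    #include <stdio.h>

    #define WORD        sizeof(unsigned long)
    #define BLOCK_BYTES 4096ul   /* invented block size */
    #define TSO_WORDS   16ul     /* stand-in for TSO_STRUCT_SIZEW */

    static unsigned long min_ul(unsigned long a, unsigned long b)
    { return a < b ? a : b; }

    /* old policy: double the stack, clamp to the -K maximum unless we are
       already past it, then round TSO + stack up to whole blocks */
    static unsigned long grow(unsigned long cur_words, unsigned long max_words)
    {
        unsigned long ns = (cur_words >= max_words)
                             ? cur_words * 2
                             : min_ul(cur_words * 2, max_words);
        unsigned long bytes   = (ns + TSO_WORDS) * WORD;
        unsigned long rounded = ((bytes + BLOCK_BYTES - 1) / BLOCK_BYTES) * BLOCK_BYTES;
        return rounded / WORD - TSO_WORDS;
    }

    int main(void)
    {
        unsigned long stack = 1024 / WORD;                /* 1k initial stack */
        unsigned long max   = (8ul * 1024 * 1024) / WORD; /* -K8m */
        for (int i = 0; i < 4; i++) {
            stack = grow(stack, max);
            printf("relocate and grow -> %lu words\n", stack);
        }
        return 0;
    }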
// - p = tso->sp; + p = tso->stackobj->sp; while(1) { info = get_ret_itbl((StgClosure *)p); next = p + stack_frame_sizeW((StgClosure *)p); @@ -2427,20 +2207,25 @@ raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception) case ATOMICALLY_FRAME: debugTrace(DEBUG_stm, "found ATOMICALLY_FRAME at %p", p); - tso->sp = p; + tso->stackobj->sp = p; return ATOMICALLY_FRAME; case CATCH_FRAME: - tso->sp = p; + tso->stackobj->sp = p; return CATCH_FRAME; case CATCH_STM_FRAME: debugTrace(DEBUG_stm, "found CATCH_STM_FRAME at %p", p); - tso->sp = p; + tso->stackobj->sp = p; return CATCH_STM_FRAME; - case STOP_FRAME: - tso->sp = p; + case UNDERFLOW_FRAME: + threadStackUnderflow(cap,tso); + p = tso->stackobj->sp; + continue; + + case STOP_FRAME: + tso->stackobj->sp = p; return STOP_FRAME; case CATCH_RETRY_FRAME: @@ -2470,12 +2255,12 @@ raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception) -------------------------------------------------------------------------- */ StgWord -findRetryFrameHelper (StgTSO *tso) +findRetryFrameHelper (Capability *cap, StgTSO *tso) { StgPtr p, next; StgRetInfoTable *info; - p = tso -> sp; + p = tso->stackobj->sp; while (1) { info = get_ret_itbl((StgClosure *)p); next = p + stack_frame_sizeW((StgClosure *)p); @@ -2484,13 +2269,13 @@ findRetryFrameHelper (StgTSO *tso) case ATOMICALLY_FRAME: debugTrace(DEBUG_stm, "found ATOMICALLY_FRAME at %p during retry", p); - tso->sp = p; + tso->stackobj->sp = p; return ATOMICALLY_FRAME; case CATCH_RETRY_FRAME: debugTrace(DEBUG_stm, "found CATCH_RETRY_FRAME at %p during retrry", p); - tso->sp = p; + tso->stackobj->sp = p; return CATCH_RETRY_FRAME; case CATCH_STM_FRAME: { @@ -2499,13 +2284,17 @@ findRetryFrameHelper (StgTSO *tso) debugTrace(DEBUG_stm, "found CATCH_STM_FRAME at %p during retry", p); debugTrace(DEBUG_stm, "trec=%p outer=%p", trec, outer); - stmAbortTransaction(tso -> cap, trec); - stmFreeAbortedTRec(tso -> cap, trec); + stmAbortTransaction(cap, trec); + stmFreeAbortedTRec(cap, trec); tso -> trec = outer; p = next; continue; } + case UNDERFLOW_FRAME: + threadStackUnderflow(cap,tso); + p = tso->stackobj->sp; + continue; default: ASSERT(info->i.type != CATCH_FRAME); diff --git a/rts/Schedule.h b/rts/Schedule.h index a00d81af4f..edba8f5f16 100644 --- a/rts/Schedule.h +++ b/rts/Schedule.h @@ -44,7 +44,7 @@ void wakeUpRts(void); StgWord raiseExceptionHelper (StgRegTable *reg, StgTSO *tso, StgClosure *exception); /* findRetryFrameHelper */ -StgWord findRetryFrameHelper (StgTSO *tso); +StgWord findRetryFrameHelper (Capability *cap, StgTSO *tso); /* Entry point for a new worker */ void scheduleWorker (Capability *cap, Task *task); diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm index c981cbec18..b4a037d5d6 100644 --- a/rts/StgMiscClosures.cmm +++ b/rts/StgMiscClosures.cmm @@ -19,6 +19,23 @@ import EnterCriticalSection; import LeaveCriticalSection; /* ---------------------------------------------------------------------------- + Stack underflow + ------------------------------------------------------------------------- */ + +INFO_TABLE_RET (stg_stack_underflow_frame, UNDERFLOW_FRAME, P_ unused) +{ + W_ new_tso; + W_ ret_off; + + SAVE_THREAD_STATE(); + ("ptr" ret_off) = foreign "C" threadStackUnderflow(MyCapability(), + CurrentTSO); + LOAD_THREAD_STATE(); + + jump %ENTRY_CODE(Sp(ret_off)); +} + +/* ---------------------------------------------------------------------------- Support for the bytecode interpreter. 
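raiseExceptionHelper and findRetryFrameHelper above now treat UNDERFLOW_FRAME as "keep walking in the next chunk", which is also what stg_stack_underflow_frame arranges at runtime. A minimal sketch of that traversal over a chain of chunks; the frame encoding and types are invented for illustration:

    #include <stdio.h>

    /* invented one-word frame tags for the sketch */
    enum { RET_SMALL, UNDERFLOW_FRAME, STOP_FRAME };

    typedef struct Chunk {
        int frames[8];        /* each "frame" is one word: its tag */
        int sp;               /* index of the topmost frame */
        struct Chunk *next;   /* what an underflow frame's next_chunk points to */
    } Chunk;

    /* visit every frame of a chunked stack, hopping chunks at underflow frames */
    static void walk(Chunk *c)
    {
        int i = c->sp;
        for (;;) {
            int tag = c->frames[i];
            if (tag == UNDERFLOW_FRAME) {
                c = c->next;          /* continue in the older chunk */
                i = c->sp;
                continue;
            }
            printf("frame tag %d\n", tag);
            if (tag == STOP_FRAME) return;  /* bottom of the whole stack */
            i++;                            /* next (older) frame */
        }
    }

    int main(void)
    {
        Chunk old_chunk = { { RET_SMALL, RET_SMALL, STOP_FRAME }, 0, NULL };
        Chunk top_chunk = { { RET_SMALL, UNDERFLOW_FRAME }, 0, &old_chunk };
        walk(&top_chunk);
        return 0;
    }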
------------------------------------------------------------------------- */ @@ -353,6 +370,9 @@ loop: INFO_TABLE(stg_TSO, 0,0,TSO, "TSO", "TSO") { foreign "C" barf("TSO object entered!") never returns; } +INFO_TABLE(stg_STACK, 0,0, STACK, "STACK", "STACK") +{ foreign "C" barf("STACK object entered!") never returns; } + /* ---------------------------------------------------------------------------- Weak pointers diff --git a/rts/ThreadPaused.c b/rts/ThreadPaused.c index 94a5a15f46..aeae1d4128 100644 --- a/rts/ThreadPaused.c +++ b/rts/ThreadPaused.c @@ -44,13 +44,13 @@ stackSqueeze(Capability *cap, StgTSO *tso, StgPtr bottom) // contains two values: the size of the gap, and the distance // to the next gap (or the stack top). - frame = tso->sp; + frame = tso->stackobj->sp; ASSERT(frame < bottom); prev_was_update_frame = rtsFalse; current_gap_size = 0; - gap = (struct stack_gap *) (tso->sp - sizeofW(StgUpdateFrame)); + gap = (struct stack_gap *) (frame - sizeofW(StgUpdateFrame)); while (frame <= bottom) { @@ -150,7 +150,7 @@ stackSqueeze(Capability *cap, StgTSO *tso, StgPtr bottom) next_gap_start = (StgWord8*)gap + sizeof(StgUpdateFrame); sp = next_gap_start; - while ((StgPtr)gap > tso->sp) { + while ((StgPtr)gap > tso->stackobj->sp) { // we're working in *bytes* now... gap_start = next_gap_start; @@ -164,7 +164,7 @@ stackSqueeze(Capability *cap, StgTSO *tso, StgPtr bottom) memmove(sp, next_gap_start, chunk_size); } - tso->sp = (StgPtr)sp; + tso->stackobj->sp = (StgPtr)sp; } } @@ -201,27 +201,27 @@ threadPaused(Capability *cap, StgTSO *tso) // blackholing, or eager blackholing consistently. See Note // [upd-black-hole] in sm/Scav.c. - stack_end = &tso->stack[tso->stack_size]; + stack_end = tso->stackobj->stack + tso->stackobj->stack_size; - frame = (StgClosure *)tso->sp; + frame = (StgClosure *)tso->stackobj->sp; - while (1) { - // If we've already marked this frame, then stop here. - if (frame->header.info == (StgInfoTable *)&stg_marked_upd_frame_info) { - if (prev_was_update_frame) { - words_to_squeeze += sizeofW(StgUpdateFrame); - weight += weight_pending; - weight_pending = 0; - } - goto end; - } - - info = get_ret_itbl(frame); + while ((P_)frame < stack_end) { + info = get_ret_itbl(frame); switch (info->i.type) { - + case UPDATE_FRAME: + // If we've already marked this frame, then stop here. + if (frame->header.info == (StgInfoTable *)&stg_marked_upd_frame_info) { + if (prev_was_update_frame) { + words_to_squeeze += sizeofW(StgUpdateFrame); + weight += weight_pending; + weight_pending = 0; + } + goto end; + } + SET_INFO(frame, (StgInfoTable *)&stg_marked_upd_frame_info); bh = ((StgUpdateFrame *)frame)->updatee; @@ -235,7 +235,7 @@ threadPaused(Capability *cap, StgTSO *tso) { debugTrace(DEBUG_squeeze, "suspending duplicate work: %ld words of stack", - (long)((StgPtr)frame - tso->sp)); + (long)((StgPtr)frame - tso->stackobj->sp)); // If this closure is already an indirection, then // suspend the computation up to this point. @@ -245,25 +245,22 @@ threadPaused(Capability *cap, StgTSO *tso) // Now drop the update frame, and arrange to return // the value to the frame underneath: - tso->sp = (StgPtr)frame + sizeofW(StgUpdateFrame) - 2; - tso->sp[1] = (StgWord)bh; + tso->stackobj->sp = (StgPtr)frame + sizeofW(StgUpdateFrame) - 2; + tso->stackobj->sp[1] = (StgWord)bh; ASSERT(bh->header.info != &stg_TSO_info); - tso->sp[0] = (W_)&stg_enter_info; + tso->stackobj->sp[0] = (W_)&stg_enter_info; // And continue with threadPaused; there might be // yet more computation to suspend. 
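The squeezing pass in rts/ThreadPaused.c above accumulates words_to_squeeze over runs of adjacent update frames before deciding whether squeezing pays off. Here is the accounting in isolation, as a toy with one word per frame and an invented update-frame size:

    #include <stdio.h>

    enum { UPDATE_FRAME, OTHER_FRAME };

    int main(void)
    {
        /* a stack, top first: each adjacent pair of update frames can be
           squeezed into one, saving one update frame's worth of words */
        int frames[] = { UPDATE_FRAME, UPDATE_FRAME, OTHER_FRAME,
                         UPDATE_FRAME, UPDATE_FRAME, UPDATE_FRAME };
        int n = sizeof(frames) / sizeof(frames[0]);

        int prev_was_update = 0, words_to_squeeze = 0;
        const int upd_frame_words = 2;  /* stand-in for sizeofW(StgUpdateFrame) */

        for (int i = 0; i < n; i++) {
            if (frames[i] == UPDATE_FRAME) {
                if (prev_was_update) words_to_squeeze += upd_frame_words;
                prev_was_update = 1;
            } else {
                prev_was_update = 0;
            }
        }
        printf("would squeeze %d words\n", words_to_squeeze);
        return 0;
    }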
- frame = (StgClosure *)(tso->sp + 2); + frame = (StgClosure *)(tso->stackobj->sp + 2); prev_was_update_frame = rtsFalse; continue; } + // zero out the slop so that the sanity checker can tell // where the next closure is. - DEBUG_FILL_SLOP(bh); - - // @LDV profiling - // We pretend that bh is now dead. - LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC((StgClosure *)bh); + OVERWRITING_CLOSURE(bh); // an EAGER_BLACKHOLE or CAF_BLACKHOLE gets turned into a // BLACKHOLE here. @@ -301,7 +298,8 @@ threadPaused(Capability *cap, StgTSO *tso) prev_was_update_frame = rtsTrue; break; - case STOP_FRAME: + case UNDERFLOW_FRAME: + case STOP_FRAME: goto end; // normal stack frames; do nothing except advance the pointer diff --git a/rts/Threads.c b/rts/Threads.c index f6b1bac1a7..d6fe0e7697 100644 --- a/rts/Threads.c +++ b/rts/Threads.c @@ -18,8 +18,14 @@ #include "ThreadLabels.h" #include "Updates.h" #include "Messages.h" +#include "RaiseAsync.h" +#include "Prelude.h" +#include "Printer.h" +#include "sm/Sanity.h" #include "sm/Storage.h" +#include <string.h> + /* Next thread ID to allocate. * LOCK: sched_mutex */ @@ -54,57 +60,67 @@ StgTSO * createThread(Capability *cap, nat size) { StgTSO *tso; + StgStack *stack; nat stack_size; /* sched_mutex is *not* required */ - /* First check whether we should create a thread at all */ - - // ToDo: check whether size = stack_size - TSO_STRUCT_SIZEW - /* catch ridiculously small stack sizes */ - if (size < MIN_STACK_WORDS + TSO_STRUCT_SIZEW) { - size = MIN_STACK_WORDS + TSO_STRUCT_SIZEW; + if (size < MIN_STACK_WORDS + sizeofW(StgStack)) { + size = MIN_STACK_WORDS + sizeofW(StgStack); } - size = round_to_mblocks(size); - tso = (StgTSO *)allocate(cap, size); - - stack_size = size - TSO_STRUCT_SIZEW; - TICK_ALLOC_TSO(stack_size, 0); - + /* The size argument we are given includes all the per-thread + * overheads: + * + * - The TSO structure + * - The STACK header + * + * This is so that we can use a nice round power of 2 for the + * default stack size (e.g. 1k), and if we're allocating lots of + * threads back-to-back they'll fit nicely in a block. It's a bit + * of a benchmark hack, but it doesn't do any harm. 
+ */ + stack_size = round_to_mblocks(size - sizeofW(StgTSO)); + stack = (StgStack *)allocate(cap, stack_size); + TICK_ALLOC_STACK(stack_size); + SET_HDR(stack, &stg_STACK_info, CCS_SYSTEM); + stack->stack_size = stack_size - sizeofW(StgStack); + stack->sp = stack->stack + stack->stack_size; + stack->dirty = 1; + + tso = (StgTSO *)allocate(cap, sizeofW(StgTSO)); + TICK_ALLOC_TSO(); SET_HDR(tso, &stg_TSO_info, CCS_SYSTEM); // Always start with the compiled code evaluator tso->what_next = ThreadRunGHC; - tso->why_blocked = NotBlocked; tso->block_info.closure = (StgClosure *)END_TSO_QUEUE; tso->blocked_exceptions = END_BLOCKED_EXCEPTIONS_QUEUE; tso->bq = (StgBlockingQueue *)END_TSO_QUEUE; tso->flags = 0; tso->dirty = 1; - + tso->_link = END_TSO_QUEUE; + tso->saved_errno = 0; tso->bound = NULL; tso->cap = cap; - tso->stack_size = stack_size; - tso->max_stack_size = round_to_mblocks(RtsFlags.GcFlags.maxStkSize) - - TSO_STRUCT_SIZEW; - tso->sp = (P_)&(tso->stack) + stack_size; + tso->stackobj = stack; + tso->tot_stack_size = stack->stack_size; tso->trec = NO_TREC; - + #ifdef PROFILING tso->prof.CCCS = CCS_MAIN; #endif - /* put a stop frame on the stack */ - tso->sp -= sizeofW(StgStopFrame); - SET_HDR((StgClosure*)tso->sp,(StgInfoTable *)&stg_stop_thread_info,CCS_SYSTEM); - tso->_link = END_TSO_QUEUE; - + // put a stop frame on the stack + stack->sp -= sizeofW(StgStopFrame); + SET_HDR((StgClosure*)stack->sp, + (StgInfoTable *)&stg_stop_thread_info,CCS_SYSTEM); + /* Link the new thread on the global thread list. */ ACQUIRE_LOCK(&sched_mutex); @@ -220,12 +236,6 @@ removeThreadFromDeQueue (Capability *cap, void tryWakeupThread (Capability *cap, StgTSO *tso) { - tryWakeupThread_(cap, deRefTSO(tso)); -} - -void -tryWakeupThread_ (Capability *cap, StgTSO *tso) -{ traceEventThreadWakeup (cap, tso, tso->cap->no); #ifdef THREADED_RTS @@ -267,8 +277,8 @@ tryWakeupThread_ (Capability *cap, StgTSO *tso) } // remove the block frame from the stack - ASSERT(tso->sp[0] == (StgWord)&stg_block_throwto_info); - tso->sp += 3; + ASSERT(tso->stackobj->sp[0] == (StgWord)&stg_block_throwto_info); + tso->stackobj->sp += 3; goto unblock; } @@ -416,7 +426,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val) i = v->header.info; if (i == &stg_TSO_info) { - owner = deRefTSO((StgTSO*)v); + owner = (StgTSO*)v; if (owner != tso) { checkBlockingQueues(cap, tso); } @@ -429,7 +439,7 @@ updateThunk (Capability *cap, StgTSO *tso, StgClosure *thunk, StgClosure *val) return; } - owner = deRefTSO(((StgBlockingQueue*)v)->owner); + owner = ((StgBlockingQueue*)v)->owner; if (owner != tso) { checkBlockingQueues(cap, tso); @@ -466,6 +476,202 @@ isThreadBound(StgTSO* tso USED_IF_THREADS) return rtsFalse; } +/* ----------------------------------------------------------------------------- + Stack overflow + + If the thread has reached its maximum stack size, then raise the + StackOverflow exception in the offending thread. Otherwise + allocate a new stack chunk, link it onto the thread's chain of + stack chunks, and continue execution there. + -------------------------------------------------------------------------- */ + +void +threadStackOverflow (Capability *cap, StgTSO *tso) +{ + StgStack *new_stack, *old_stack; + StgUnderflowFrame *frame; + + IF_DEBUG(sanity,checkTSO(tso)); + + if (tso->tot_stack_size >= RtsFlags.GcFlags.maxStkSize + && !(tso->flags & TSO_BLOCKEX)) { + // NB. never raise a StackOverflow exception if the thread is + // inside Control.Exception.
It is impractical to protect + // against stack overflow exceptions, since virtually anything + // can raise one (even 'catch'), so this is the only sensible + // thing to do here. See bug #767. + // + + if (tso->flags & TSO_SQUEEZED) { + return; + } + // #3677: In a stack overflow situation, stack squeezing may + // reduce the stack size, but we don't know whether it has been + // reduced enough for the stack check to succeed if we try + // again. Fortunately stack squeezing is idempotent, so all we + // need to do is record whether *any* squeezing happened. If we + // are at the stack's absolute -K limit, and stack squeezing + // happened, then we try running the thread again. The + // TSO_SQUEEZED flag is set by threadPaused() to tell us whether + // squeezing happened or not. + + debugTrace(DEBUG_gc, + "threadStackOverflow of TSO %ld (%p): stack too large (now %ld; max is %ld)", + (long)tso->id, tso, (long)tso->stackobj->stack_size, + RtsFlags.GcFlags.maxStkSize); + IF_DEBUG(gc, + /* If we're debugging, just print out the top of the stack */ + printStackChunk(tso->stackobj->sp, + stg_min(tso->stackobj->stack + tso->stackobj->stack_size, + tso->stackobj->sp+64))); + + // Send this thread the StackOverflow exception + throwToSingleThreaded(cap, tso, (StgClosure *)stackOverflow_closure); + } + + + // We also want to avoid enlarging the stack if squeezing has + // already released some of it. However, we don't want to get into + // a pathalogical situation where a thread has a nearly full stack + // (near its current limit, but not near the absolute -K limit), + // keeps allocating a little bit, squeezing removes a little bit, + // and then it runs again. So to avoid this, if we squeezed *and* + // there is still less than BLOCK_SIZE_W words free, then we enlarge + // the stack anyway. + if ((tso->flags & TSO_SQUEEZED) && + ((W_)(tso->stackobj->sp - tso->stackobj->stack) >= BLOCK_SIZE_W)) { + return; + } + + debugTraceCap(DEBUG_sched, cap, + "allocating new stack chunk of size %d bytes", + RtsFlags.GcFlags.stkChunkSize * sizeof(W_)); + + old_stack = tso->stackobj; + + new_stack = (StgStack*) allocate(cap, RtsFlags.GcFlags.stkChunkSize); + SET_HDR(new_stack, &stg_STACK_info, CCS_SYSTEM); + TICK_ALLOC_STACK(RtsFlags.GcFlags.stkChunkSize); + + new_stack->dirty = 0; // begin clean, we'll mark it dirty below + new_stack->stack_size = RtsFlags.GcFlags.stkChunkSize - sizeofW(StgStack); + new_stack->sp = new_stack->stack + new_stack->stack_size; + + tso->tot_stack_size += new_stack->stack_size; + + new_stack->sp -= sizeofW(StgUnderflowFrame); + frame = (StgUnderflowFrame*)new_stack->sp; + frame->info = &stg_stack_underflow_frame_info; + frame->next_chunk = old_stack; + + { + StgWord *sp; + nat chunk_words, size; + + // find the boundary of the chunk of old stack we're going to + // copy to the new stack. We skip over stack frames until we + // reach the smaller of + // + // * the chunk buffer size (+RTS -kb) + // * the end of the old stack + // + for (sp = old_stack->sp; + sp < stg_min(old_stack->sp + RtsFlags.GcFlags.stkChunkBufferSize, + old_stack->stack + old_stack->stack_size); ) + { + size = stack_frame_sizeW((StgClosure*)sp); + + // if including this frame would exceed the size of the + // new stack (taking into account the underflow frame), + // then stop at the previous frame. 
+ if (sp + size > old_stack->stack + (new_stack->stack_size - + sizeofW(StgUnderflowFrame))) { + break; + } + sp += size; + } + + // copy the stack chunk between old_stack->sp and sp to + // new_stack->sp + (old_stack->sp - sp) + chunk_words = sp - old_stack->sp; + + memcpy(/* dest */ new_stack->sp - chunk_words, + /* source */ old_stack->sp, + /* size */ chunk_words * sizeof(W_)); + + old_stack->sp += chunk_words; + new_stack->sp -= chunk_words; + } + + // if the old stack chunk is now empty, discard it. With the + // default settings, -ki1k -kb1k, this means the first stack chunk + // will be discarded after the first overflow, being replaced by a + // non-moving 32k chunk. + if (old_stack->sp == old_stack->stack + old_stack->stack_size) { + frame->next_chunk = new_stack; + } + + tso->stackobj = new_stack; + + // we're about to run it, better mark it dirty + dirty_STACK(cap, new_stack); + + IF_DEBUG(sanity,checkTSO(tso)); + // IF_DEBUG(scheduler,printTSO(new_tso)); +} + + +/* --------------------------------------------------------------------------- + Stack underflow - called from the stg_stack_underflow_frame + ------------------------------------------------------------------------ */ + +nat // returns offset to the return address +threadStackUnderflow (Capability *cap, StgTSO *tso) +{ + StgStack *new_stack, *old_stack; + StgUnderflowFrame *frame; + nat retvals; + + debugTraceCap(DEBUG_sched, cap, "stack underflow"); + + old_stack = tso->stackobj; + + frame = (StgUnderflowFrame*)(old_stack->stack + old_stack->stack_size + - sizeofW(StgUnderflowFrame)); + ASSERT(frame->info == &stg_stack_underflow_frame_info); + + new_stack = (StgStack*)frame->next_chunk; + tso->stackobj = new_stack; + + retvals = (P_)frame - old_stack->sp; + if (retvals != 0) + { + // we have some return values to copy down to the new stack + if ((new_stack->sp - new_stack->stack) < retvals) + { + barf("threadStackUnderflow: not enough space for return values"); + } + + new_stack->sp -= retvals; + + memcpy(/* dest */ new_stack->sp, + /* src */ old_stack->sp, + /* size */ retvals * sizeof(W_)); + } + + // empty the old stack. The GC may still visit this object + // because it is on the mutable list.
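The boundary scan above advances over whole frames until it reaches either the -kb buffer limit or the end of the old chunk, and it never copies more than the new chunk can hold. An approximate, self-contained rendering of that scan, with invented frame sizes and limits:

    #include <stdio.h>

    int main(void)
    {
        /* sizes (in words) of the frames at the top of the old chunk */
        int frame_sizes[] = { 3, 5, 2, 8, 4 };
        int nframes = sizeof(frame_sizes) / sizeof(frame_sizes[0]);

        int buffer_words = 12;  /* stand-in for RtsFlags.GcFlags.stkChunkBufferSize */
        int room_words   = 20;  /* free space in the new chunk, underflow frame deducted */

        int copied = 0;
        for (int i = 0; i < nframes; i++) {
            if (copied >= buffer_words) break;               /* buffer quota reached */
            if (copied + frame_sizes[i] > room_words) break; /* would overflow new chunk */
            copied += frame_sizes[i];                        /* take the whole frame */
        }
        printf("carry %d words into the new chunk\n", copied);
        return 0;
    }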
old_stack->sp = old_stack->stack + old_stack->stack_size; + + // restore the stack parameters, and update tot_stack_size + tso->tot_stack_size -= old_stack->stack_size; + + // we're about to run it, better mark it dirty + dirty_STACK(cap, new_stack); + + return retvals; +} + /* ---------------------------------------------------------------------------- * Debugging: why is a thread blocked * ------------------------------------------------------------------------- */ @@ -529,10 +735,7 @@ printThreadStatus(StgTSO *t) void *label = lookupThreadLabel(t->id); if (label) debugBelch("[\"%s\"] ",(char *)label); } - if (t->what_next == ThreadRelocated) { - debugBelch("has been relocated...\n"); - } else { - switch (t->what_next) { + switch (t->what_next) { case ThreadKilled: debugBelch("has been killed"); break; @@ -544,11 +747,8 @@ printThreadStatus(StgTSO *t) } if (t->dirty) { debugBelch(" (TSO_DIRTY)"); - } else if (t->flags & TSO_LINK_DIRTY) { - debugBelch(" (TSO_LINK_DIRTY)"); } debugBelch("\n"); - } } void @@ -574,11 +774,7 @@ printAllThreads(void) if (t->why_blocked != NotBlocked) { printThreadStatus(t); } - if (t->what_next == ThreadRelocated) { - next = t->_link; - } else { - next = t->global_link; - } + next = t->global_link; } } } diff --git a/rts/Threads.h b/rts/Threads.h index 776dd93cab..857658a2d0 100644 --- a/rts/Threads.h +++ b/rts/Threads.h @@ -21,9 +21,6 @@ void wakeBlockingQueue (Capability *cap, StgBlockingQueue *bq); void tryWakeupThread (Capability *cap, StgTSO *tso); void migrateThread (Capability *from, StgTSO *tso, Capability *to); -// like tryWakeupThread(), but assumes the TSO is not ThreadRelocated -void tryWakeupThread_ (Capability *cap, StgTSO *tso); - // Wakes up a thread on a Capability (probably a different Capability // from the one held by the current Task). // @@ -41,6 +38,10 @@ rtsBool removeThreadFromDeQueue (Capability *cap, StgTSO **head, StgTSO **tail StgBool isThreadBound (StgTSO* tso); +// Overflow/underflow +void threadStackOverflow (Capability *cap, StgTSO *tso); +nat threadStackUnderflow (Capability *cap, StgTSO *tso); + #ifdef DEBUG void printThreadBlockage (StgTSO *tso); void printThreadStatus (StgTSO *t); diff --git a/rts/Trace.h b/rts/Trace.h index 97d9514751..27de60e4ed 100644 --- a/rts/Trace.h +++ b/rts/Trace.h @@ -265,7 +265,7 @@ void dtraceUserMsgWrapper(Capability *cap, char *msg); INLINE_HEADER void traceEventCreateThread(Capability *cap STG_UNUSED, StgTSO *tso STG_UNUSED) { - traceSchedEvent(cap, EVENT_CREATE_THREAD, tso, tso->stack_size); + traceSchedEvent(cap, EVENT_CREATE_THREAD, tso, tso->stackobj->stack_size); dtraceCreateThread((EventCapNo)cap->no, (EventThreadID)tso->id); } diff --git a/rts/Updates.h b/rts/Updates.h index 2258c988bb..954f02afe1 100644 --- a/rts/Updates.h +++ b/rts/Updates.h @@ -18,101 +18,12 @@ -------------------------------------------------------------------------- */ /* LDV profiling: - * We call LDV_recordDead_FILL_SLOP_DYNAMIC(p1) regardless of the generation in - * which p1 resides. - * - * Note: * After all, we do *NOT* need to call LDV_RECORD_CREATE() for IND * closures because they are inherently used. But, it corrupts * the invariants that every closure keeps its creation time in the profiling * field. So, we call LDV_RECORD_CREATE(). */ -/* In the DEBUG case, we also zero out the slop of the old closure, * so that the sanity checker can tell where the next closure is.
- * - * Two important invariants: we should never try to update a closure - * to point to itself, and the closure being updated should not - * already have been updated (the mutable list will get messed up - * otherwise). - * - * NB. We do *not* do this in THREADED_RTS mode, because when we have the - * possibility of multiple threads entering the same closure, zeroing - * the slop in one of the threads would have a disastrous effect on - * the other (seen in the wild!). - */ -#ifdef CMINUSMINUS - -#define FILL_SLOP(p) \ - W_ inf; \ - W_ sz; \ - W_ i; \ - inf = %GET_STD_INFO(p); \ - if (%INFO_TYPE(inf) != HALF_W_(BLACKHOLE)) { \ - if (%INFO_TYPE(inf) == HALF_W_(THUNK_SELECTOR)) { \ - sz = BYTES_TO_WDS(SIZEOF_StgSelector_NoThunkHdr); \ - } else { \ - if (%INFO_TYPE(inf) == HALF_W_(AP_STACK)) { \ - sz = StgAP_STACK_size(p) + BYTES_TO_WDS(SIZEOF_StgAP_STACK_NoThunkHdr); \ - } else { \ - if (%INFO_TYPE(inf) == HALF_W_(AP)) { \ - sz = TO_W_(StgAP_n_args(p)) + BYTES_TO_WDS(SIZEOF_StgAP_NoThunkHdr); \ - } else { \ - sz = TO_W_(%INFO_PTRS(inf)) + TO_W_(%INFO_NPTRS(inf)); \ - } \ - } \ - } \ - i = 0; \ - for: \ - if (i < sz) { \ - StgThunk_payload(p,i) = 0; \ - i = i + 1; \ - goto for; \ - } \ - } - -#else /* !CMINUSMINUS */ - -INLINE_HEADER void -FILL_SLOP(StgClosure *p) -{ - StgInfoTable *inf = get_itbl(p); - nat i, sz; - - switch (inf->type) { - case BLACKHOLE: - goto no_slop; - // we already filled in the slop when we overwrote the thunk - // with BLACKHOLE, and also an evacuated BLACKHOLE is only the - // size of an IND. - case THUNK_SELECTOR: - sz = sizeofW(StgSelector) - sizeofW(StgThunkHeader); - break; - case AP: - sz = ((StgAP *)p)->n_args + sizeofW(StgAP) - sizeofW(StgThunkHeader); - break; - case AP_STACK: - sz = ((StgAP_STACK *)p)->size + sizeofW(StgAP_STACK) - sizeofW(StgThunkHeader); - break; - default: - sz = inf->layout.payload.ptrs + inf->layout.payload.nptrs; - break; - } - for (i = 0; i < sz; i++) { - ((StgThunk *)p)->payload[i] = 0; - } -no_slop: - ; -} - -#endif /* CMINUSMINUS */ - -#if !defined(DEBUG) || defined(THREADED_RTS) -#define DEBUG_FILL_SLOP(p) /* do nothing */ -#else -#define DEBUG_FILL_SLOP(p) FILL_SLOP(p) -#endif - /* We have two versions of this macro (sadly), one for use in C-- code, * and the other for C. 
* @@ -128,9 +39,8 @@ no_slop: #define updateWithIndirection(p1, p2, and_then) \ W_ bd; \ \ - DEBUG_FILL_SLOP(p1); \ - LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(p1); \ - StgInd_indirectee(p1) = p2; \ + OVERWRITING_CLOSURE(p1); \ + StgInd_indirectee(p1) = p2; \ prim %write_barrier() []; \ SET_INFO(p1, stg_BLACKHOLE_info); \ LDV_RECORD_CREATE(p1); \ @@ -155,8 +65,7 @@ INLINE_HEADER void updateWithIndirection (Capability *cap, ASSERT( (P_)p1 != (P_)p2 ); /* not necessarily true: ASSERT( !closure_IND(p1) ); */ /* occurs in RaiseAsync.c:raiseAsync() */ - DEBUG_FILL_SLOP(p1); - LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC(p1); + OVERWRITING_CLOSURE(p1); ((StgInd *)p1)->indirectee = p2; write_barrier(); SET_INFO(p1, &stg_BLACKHOLE_info); diff --git a/rts/posix/Select.c b/rts/posix/Select.c index 0127b3cef4..3c87fbdc70 100644 --- a/rts/posix/Select.c +++ b/rts/posix/Select.c @@ -63,10 +63,6 @@ wakeUpSleepingThreads(lnat ticks) while (sleeping_queue != END_TSO_QUEUE) { tso = sleeping_queue; - if (tso->what_next == ThreadRelocated) { - sleeping_queue = tso->_link; - continue; - } if (((long)ticks - (long)tso->block_info.target) < 0) { break; } @@ -259,11 +255,7 @@ awaitEvent(rtsBool wait) for(tso = blocked_queue_hd; tso != END_TSO_QUEUE; tso = next) { next = tso->_link; - if (tso->what_next == ThreadRelocated) { - continue; - } - - switch (tso->why_blocked) { + switch (tso->why_blocked) { case BlockedOnRead: ready = unblock_all || FD_ISSET(tso->block_info.fd, &rfd); break; diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c index 75c88326ad..bf0c5e6a3b 100644 --- a/rts/sm/BlockAlloc.c +++ b/rts/sm/BlockAlloc.c @@ -577,48 +577,6 @@ freeChain_lock(bdescr *bd) RELEASE_SM_LOCK; } -// splitBlockGroup(bd,B) splits bd in two. Afterward, bd will have B -// blocks, and a new block descriptor pointing to the remainder is -// returned. -bdescr * -splitBlockGroup (bdescr *bd, nat blocks) -{ - bdescr *new_bd; - - if (bd->blocks <= blocks) { - barf("splitLargeBlock: too small"); - } - - if (bd->blocks > BLOCKS_PER_MBLOCK) { - nat low_mblocks, high_mblocks; - void *new_mblock; - if ((blocks - BLOCKS_PER_MBLOCK) % (MBLOCK_SIZE / BLOCK_SIZE) != 0) { - barf("splitLargeBlock: not a multiple of a megablock"); - } - low_mblocks = 1 + (blocks - BLOCKS_PER_MBLOCK) / (MBLOCK_SIZE / BLOCK_SIZE); - high_mblocks = (bd->blocks - blocks) / (MBLOCK_SIZE / BLOCK_SIZE); - - new_mblock = (void *) ((P_)MBLOCK_ROUND_DOWN(bd) + (W_)low_mblocks * MBLOCK_SIZE_W); - initMBlock(new_mblock); - new_bd = FIRST_BDESCR(new_mblock); - new_bd->blocks = MBLOCK_GROUP_BLOCKS(high_mblocks); - - ASSERT(blocks + new_bd->blocks == - bd->blocks + BLOCKS_PER_MBLOCK - MBLOCK_SIZE/BLOCK_SIZE); - } - else - { - // NB. we're not updating all the bdescrs in the split groups to - // point to the new heads, so this can only be used for large - // objects which do not start in the non-head block. 
- new_bd = bd + blocks; - new_bd->blocks = bd->blocks - blocks; - } - bd->blocks = blocks; - - return new_bd; -} - static void initMBlock(void *mblock) { diff --git a/rts/sm/Compact.c b/rts/sm/Compact.c index 977e31d811..4f3dcf2525 100644 --- a/rts/sm/Compact.c +++ b/rts/sm/Compact.c @@ -335,8 +335,9 @@ thread_stack(StgPtr p, StgPtr stack_end) case CATCH_STM_FRAME: case ATOMICALLY_FRAME: case UPDATE_FRAME: - case STOP_FRAME: - case CATCH_FRAME: + case UNDERFLOW_FRAME: + case STOP_FRAME: + case CATCH_FRAME: case RET_SMALL: bitmap = BITMAP_BITS(info->i.layout.bitmap); size = BITMAP_SIZE(info->i.layout.bitmap); @@ -480,8 +481,8 @@ thread_TSO (StgTSO *tso) thread_(&tso->trec); - thread_stack(tso->sp, &(tso->stack[tso->stack_size])); - return (StgPtr)tso + tso_sizeW(tso); + thread_(&tso->stackobj); + return (StgPtr)tso + sizeofW(StgTSO); } @@ -521,9 +522,12 @@ update_fwd_large( bdescr *bd ) continue; } - case TSO: - thread_TSO((StgTSO *)p); - continue; + case STACK: + { + StgStack *stack = (StgStack*)p; + thread_stack(stack->sp, stack->stack + stack->stack_size); + continue; + } case AP_STACK: thread_AP_STACK((StgAP_STACK *)p); @@ -706,6 +710,13 @@ thread_obj (StgInfoTable *info, StgPtr p) case TSO: return thread_TSO((StgTSO *)p); + case STACK: + { + StgStack *stack = (StgStack*)p; + thread_stack(stack->sp, stack->stack + stack->stack_size); + return p + stack_sizeW(stack); + } + case TREC_CHUNK: { StgWord i; @@ -899,8 +910,8 @@ update_bkwd_compact( generation *gen ) } // relocate TSOs - if (info->type == TSO) { - move_TSO((StgTSO *)p, (StgTSO *)free); + if (info->type == STACK) { + move_STACK((StgStack *)p, (StgStack *)free); } free += size; diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c index 61cf10bcbe..65da0762f2 100644 --- a/rts/sm/Evac.c +++ b/rts/sm/Evac.c @@ -485,14 +485,7 @@ loop: /* evacuate large objects by re-linking them onto a different list. */ if (bd->flags & BF_LARGE) { - info = get_itbl(q); - if (info->type == TSO && - ((StgTSO *)q)->what_next == ThreadRelocated) { - q = (StgClosure *)((StgTSO *)q)->_link; - *p = q; - goto loop; - } - evacuate_large((P_)q); + evacuate_large((P_)q); return; } @@ -675,6 +668,7 @@ loop: case RET_BIG: case RET_DYN: case UPDATE_FRAME: + case UNDERFLOW_FRAME: case STOP_FRAME: case CATCH_FRAME: case CATCH_STM_FRAME: @@ -709,31 +703,28 @@ loop: return; case TSO: - { - StgTSO *tso = (StgTSO *)q; + copy(p,info,q,sizeofW(StgTSO),gen); + evacuate((StgClosure**)&(((StgTSO*)(*p))->stackobj)); + return; - /* Deal with redirected TSOs (a TSO that's had its stack enlarged). - */ - if (tso->what_next == ThreadRelocated) { - q = (StgClosure *)tso->_link; - *p = q; - goto loop; - } + case STACK: + { + StgStack *stack = (StgStack *)q; - /* To evacuate a small TSO, we need to adjust the stack pointer + /* To evacuate a small STACK, we need to adjust the stack pointer */ { - StgTSO *new_tso; + StgStack *new_stack; StgPtr r, s; rtsBool mine; - mine = copyPart(p,(StgClosure *)tso, tso_sizeW(tso), - sizeofW(StgTSO), gen); + mine = copyPart(p,(StgClosure *)stack, stack_sizeW(stack), + sizeofW(StgStack), gen); if (mine) { - new_tso = (StgTSO *)*p; - move_TSO(tso, new_tso); - for (r = tso->sp, s = new_tso->sp; - r < tso->stack+tso->stack_size;) { + new_stack = (StgStack *)*p; + move_STACK(stack, new_stack); + for (r = stack->sp, s = new_stack->sp; + r < stack->stack + stack->stack_size;) { *s++ = *r++; } } @@ -952,7 +943,7 @@ selector_loop: // For the purposes of LDV profiling, we have destroyed // the original selector thunk, p. 
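Several hunks in this patch collapse the old DEBUG_FILL_SLOP / LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC pair into a single OVERWRITING_CLOSURE() call. The point of slop-zeroing, shown on a toy object layout that is not the RTS's:

    #include <stdio.h>

    /* a toy heap object: a size word followed by its payload */
    typedef struct { size_t size; long payload[8]; } Obj;

    /* overwrite obj with a smaller object, zeroing the leftover slop so a
       linear heap walker never misreads stale payload words as a header */
    static void overwrite(Obj *o, size_t new_size)
    {
        for (size_t i = new_size; i < o->size; i++)
            o->payload[i] = 0;          /* the "slop" */
        o->size = new_size;
    }

    int main(void)
    {
        Obj o = { 8, { 1, 2, 3, 4, 5, 6, 7, 8 } };
        overwrite(&o, 2);               /* e.g. thunk turned into indirection */
        for (int i = 0; i < 8; i++) printf("%ld ", o.payload[i]);
        printf("\n");
        return 0;
    }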
SET_INFO(p, (StgInfoTable *)info_ptr); - LDV_RECORD_DEAD_FILL_SLOP_DYNAMIC((StgClosure *)p); + OVERWRITING_CLOSURE(p); SET_INFO(p, &stg_WHITEHOLE_info); #endif diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c index f69c81d5e0..97af17a02c 100644 --- a/rts/sm/GCAux.c +++ b/rts/sm/GCAux.c @@ -67,12 +67,7 @@ isAlive(StgClosure *p) // large objects use the evacuated flag if (bd->flags & BF_LARGE) { - if (get_itbl(q)->type == TSO && - ((StgTSO *)p)->what_next == ThreadRelocated) { - p = (StgClosure *)((StgTSO *)p)->_link; - continue; - } - return NULL; + return NULL; } // check the mark bit for compacted steps @@ -98,13 +93,6 @@ isAlive(StgClosure *p) p = ((StgInd *)q)->indirectee; continue; - case TSO: - if (((StgTSO *)q)->what_next == ThreadRelocated) { - p = (StgClosure *)((StgTSO *)q)->_link; - continue; - } - return NULL; - default: // dead. return NULL; diff --git a/rts/sm/MarkWeak.c b/rts/sm/MarkWeak.c index aadd5757af..72f0ade797 100644 --- a/rts/sm/MarkWeak.c +++ b/rts/sm/MarkWeak.c @@ -260,12 +260,6 @@ static rtsBool tidyThreadList (generation *gen) } ASSERT(get_itbl(t)->type == TSO); - if (t->what_next == ThreadRelocated) { - next = t->_link; - *prev = next; - continue; - } - next = t->global_link; // if the thread is not masking exceptions but there are diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c index dfa98657a0..22b7f64136 100644 --- a/rts/sm/Sanity.c +++ b/rts/sm/Sanity.c @@ -35,6 +35,7 @@ static void checkSmallBitmap ( StgPtr payload, StgWord bitmap, nat ); static void checkLargeBitmap ( StgPtr payload, StgLargeBitmap*, nat ); static void checkClosureShallow ( StgClosure * ); +static void checkSTACK (StgStack *stack); /* ----------------------------------------------------------------------------- Check stack sanity @@ -139,6 +140,7 @@ checkStackFrame( StgPtr c ) case CATCH_STM_FRAME: case CATCH_FRAME: // small bitmap cases (<= 32 entries) + case UNDERFLOW_FRAME: case STOP_FRAME: case RET_SMALL: size = BITMAP_SIZE(info->i.layout.bitmap); @@ -331,7 +333,7 @@ checkClosure( StgClosure* p ) ASSERT(get_itbl(bq->owner)->type == TSO); ASSERT(bq->queue == (MessageBlackHole*)END_TSO_QUEUE - || get_itbl(bq->queue)->type == TSO); + || bq->queue->header.info == &stg_MSG_BLACKHOLE_info); ASSERT(bq->link == (StgBlockingQueue*)END_TSO_QUEUE || get_itbl(bq->link)->type == IND || get_itbl(bq->link)->type == BLOCKING_QUEUE); @@ -384,6 +386,7 @@ checkClosure( StgClosure* p ) case RET_BIG: case RET_DYN: case UPDATE_FRAME: + case UNDERFLOW_FRAME: case STOP_FRAME: case CATCH_FRAME: case ATOMICALLY_FRAME: @@ -431,7 +434,11 @@ checkClosure( StgClosure* p ) case TSO: checkTSO((StgTSO *)p); - return tso_sizeW((StgTSO *)p); + return sizeofW(StgTSO); + + case STACK: + checkSTACK((StgStack*)p); + return stack_sizeW((StgStack*)p); case TREC_CHUNK: { @@ -514,19 +521,21 @@ checkLargeObjects(bdescr *bd) } } -void -checkTSO(StgTSO *tso) +static void +checkSTACK (StgStack *stack) { - StgPtr sp = tso->sp; - StgPtr stack = tso->stack; - StgOffset stack_size = tso->stack_size; - StgPtr stack_end = stack + stack_size; + StgPtr sp = stack->sp; + StgOffset stack_size = stack->stack_size; + StgPtr stack_end = stack->stack + stack_size; - if (tso->what_next == ThreadRelocated) { - checkTSO(tso->_link); - return; - } + ASSERT(stack->stack <= sp && sp <= stack_end); + checkStackChunk(sp, stack_end); +} + +void +checkTSO(StgTSO *tso) +{ if (tso->what_next == ThreadKilled) { /* The garbage collector doesn't bother following any pointers * from dead threads, so don't check sanity here. 
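When the collector copies a STACK (the rts/sm/Evac.c hunk above), move_STACK, in the rts/sm/Storage.c hunk further below, slides the saved sp by exactly the distance the object moved. The pointer arithmetic on its own, with invented types:

    #include <stdio.h>
    #include <string.h>
    #include <stddef.h>

    typedef struct { long *sp; long stack[16]; } Stack;

    /* after memcpy'ing src to dest, redirect the interior sp pointer by the
       same displacement; this is the essence of move_STACK() */
    static void move_stack(Stack *src, Stack *dest)
    {
        memcpy(dest, src, sizeof(Stack));
        ptrdiff_t diff = (char *)dest - (char *)src;
        dest->sp = (long *)((char *)dest->sp + diff);
    }

    int main(void)
    {
        Stack a, b;
        a.sp = &a.stack[10];
        move_stack(&a, &b);
        printf("sp offset preserved: %td\n", b.sp - b.stack);  /* prints 10 */
        return 0;
    }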
@@ -537,16 +546,24 @@ checkTSO(StgTSO *tso) ASSERT(tso->_link == END_TSO_QUEUE || tso->_link->header.info == &stg_MVAR_TSO_QUEUE_info || tso->_link->header.info == &stg_TSO_info); - ASSERT(LOOKS_LIKE_CLOSURE_PTR(tso->block_info.closure)); + + if ( tso->why_blocked == BlockedOnMVar + || tso->why_blocked == BlockedOnBlackHole + || tso->why_blocked == BlockedOnMsgThrowTo + || tso->why_blocked == NotBlocked + ) { + ASSERT(LOOKS_LIKE_CLOSURE_PTR(tso->block_info.closure)); + } + ASSERT(LOOKS_LIKE_CLOSURE_PTR(tso->bq)); ASSERT(LOOKS_LIKE_CLOSURE_PTR(tso->blocked_exceptions)); + ASSERT(LOOKS_LIKE_CLOSURE_PTR(tso->stackobj)); - ASSERT(stack <= sp && sp < stack_end); - - checkStackChunk(sp, stack_end); + // XXX are we checking the stack twice? + checkSTACK(tso->stackobj); } -/* +/* Check that all TSOs have been evacuated. Optionally also check the sanity of the TSOs. */ @@ -564,11 +581,9 @@ checkGlobalTSOList (rtsBool checkTSOs) if (checkTSOs) checkTSO(tso); - tso = deRefTSO(tso); - // If this TSO is dirty and in an old generation, it better // be on the mutable list. - if (tso->dirty || (tso->flags & TSO_LINK_DIRTY)) { + if (tso->dirty) { ASSERT(Bdescr((P_)tso)->gen_no == 0 || (tso->flags & TSO_MARKED)); tso->flags &= ~TSO_MARKED; } diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c index d01442b34e..d7e16eaf2a 100644 --- a/rts/sm/Scav.c +++ b/rts/sm/Scav.c @@ -51,14 +51,6 @@ scavengeTSO (StgTSO *tso) { rtsBool saved_eager; - if (tso->what_next == ThreadRelocated) { - // the only way this can happen is if the old TSO was on the - // mutable list. We might have other links to this defunct - // TSO, so we must update its link field. - evacuate((StgClosure**)&tso->_link); - return; - } - debugTrace(DEBUG_gc,"scavenging thread %d",(int)tso->id); // update the pointer from the Task. 
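With the stack split out of the TSO, scavengeTSO above now just evacuates tso->stackobj, and each object carries its own dirty bit (see dirty_STACK in the rts/sm/Storage.c hunk below). A toy of the record-once protocol those helpers follow:

    #include <stdio.h>

    typedef struct { int dirty; } Stack;

    /* toy mutable list */
    static Stack *mut_list[16];
    static int mut_len = 0;

    /* mirror of dirty_STACK(): flag the object and remember it exactly once,
       so the next GC knows to re-scan it */
    static void dirty_stack(Stack *s)
    {
        if (s->dirty == 0) {
            s->dirty = 1;
            mut_list[mut_len++] = s;
        }
    }

    int main(void)
    {
        Stack s = { 0 };
        dirty_stack(&s);
        dirty_stack(&s);            /* second call is a no-op */
        printf("mutable list entries: %d\n", mut_len);  /* prints 1 */
        return 0;
    }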
@@ -69,17 +61,13 @@ scavengeTSO (StgTSO *tso) saved_eager = gct->eager_promotion; gct->eager_promotion = rtsFalse; - evacuate((StgClosure **)&tso->blocked_exceptions); evacuate((StgClosure **)&tso->bq); // scavange current transaction record evacuate((StgClosure **)&tso->trec); - - // scavenge this thread's stack - scavenge_stack(tso->sp, &(tso->stack[tso->stack_size])); - tso->dirty = gct->failed_to_evac; + evacuate((StgClosure **)&tso->stackobj); evacuate((StgClosure **)&tso->_link); if ( tso->why_blocked == BlockedOnMVar @@ -99,11 +87,7 @@ scavengeTSO (StgTSO *tso) } #endif - if (tso->dirty == 0 && gct->failed_to_evac) { - tso->flags |= TSO_LINK_DIRTY; - } else { - tso->flags &= ~TSO_LINK_DIRTY; - } + tso->dirty = gct->failed_to_evac; gct->eager_promotion = saved_eager; } @@ -661,12 +645,25 @@ scavenge_block (bdescr *bd) case TSO: { - StgTSO *tso = (StgTSO *)p; - scavengeTSO(tso); - p += tso_sizeW(tso); + scavengeTSO((StgTSO *)p); + p += sizeofW(StgTSO); break; } + case STACK: + { + StgStack *stack = (StgStack*)p; + + gct->eager_promotion = rtsFalse; + + scavenge_stack(stack->sp, stack->stack + stack->stack_size); + stack->dirty = gct->failed_to_evac; + p += stack_sizeW(stack); + + gct->eager_promotion = saved_eager_promotion; + break; + } + case MUT_PRIM: { StgPtr end; @@ -991,6 +988,19 @@ scavenge_mark_stack(void) break; } + case STACK: + { + StgStack *stack = (StgStack*)p; + + gct->eager_promotion = rtsFalse; + + scavenge_stack(stack->sp, stack->stack + stack->stack_size); + stack->dirty = gct->failed_to_evac; + + gct->eager_promotion = saved_eager_promotion; + break; + } + case MUT_PRIM: { StgPtr end; @@ -1227,6 +1237,19 @@ scavenge_one(StgPtr p) break; } + case STACK: + { + StgStack *stack = (StgStack*)p; + + gct->eager_promotion = rtsFalse; + + scavenge_stack(stack->sp, stack->stack + stack->stack_size); + stack->dirty = gct->failed_to_evac; + + gct->eager_promotion = saved_eager_promotion; + break; + } + case MUT_PRIM: { StgPtr end; @@ -1374,33 +1397,7 @@ scavenge_mutable_list(bdescr *bd, generation *gen) recordMutableGen_GC((StgClosure *)p,gen->no); continue; } - case TSO: { - StgTSO *tso = (StgTSO *)p; - if (tso->dirty == 0) { - // Should be on the mutable list because its link - // field is dirty. However, in parallel GC we may - // have a thread on multiple mutable lists, so - // this assertion would be invalid: - // ASSERT(tso->flags & TSO_LINK_DIRTY); - - evacuate((StgClosure **)&tso->_link); - if ( tso->why_blocked == BlockedOnMVar - || tso->why_blocked == BlockedOnBlackHole - || tso->why_blocked == BlockedOnMsgThrowTo - || tso->why_blocked == NotBlocked - ) { - evacuate((StgClosure **)&tso->block_info.prev); - } - if (gct->failed_to_evac) { - recordMutableGen_GC((StgClosure *)p,gen->no); - gct->failed_to_evac = rtsFalse; - } else { - tso->flags &= ~TSO_LINK_DIRTY; - } - continue; - } - } - default: + default: ; } @@ -1643,6 +1640,7 @@ scavenge_stack(StgPtr p, StgPtr stack_end) case CATCH_STM_FRAME: case CATCH_RETRY_FRAME: case ATOMICALLY_FRAME: + case UNDERFLOW_FRAME: case STOP_FRAME: case CATCH_FRAME: case RET_SMALL: diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index 2172f9bdb0..4247d28e05 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -495,12 +495,12 @@ resizeNurseries (nat blocks) /* ----------------------------------------------------------------------------- - move_TSO is called to update the TSO structure after it has been + move_STACK is called to update the TSO structure after it has been moved from one place to another. 
-------------------------------------------------------------------------- */ void -move_TSO (StgTSO *src, StgTSO *dest) +move_STACK (StgStack *src, StgStack *dest) { ptrdiff_t diff; @@ -510,45 +510,6 @@ move_TSO (StgTSO *src, StgTSO *dest) } /* ----------------------------------------------------------------------------- - split N blocks off the front of the given bdescr, returning the - new block group. We add the remainder to the large_blocks list - in the same step as the original block. - -------------------------------------------------------------------------- */ - -bdescr * -splitLargeBlock (bdescr *bd, nat blocks) -{ - bdescr *new_bd; - - ACQUIRE_SM_LOCK; - - ASSERT(countBlocks(bd->gen->large_objects) == bd->gen->n_large_blocks); - - // subtract the original number of blocks from the counter first - bd->gen->n_large_blocks -= bd->blocks; - - new_bd = splitBlockGroup (bd, blocks); - initBdescr(new_bd, bd->gen, bd->gen->to); - new_bd->flags = BF_LARGE | (bd->flags & BF_EVACUATED); - // if new_bd is in an old generation, we have to set BF_EVACUATED - new_bd->free = bd->free; - dbl_link_onto(new_bd, &bd->gen->large_objects); - - ASSERT(new_bd->free <= new_bd->start + new_bd->blocks * BLOCK_SIZE_W); - - // add the new number of blocks to the counter. Due to the gaps - // for block descriptors, new_bd->blocks + bd->blocks might not be - // equal to the original bd->blocks, which is why we do it this way. - bd->gen->n_large_blocks += bd->blocks + new_bd->blocks; - - ASSERT(countBlocks(bd->gen->large_objects) == bd->gen->n_large_blocks); - - RELEASE_SM_LOCK; - - return new_bd; -} - -/* ----------------------------------------------------------------------------- allocate() This allocates memory in the current thread - it is intended for @@ -731,8 +692,8 @@ dirty_MUT_VAR(StgRegTable *reg, StgClosure *p) void setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target) { - if (tso->dirty == 0 && (tso->flags & TSO_LINK_DIRTY) == 0) { - tso->flags |= TSO_LINK_DIRTY; + if (tso->dirty == 0) { + tso->dirty = 1; recordClosureMutated(cap,(StgClosure*)tso); } tso->_link = target; @@ -741,8 +702,8 @@ setTSOLink (Capability *cap, StgTSO *tso, StgTSO *target) void setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target) { - if (tso->dirty == 0 && (tso->flags & TSO_LINK_DIRTY) == 0) { - tso->flags |= TSO_LINK_DIRTY; + if (tso->dirty == 0) { + tso->dirty = 1; recordClosureMutated(cap,(StgClosure*)tso); } tso->block_info.prev = target; @@ -751,10 +712,19 @@ setTSOPrev (Capability *cap, StgTSO *tso, StgTSO *target) void dirty_TSO (Capability *cap, StgTSO *tso) { - if (tso->dirty == 0 && (tso->flags & TSO_LINK_DIRTY) == 0) { + if (tso->dirty == 0) { + tso->dirty = 1; recordClosureMutated(cap,(StgClosure*)tso); } - tso->dirty = 1; +} + +void +dirty_STACK (Capability *cap, StgStack *stack) +{ + if (stack->dirty == 0) { + stack->dirty = 1; + recordClosureMutated(cap,(StgClosure*)stack); + } } /* diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h index e541193573..3ff3380462 100644 --- a/rts/sm/Storage.h +++ b/rts/sm/Storage.h @@ -146,7 +146,7 @@ extern bdescr *exec_block; #define END_OF_STATIC_LIST ((StgClosure*)1) -void move_TSO (StgTSO *src, StgTSO *dest); +void move_STACK (StgStack *src, StgStack *dest); extern StgClosure * caf_list; extern StgClosure * revertible_caf_list; diff --git a/rts/win32/AsyncIO.c b/rts/win32/AsyncIO.c index 5dedee0c70..ff2e1a2498 100644 --- a/rts/win32/AsyncIO.c +++ b/rts/win32/AsyncIO.c @@ -276,20 +276,7 @@ start: prev = NULL; for(tso = blocked_queue_hd ; tso != END_TSO_QUEUE; tso = 
tso->_link) { - if (tso->what_next == ThreadRelocated) { - /* Drop the TSO from blocked_queue */ - if (prev) { - setTSOLink(&MainCapability, prev, tso->_link); - } else { - blocked_queue_hd = tso->_link; - } - if (blocked_queue_tl == tso) { - blocked_queue_tl = prev ? prev : END_TSO_QUEUE; - } - continue; - } - - switch(tso->why_blocked) { + switch(tso->why_blocked) { case BlockedOnRead: case BlockedOnWrite: case BlockedOnDoProc:
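Taken together, the patch replaces one relocatable, contiguous stack per thread with a chain of fixed-size chunks linked by underflow frames. To close, here is a compact end-to-end toy of the two operations; every type and constant is invented for the sketch, and it is not the RTS implementation:

    #include <assert.h>
    #include <stdio.h>
    #include <string.h>

    #define CHUNK_WORDS 16

    typedef struct Chunk {
        long *sp;                 /* grows downward */
        struct Chunk *next;       /* older chunk (what the underflow frame holds) */
        long words[CHUNK_WORDS];
    } Chunk;

    static Chunk *new_chunk(Chunk *older)
    {
        static Chunk pool[8]; static int used = 0;
        Chunk *c = &pool[used++];
        c->sp = c->words + CHUNK_WORDS;   /* empty: sp at the high end */
        c->next = older;
        return c;
    }

    /* overflow: open a fresh chunk and carry `carry` words of the hot end over */
    static Chunk *overflow(Chunk *old, int carry)
    {
        Chunk *top = new_chunk(old);
        top->sp -= carry;
        memcpy(top->sp, old->sp, carry * sizeof(long));
        old->sp += carry;
        return top;
    }

    /* underflow: copy any words left in the finished top chunk (the return
       values) down to the older chunk and resume there */
    static Chunk *underflow(Chunk *top)
    {
        Chunk *older = top->next;
        int retvals = (int)(top->words + CHUNK_WORDS - top->sp);
        assert(older->sp - older->words >= retvals);
        older->sp -= retvals;
        memcpy(older->sp, top->sp, retvals * sizeof(long));
        return older;
    }

    int main(void)
    {
        Chunk *stk = new_chunk(NULL);
        *--stk->sp = 1; *--stk->sp = 2; *--stk->sp = 3;  /* push three words */
        stk = overflow(stk, 2);      /* keep the two hottest words with us */
        *--stk->sp = 4;              /* work in the new chunk */
        stk->sp += 1;                /* ... and pop it again */
        stk = underflow(stk);        /* top chunk done; two leftover words return */
        printf("top of stack: %ld\n", stk->sp[0]);  /* prints 3 */
        return 0;
    }

The real versions additionally respect the -kb buffer quota when choosing how much to carry over, unlink chunks that become empty, and cooperate with the garbage collector through the per-stack dirty bits, as the hunks above show.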