-rw-r--r--  includes/ClosureTypes.h  |   3
-rw-r--r--  rts/ClosureFlags.c       |   5
-rw-r--r--  rts/StgMiscClosures.cmm  |   5
-rw-r--r--  rts/sm/BlockAlloc.c      |   6
-rw-r--r--  rts/sm/Evac.c            | 351
-rw-r--r--  rts/sm/GC.c              | 241
-rw-r--r--  rts/sm/GC.h              |  11
-rw-r--r--  rts/sm/Scav.c            |  65
-rw-r--r--  rts/sm/Scav.h            |   5
9 files changed, 487 insertions, 205 deletions
diff --git a/includes/ClosureTypes.h b/includes/ClosureTypes.h
index b7bebd6c6b..8247a1516f 100644
--- a/includes/ClosureTypes.h
+++ b/includes/ClosureTypes.h
@@ -93,6 +93,7 @@
#define ATOMICALLY_FRAME 69
#define CATCH_RETRY_FRAME 70
#define CATCH_STM_FRAME 71
-#define N_CLOSURE_TYPES 72
+#define WHITEHOLE 72
+#define N_CLOSURE_TYPES 73
#endif /* CLOSURETYPES_H */
diff --git a/rts/ClosureFlags.c b/rts/ClosureFlags.c
index 12e6632f24..eea609eff7 100644
--- a/rts/ClosureFlags.c
+++ b/rts/ClosureFlags.c
@@ -97,9 +97,10 @@ StgWord16 closure_flags[] = {
/* TREC_HEADER = */ ( _NS| _MUT|_UPT ),
/* ATOMICALLY_FRAME = */ ( _BTM ),
/* CATCH_RETRY_FRAME = */ ( _BTM ),
-/* CATCH_STM_FRAME = */ ( _BTM )
+/* CATCH_STM_FRAME = */ ( _BTM ),
+/* WHITEHOLE = */ ( 0 )
};
-#if N_CLOSURE_TYPES != 72
+#if N_CLOSURE_TYPES != 73
#error Closure types changed: update ClosureFlags.c!
#endif
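
The #if/#error guard above is how the RTS keeps the flags table in lock-step with ClosureTypes.h. On a C11 compiler the same invariant could be checked against the table itself rather than the macro; a minimal sketch, not what the RTS uses (the RTS predates C11):

    /* Sketch: C11 equivalent of the #if/#error guard, checking the
     * actual table length instead of trusting N_CLOSURE_TYPES. */
    _Static_assert(sizeof(closure_flags) / sizeof(closure_flags[0])
                   == N_CLOSURE_TYPES,
                   "Closure types changed: update ClosureFlags.c!");
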
diff --git a/rts/StgMiscClosures.cmm b/rts/StgMiscClosures.cmm
index 270c600f7c..f9076ca9a9 100644
--- a/rts/StgMiscClosures.cmm
+++ b/rts/StgMiscClosures.cmm
@@ -394,12 +394,9 @@ INFO_TABLE(stg_SE_CAF_BLACKHOLE,0,1,SE_CAF_BLACKHOLE,"SE_CAF_BLACKHOLE","SE_CAF_
/* ----------------------------------------------------------------------------
Whiteholes are used for the "locked" state of a closure (see lockClosure())
-
- The closure type is BLACKHOLE, just because we need a valid closure type
- for sanity checking.
------------------------------------------------------------------------- */
-INFO_TABLE(stg_WHITEHOLE, 0,0, BLACKHOLE, "WHITEHOLE", "WHITEHOLE")
+INFO_TABLE(stg_WHITEHOLE, 0,0, WHITEHOLE, "WHITEHOLE", "WHITEHOLE")
{ foreign "C" barf("WHITEHOLE object entered!") never returns; }
/* ----------------------------------------------------------------------------
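
lockClosure(), mentioned in the comment, is why WHITEHOLE needs to be a valid closure type at all: it is the sentinel swapped into a closure's info pointer while the closure is locked. A sketch of the idiom, assuming the RTS's xchg() and write_barrier() primitives; the real definition lives in the SMP header and may differ in detail (e.g. bounded spinning with a yield):

    INLINE_HEADER StgInfoTable *
    lockClosure (StgClosure *p)
    {
        StgWord info;
        do {
            // swap the WHITEHOLE in; if we got the real info
            // pointer back, we now own the closure
            info = xchg((P_)(void *)&p->header.info,
                        (W_)&stg_WHITEHOLE_info);
            if (info != (W_)&stg_WHITEHOLE_info)
                return (StgInfoTable *)info;
            // otherwise another thread holds the lock: spin
        } while (1);
    }

    INLINE_HEADER void
    unlockClosure (StgClosure *p, const StgInfoTable *info)
    {
        // all writes to the closure must be visible before we unlock
        write_barrier();
        p->header.info = info;
    }
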
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c
index 1c4899e37e..8b8595606b 100644
--- a/rts/sm/BlockAlloc.c
+++ b/rts/sm/BlockAlloc.c
@@ -310,7 +310,8 @@ allocGroup (nat n)
{
bdescr *bd, *rem;
- ASSERT_SM_LOCK();
+ // Todo: not true in multithreaded GC, where we use allocBlock_sync().
+ // ASSERT_SM_LOCK();
if (n == 0) barf("allocGroup: requested zero blocks");
@@ -439,7 +440,8 @@ freeGroup(bdescr *p)
{
nat p_on_free_list = 0;
- ASSERT_SM_LOCK();
+ // Todo: not true in multithreaded GC
+ // ASSERT_SM_LOCK();
ASSERT(p->free != (P_)-1);
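
The disabled assertions record that, in the threaded GC, block allocation is no longer serialised by sm_mutex but by a dedicated spinlock. A plausible shape for the allocBlock_sync() the comment names, assuming the gc_alloc_block_sync spinlock this patch initialises in GarbageCollect() (hypothetical sketch, not code from the patch):

    static bdescr *
    allocBlock_sync (void)
    {
        bdescr *bd;
        // serialise on the GC's own spinlock instead of sm_mutex
        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
        bd = allocBlock();
        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
        return bd;
    }
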
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index 5b37729749..fe788cdb6f 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -27,68 +27,130 @@
static StgClosure * eval_thunk_selector (StgSelector * p, rtsBool);
-STATIC_INLINE void
-upd_evacuee(StgClosure *p, StgClosure *dest)
+STATIC_INLINE StgPtr
+alloc_for_copy (nat size, step *stp)
{
- // not true: (ToDo: perhaps it should be)
- // ASSERT(Bdescr((P_)dest)->flags & BF_EVACUATED);
- SET_INFO(p, &stg_EVACUATED_info);
- ((StgEvacuated *)p)->evacuee = dest;
+ StgPtr to;
+ step_workspace *ws;
+ bdescr *bd;
+
+ /* Find out where we're going, using the handy "to" pointer in
+ * the step of the source object. If it turns out we need to
+ * evacuate to an older generation, adjust it here (see comment
+ * by evacuate()).
+ */
+ if (stp->gen_no < gct->evac_gen) {
+ if (gct->eager_promotion) {
+ stp = &generations[gct->evac_gen].steps[0];
+ } else {
+ gct->failed_to_evac = rtsTrue;
+ }
+ }
+
+ ws = &gct->steps[stp->gen_no][stp->no];
+
+ /* chain a new block onto the to-space for the destination step if
+ * necessary.
+ */
+ bd = ws->todo_bd;
+ to = bd->free;
+ if (to + size >= bd->start + BLOCK_SIZE_W) {
+ bd = gc_alloc_todo_block(ws);
+ to = bd->free;
+ }
+ bd->free = to + size;
+
+ return to;
}
+
+STATIC_INLINE StgPtr
+alloc_for_copy_noscav (nat size, step *stp)
+{
+ StgPtr to;
+ step_workspace *ws;
+ bdescr *bd;
+ /* Find out where we're going, using the handy "to" pointer in
+ * the step of the source object. If it turns out we need to
+ * evacuate to an older generation, adjust it here (see comment
+ * by evacuate()).
+ */
+ if (stp->gen_no < gct->evac_gen) {
+ if (gct->eager_promotion) {
+ stp = &generations[gct->evac_gen].steps[0];
+ } else {
+ gct->failed_to_evac = rtsTrue;
+ }
+ }
+
+ ws = &gct->steps[stp->gen_no][stp->no];
+
+ /* chain a new block onto the to-space for the destination step if
+ * necessary.
+ */
+ bd = ws->scavd_list;
+ to = bd->free;
+ if (to + size >= bd->start + BLOCK_SIZE_W) {
+ bd = gc_alloc_scavd_block(ws);
+ to = bd->free;
+ }
+ bd->free = to + size;
+ return to;
+}
+
STATIC_INLINE StgClosure *
copy_tag(StgClosure *src, nat size, step *stp, StgWord tag)
{
StgPtr to, from;
nat i;
- step_workspace *ws;
- bdescr *bd;
-
- TICK_GC_WORDS_COPIED(size);
- /* Find out where we're going, using the handy "to" pointer in
- * the step of the source object. If it turns out we need to
- * evacuate to an older generation, adjust it here (see comment
- * by evacuate()).
- */
- if (stp->gen_no < gct->evac_gen) {
- if (gct->eager_promotion) {
- stp = &generations[gct->evac_gen].steps[0];
- } else {
- gct->failed_to_evac = rtsTrue;
- }
- }
-
- ws = &gct->steps[stp->gen_no][stp->no];
+ StgWord info;
+
+#ifdef THREADED_RTS
+ do {
+ info = xchg((StgPtr)&src->header.info, (W_)&stg_WHITEHOLE_info);
+ // spin while another thread is copying this closure
+ } while (info == (W_)&stg_WHITEHOLE_info);
+ if (info == (W_)&stg_EVACUATED_info) {
+ src->header.info = (const StgInfoTable *)info;
+ return evacuate(src); // does the failed_to_evac stuff
+ }
+#else
+ info = (W_)src->header.info;
+ src->header.info = &stg_EVACUATED_info;
+#endif
- /* chain a new block onto the to-space for the destination step if
- * necessary.
- */
- bd = ws->todo_bd;
- to = bd->free;
- if (to + size >= bd->start + BLOCK_SIZE_W) {
- bd = gc_alloc_todo_block(ws);
- to = bd->free;
- }
+ to = alloc_for_copy(size,stp);
+
+ TICK_GC_WORDS_COPIED(size);
- from = (StgPtr)src;
- bd->free = to + size;
- for (i = 0; i < size; i++) { // unroll for small i
- to[i] = from[i];
- }
+ from = (StgPtr)src;
+ to[0] = info;
+ for (i = 1; i < size; i++) { // unroll for small i
+ to[i] = from[i];
+ }
+
+ // retag pointer before updating EVACUATE closure and returning
+ to = (StgPtr)TAG_CLOSURE(tag,(StgClosure*)to);
- /* retag pointer before updating EVACUATE closure and returning */
- to = (StgPtr)TAG_CLOSURE(tag,(StgClosure*)to);
+// if (to+size+2 < bd->start + BLOCK_SIZE_W) {
+// __builtin_prefetch(to + size + 2, 1);
+// }
- upd_evacuee((StgClosure *)from,(StgClosure *)to);
+ ((StgEvacuated*)from)->evacuee = (StgClosure *)to;
+#ifdef THREADED_RTS
+ write_barrier();
+ ((StgEvacuated*)from)->header.info = &stg_EVACUATED_info;
+#endif
#ifdef PROFILING
- // We store the size of the just evacuated object in the LDV word so that
- // the profiler can guess the position of the next object later.
- SET_EVACUAEE_FOR_LDV(from, size);
+ // We store the size of the just evacuated object in the LDV word so that
+ // the profiler can guess the position of the next object later.
+ SET_EVACUAEE_FOR_LDV(from, size);
#endif
- return (StgClosure *)to;
+ return (StgClosure *)to;
}
+
// Same as copy() above, except the object will be allocated in memory
// that will not be scavenged. Used for objects that have no pointer
@@ -96,54 +158,48 @@ copy_tag(StgClosure *src, nat size, step *stp,StgWord tag)
STATIC_INLINE StgClosure *
copy_noscav_tag(StgClosure *src, nat size, step *stp, StgWord tag)
{
- StgPtr to, from;
- nat i;
- step_workspace *ws;
- bdescr *bd;
-
- TICK_GC_WORDS_COPIED(size);
- /* Find out where we're going, using the handy "to" pointer in
- * the step of the source object. If it turns out we need to
- * evacuate to an older generation, adjust it here (see comment
- * by evacuate()).
- */
- if (stp->gen_no < gct->evac_gen) {
- if (gct->eager_promotion) {
- stp = &generations[gct->evac_gen].steps[0];
- } else {
- gct->failed_to_evac = rtsTrue;
- }
- }
-
- ws = &gct->steps[stp->gen_no][stp->no];
-
- /* chain a new block onto the to-space for the destination step if
- * necessary.
- */
- bd = ws->scavd_list;
- to = bd->free;
- if (to + size >= bd->start + BLOCK_SIZE_W) {
- bd = gc_alloc_scavd_block(ws);
- to = bd->free;
- }
-
- from = (StgPtr)src;
- bd->free = to + size;
- for (i = 0; i < size; i++) { // unroll for small i
- to[i] = from[i];
- }
+ StgPtr to, from;
+ nat i;
+ StgWord info;
+
+#ifdef THREADED_RTS
+ do {
+ info = xchg((StgPtr)&src->header.info, (W_)&stg_WHITEHOLE_info);
+ } while (info == (W_)&stg_WHITEHOLE_info);
+ if (info == (W_)&stg_EVACUATED_info) {
+ src->header.info = (const StgInfoTable *)info;
+ return evacuate(src); // does the failed_to_evac stuff
+ }
+#else
+ info = (W_)src->header.info;
+ src->header.info = &stg_EVACUATED_info;
+#endif
+
+ to = alloc_for_copy_noscav(size,stp);
- /* retag pointer before updating EVACUATE closure and returning */
- to = (StgPtr)TAG_CLOSURE(tag,(StgClosure*)to);
+ TICK_GC_WORDS_COPIED(size);
+
+ from = (StgPtr)src;
+ to[0] = info;
+ for (i = 1; i < size; i++) { // unroll for small i
+ to[i] = from[i];
+ }
- upd_evacuee((StgClosure *)from,(StgClosure *)to);
+ // retag pointer before updating EVACUATE closure and returning
+ to = (StgPtr)TAG_CLOSURE(tag,(StgClosure*)to);
+ ((StgEvacuated*)from)->evacuee = (StgClosure *)to;
+#ifdef THREADED_RTS
+ write_barrier();
+ ((StgEvacuated*)from)->header.info = &stg_EVACUATED_info;
+#endif
+
#ifdef PROFILING
- // We store the size of the just evacuated object in the LDV word so that
- // the profiler can guess the position of the next object later.
- SET_EVACUAEE_FOR_LDV(from, size);
+ // We store the size of the just evacuated object in the LDV word so that
+ // the profiler can guess the position of the next object later.
+ SET_EVACUAEE_FOR_LDV(from, size);
#endif
- return (StgClosure *)to;
+ return (StgClosure *)to;
}
@@ -154,46 +210,48 @@ copy_noscav_tag(StgClosure *src, nat size, step *stp, StgWord tag)
static StgClosure *
copyPart(StgClosure *src, nat size_to_reserve, nat size_to_copy, step *stp)
{
- StgPtr to, from;
- nat i;
- step_workspace *ws;
- bdescr *bd;
-
- TICK_GC_WORDS_COPIED(size_to_copy);
- if (stp->gen_no < gct->evac_gen) {
- if (gct->eager_promotion) {
- stp = &generations[gct->evac_gen].steps[0];
- } else {
- gct->failed_to_evac = rtsTrue;
- }
- }
-
- ws = &gct->steps[stp->gen_no][stp->no];
-
- bd = ws->todo_bd;
- to = bd->free;
- if (to + size_to_reserve >= bd->start + BLOCK_SIZE_W) {
- bd = gc_alloc_todo_block(ws);
- to = bd->free;
- }
+ StgPtr to, from;
+ nat i;
+ StgWord info;
+
+#ifdef THREADED_RTS
+ do {
+ info = xchg((StgPtr)&src->header.info, (W_)&stg_WHITEHOLE_info);
+ } while (info == (W_)&stg_WHITEHOLE_info);
+ if (info == (W_)&stg_EVACUATED_info) {
+ src->header.info = (const StgInfoTable *)info;
+ return evacuate(src); // does the failed_to_evac stuff
+ }
+#else
+ info = (W_)src->header.info;
+ src->header.info = &stg_EVACUATED_info;
+#endif
+
+ to = alloc_for_copy(size_to_reserve, stp);
- from = (StgPtr)src;
- bd->free = to + size_to_reserve;
- for (i = 0; i < size_to_copy; i++) { // unroll for small i
- to[i] = from[i];
- }
-
- upd_evacuee((StgClosure *)from,(StgClosure *)to);
+ TICK_GC_WORDS_COPIED(size_to_copy);
+ from = (StgPtr)src;
+ to[0] = info;
+ for (i = 1; i < size_to_copy; i++) { // unroll for small i
+ to[i] = from[i];
+ }
+
+ ((StgEvacuated*)from)->evacuee = (StgClosure *)to;
+#ifdef THREADED_RTS
+ write_barrier();
+ ((StgEvacuated*)from)->header.info = &stg_EVACUATED_info;
+#endif
+
#ifdef PROFILING
- // We store the size of the just evacuated object in the LDV word so that
- // the profiler can guess the position of the next object later.
- SET_EVACUAEE_FOR_LDV(from, size_to_reserve);
- // fill the slop
- if (size_to_reserve - size_to_copy > 0)
- LDV_FILL_SLOP(to + size_to_copy - 1, (int)(size_to_reserve - size_to_copy));
+ // We store the size of the just evacuated object in the LDV word so that
+ // the profiler can guess the position of the next object later.
+ SET_EVACUAEE_FOR_LDV(from, size_to_reserve);
+ // fill the slop
+ if (size_to_reserve - size_to_copy > 0)
+ LDV_FILL_SLOP(to + size_to_copy - 1, (int)(size_to_reserve - size_to_copy));
#endif
- return (StgClosure *)to;
+ return (StgClosure *)to;
}
@@ -343,18 +401,24 @@ loop:
switch (info->type) {
case THUNK_STATIC:
- if (info->srt_bitmap != 0 &&
- *THUNK_STATIC_LINK((StgClosure *)q) == NULL) {
- *THUNK_STATIC_LINK((StgClosure *)q) = static_objects;
- static_objects = (StgClosure *)q;
+ if (info->srt_bitmap != 0) {
+ ACQUIRE_SPIN_LOCK(&static_objects_sync);
+ if (*THUNK_STATIC_LINK((StgClosure *)q) == NULL) {
+ *THUNK_STATIC_LINK((StgClosure *)q) = static_objects;
+ static_objects = (StgClosure *)q;
+ }
+ RELEASE_SPIN_LOCK(&static_objects_sync);
}
return q;
case FUN_STATIC:
- if (info->srt_bitmap != 0 &&
- *FUN_STATIC_LINK((StgClosure *)q) == NULL) {
- *FUN_STATIC_LINK((StgClosure *)q) = static_objects;
- static_objects = (StgClosure *)q;
+ if (info->srt_bitmap != 0) {
+ ACQUIRE_SPIN_LOCK(&static_objects_sync);
+ if (*FUN_STATIC_LINK((StgClosure *)q) == NULL) {
+ *FUN_STATIC_LINK((StgClosure *)q) = static_objects;
+ static_objects = (StgClosure *)q;
+ }
+ RELEASE_SPIN_LOCK(&static_objects_sync);
}
return q;
@@ -363,17 +427,25 @@ loop:
* on the CAF list, so don't do anything with it here (we'll
* scavenge it later).
*/
- if (((StgIndStatic *)q)->saved_info == NULL
- && *IND_STATIC_LINK((StgClosure *)q) == NULL) {
- *IND_STATIC_LINK((StgClosure *)q) = static_objects;
- static_objects = (StgClosure *)q;
+ if (((StgIndStatic *)q)->saved_info == NULL) {
+ ACQUIRE_SPIN_LOCK(&static_objects_sync);
+ if (*IND_STATIC_LINK((StgClosure *)q) == NULL) {
+ *IND_STATIC_LINK((StgClosure *)q) = static_objects;
+ static_objects = (StgClosure *)q;
+ }
+ RELEASE_SPIN_LOCK(&static_objects_sync);
}
return q;
case CONSTR_STATIC:
if (*STATIC_LINK(info,(StgClosure *)q) == NULL) {
- *STATIC_LINK(info,(StgClosure *)q) = static_objects;
- static_objects = (StgClosure *)q;
+ ACQUIRE_SPIN_LOCK(&static_objects_sync);
+ // re-test, after acquiring lock
+ if (*STATIC_LINK(info,(StgClosure *)q) == NULL) {
+ *STATIC_LINK(info,(StgClosure *)q) = static_objects;
+ static_objects = (StgClosure *)q;
+ }
+ RELEASE_SPIN_LOCK(&static_objects_sync);
/* I am assuming that static_objects pointers are not
* written to other objects, and thus, no need to retag. */
}
@@ -456,6 +528,9 @@ loop:
switch (info->type) {
+ case WHITEHOLE:
+ goto loop;
+
case MUT_VAR_CLEAN:
case MUT_VAR_DIRTY:
case MVAR_CLEAN:
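
The heart of the new Evac.c is the claim/forward protocol at the top of each copy routine: swap a WHITEHOLE into the source closure's header, spin if another thread already holds it, and if the closure turns out to be EVACUATED, restore the header and follow the forwarding pointer instead. A standalone model of that protocol, using GCC builtins in place of the RTS's xchg()/write_barrier(); all names here are hypothetical stand-ins:

    #include <stdint.h>

    #define WHITEHOLE ((uintptr_t)1)  /* stand-in for &stg_WHITEHOLE_info */
    #define EVACUATED ((uintptr_t)2)  /* stand-in for &stg_EVACUATED_info */

    typedef struct closure_ {
        volatile uintptr_t info;
        void *evacuee;                /* forwarding pointer once copied */
    } closure;

    void *claim_and_copy (closure *src,
                          void *(*do_copy)(closure *, uintptr_t))
    {
        uintptr_t info;
        do {
            /* atomically swap the lock in; spin while another
             * thread holds it */
            info = __sync_lock_test_and_set(&src->info, WHITEHOLE);
        } while (info == WHITEHOLE);

        if (info == EVACUATED) {       /* lost the race */
            src->info = info;          /* undo our claim... */
            return src->evacuee;       /* ...and follow the forwarder */
        }

        void *to = do_copy(src, info); /* copy payload; word 0 gets info */
        src->evacuee = to;
        __sync_synchronize();          /* evacuee visible before info */
        src->info = EVACUATED;         /* publish the forwarding pointer */
        return to;
    }

The write barrier is what makes publication safe: any thread that reads EVACUATED from the header is guaranteed to see the evacuee field written before it.
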
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 17bc2041ef..eed2da7020 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -47,6 +47,7 @@
#include "Scav.h"
#include "GCUtils.h"
#include "MarkWeak.h"
+#include "Sparks.h"
#include <string.h> // for memset()
@@ -117,12 +118,16 @@ nat mutlist_MUTVARS,
/* Thread-local data for each GC thread
*/
gc_thread *gc_threads = NULL;
-gc_thread *gct = NULL; // this thread's gct TODO: make thread-local
+// gc_thread *gct = NULL; // this thread's gct TODO: make thread-local
// For stats:
long copied; // *words* copied & scavenged during this GC
long scavd_copied; // *words* copied only during this GC
+#ifdef THREADED_RTS
+SpinLock recordMutableGen_sync;
+#endif
+
/* -----------------------------------------------------------------------------
Static function declarations
-------------------------------------------------------------------------- */
@@ -137,6 +142,11 @@ static void init_gc_thread (gc_thread *t);
static void update_task_list (void);
static void resize_generations (void);
static void resize_nursery (void);
+static void start_gc_threads (void);
+static void gc_thread_work (void);
+static nat inc_running (void);
+static nat dec_running (void);
+static void wakeup_gc_threads (nat n_threads);
#if 0 && defined(DEBUG)
static void gcCAFs (void);
@@ -227,6 +237,10 @@ GarbageCollect ( rtsBool force_major_gc )
*/
alloc_gc_threads();
+ /* Start threads, so they can be spinning up while we finish initialisation.
+ */
+ start_gc_threads();
+
/* How many threads will be participating in this GC?
* We don't try to parallelise minor GC.
*/
@@ -253,8 +267,11 @@ GarbageCollect ( rtsBool force_major_gc )
*/
static_objects = END_OF_STATIC_LIST;
scavenged_static_objects = END_OF_STATIC_LIST;
+
#ifdef THREADED_RTS
initSpinLock(&static_objects_sync);
+ initSpinLock(&recordMutableGen_sync);
+ initSpinLock(&gc_alloc_block_sync);
#endif
// Initialise all the generations/steps that we're collecting.
@@ -283,12 +300,16 @@ GarbageCollect ( rtsBool force_major_gc )
init_gc_thread(&gc_threads[t]);
}
+ // the main thread is running: this prevents any other threads from
+ // exiting prematurely, so we can start them now.
+ inc_running();
+ wakeup_gc_threads(n_threads);
+
// Initialise stats
copied = 0;
scavd_copied = 0;
- // start threads etc.
- // For now, we just have one thread, and set gct to gc_threads[0]
+ // this is the main thread
gct = &gc_threads[0];
/* -----------------------------------------------------------------------
@@ -329,25 +350,28 @@ GarbageCollect ( rtsBool force_major_gc )
* Repeatedly scavenge all the areas we know about until there's no
* more scavenging to be done.
*/
- {
- rtsBool flag;
- loop:
- flag = rtsFalse;
-
- scavenge_loop();
-
- // if any blackholes are alive, make the threads that wait on
- // them alive too.
- if (traverseBlackholeQueue())
- flag = rtsTrue;
-
- if (flag) { goto loop; }
+ for (;;)
+ {
+ gc_thread_work();
+ // The other threads are now stopped. We might recurse back to
+ // here, but from now on this is the only thread.
+
+ // if any blackholes are alive, make the threads that wait on
+ // them alive too.
+ if (traverseBlackholeQueue()) {
+ inc_running();
+ continue;
+ }
+
+ // must be last... invariant is that everything is fully
+ // scavenged at this point.
+ if (traverseWeakPtrList()) { // returns rtsTrue if evaced something
+ inc_running();
+ continue;
+ }
- // must be last... invariant is that everything is fully
- // scavenged at this point.
- if (traverseWeakPtrList()) { // returns rtsTrue if evaced something
- goto loop;
- }
+ // If we get to here, there's really nothing left to do.
+ break;
}
// Update pointers from the Task list
@@ -400,7 +424,8 @@ GarbageCollect ( rtsBool force_major_gc )
ws = &thr->steps[g][s];
if (g==0 && s==0) continue;
- ASSERT( ws->scan_bd == ws->todo_bd );
+ // Not true?
+ // ASSERT( ws->scan_bd == ws->todo_bd );
ASSERT( ws->scan_bd ? ws->scan == ws->scan_bd->free : 1 );
// Push the final block
@@ -679,25 +704,6 @@ GetRoots( evac_fn evac )
Capability *cap;
Task *task;
-#if defined(GRAN)
- for (i=0; i<=RtsFlags.GranFlags.proc; i++) {
- if ((run_queue_hds[i] != END_TSO_QUEUE) && ((run_queue_hds[i] != NULL)))
- evac((StgClosure **)&run_queue_hds[i]);
- if ((run_queue_tls[i] != END_TSO_QUEUE) && ((run_queue_tls[i] != NULL)))
- evac((StgClosure **)&run_queue_tls[i]);
-
- if ((blocked_queue_hds[i] != END_TSO_QUEUE) && ((blocked_queue_hds[i] != NULL)))
- evac((StgClosure **)&blocked_queue_hds[i]);
- if ((blocked_queue_tls[i] != END_TSO_QUEUE) && ((blocked_queue_tls[i] != NULL)))
- evac((StgClosure **)&blocked_queue_tls[i]);
- if ((ccalling_threadss[i] != END_TSO_QUEUE) && ((ccalling_threadss[i] != NULL)))
- evac((StgClosure **)&ccalling_threads[i]);
- }
-
- markEventQueue();
-
-#else /* !GRAN */
-
for (i = 0; i < n_capabilities; i++) {
cap = &capabilities[i];
evac((StgClosure **)(void *)&cap->run_queue_hd);
@@ -715,17 +721,15 @@ GetRoots( evac_fn evac )
}
-
#if !defined(THREADED_RTS)
evac((StgClosure **)(void *)&blocked_queue_hd);
evac((StgClosure **)(void *)&blocked_queue_tl);
evac((StgClosure **)(void *)&sleeping_queue);
#endif
-#endif
// evac((StgClosure **)&blackhole_queue);
-#if defined(THREADED_RTS) || defined(PARALLEL_HASKELL) || defined(GRAN)
+#if defined(THREADED_RTS)
markSparkQueue(evac);
#endif
@@ -856,6 +860,14 @@ alloc_gc_thread (gc_thread *t, int n)
nat g, s;
step_workspace *ws;
+#ifdef THREADED_RTS
+ t->id = 0;
+ initCondition(&t->wake_cond);
+ initMutex(&t->wake_mutex);
+ t->wakeup = rtsFalse;
+ t->exit = rtsFalse;
+#endif
+
t->thread_index = n;
t->free_blocks = NULL;
t->gc_count = 0;
@@ -897,7 +909,6 @@ alloc_gc_threads (void)
if (gc_threads == NULL) {
#if defined(THREADED_RTS)
nat i;
-
gc_threads = stgMallocBytes (RtsFlags.ParFlags.gcThreads *
sizeof(gc_thread),
"alloc_gc_threads");
@@ -915,6 +926,146 @@ alloc_gc_threads (void)
}
/* ----------------------------------------------------------------------------
+ Start GC threads
+ ------------------------------------------------------------------------- */
+
+static nat gc_running_threads;
+
+#if defined(THREADED_RTS)
+static Mutex gc_running_mutex;
+#endif
+
+static nat
+inc_running (void)
+{
+ nat n_running;
+ ACQUIRE_LOCK(&gc_running_mutex);
+ n_running = ++gc_running_threads;
+ RELEASE_LOCK(&gc_running_mutex);
+ return n_running;
+}
+
+static nat
+dec_running (void)
+{
+ nat n_running;
+ ACQUIRE_LOCK(&gc_running_mutex);
+ n_running = --gc_running_threads;
+ RELEASE_LOCK(&gc_running_mutex);
+ return n_running;
+}
+
+//
+// gc_thread_work(): Scavenge until there's no work left to do and all
+// the running threads are idle.
+//
+static void
+gc_thread_work (void)
+{
+ nat r;
+
+ debugTrace(DEBUG_gc, "GC thread %d working", gct->thread_index);
+
+ // gc_running_threads has already been incremented for us; either
+ // this is the main thread and we incremented it inside
+ // GarbageCollect(), or this is a worker thread and the main
+ // thread bumped gc_running_threads before waking us up.
+
+loop:
+ scavenge_loop();
+ // scavenge_loop() only exits when there's no work to do
+ r = dec_running();
+
+ debugTrace(DEBUG_gc, "GC thread %d idle (%d still running)",
+ gct->thread_index, r);
+
+ while (gc_running_threads != 0) {
+ if (any_work()) {
+ inc_running();
+ goto loop;
+ }
+ // any_work() does not remove the work from the queue, it
+ // just checks for the presence of work. If we find any,
+ // then we increment gc_running_threads and go back to
+ // scavenge_loop() to perform any pending work.
+ }
+
+ // All threads are now stopped
+ debugTrace(DEBUG_gc, "GC thread %d finished.", gct->thread_index);
+}
+
+
+#if defined(THREADED_RTS)
+static void
+gc_thread_mainloop (void)
+{
+ while (!gct->exit) {
+
+ // Wait until we're told to wake up
+ ACQUIRE_LOCK(&gct->wake_mutex);
+ while (!gct->wakeup) {
+ debugTrace(DEBUG_gc, "GC thread %d standing by...",
+ gct->thread_index);
+ waitCondition(&gct->wake_cond, &gct->wake_mutex);
+ }
+ RELEASE_LOCK(&gct->wake_mutex);
+ gct->wakeup = rtsFalse;
+ if (gct->exit) break;
+
+ gc_thread_work();
+ }
+}
+#endif
+
+#if defined(THREADED_RTS)
+static void
+gc_thread_entry (gc_thread *my_gct)
+{
+ gct = my_gct;
+ debugTrace(DEBUG_gc, "GC thread %d starting...", gct->thread_index);
+ gct->id = osThreadId();
+ gc_thread_mainloop();
+}
+#endif
+
+static void
+start_gc_threads (void)
+{
+#if defined(THREADED_RTS)
+ nat i;
+ OSThreadId id;
+ static rtsBool done = rtsFalse;
+
+ gc_running_threads = 0;
+ initMutex(&gc_running_mutex);
+
+ if (!done) {
+ // Start from 1: the main thread is 0
+ for (i = 1; i < RtsFlags.ParFlags.gcThreads; i++) {
+ createOSThread(&id, (OSThreadProc*)&gc_thread_entry,
+ &gc_threads[i]);
+ }
+ done = rtsTrue;
+ }
+#endif
+}
+
+static void
+wakeup_gc_threads (nat n_threads USED_IF_THREADS)
+{
+#if defined(THREADED_RTS)
+ nat i;
+ for (i=1; i < n_threads; i++) {
+ inc_running();
+ ACQUIRE_LOCK(&gc_threads[i].wake_mutex);
+ gc_threads[i].wakeup = rtsTrue;
+ signalCondition(&gc_threads[i].wake_cond);
+ RELEASE_LOCK(&gc_threads[i].wake_mutex);
+ }
+#endif
+}
+
+/* ----------------------------------------------------------------------------
Initialise a generation that is to be collected
------------------------------------------------------------------------- */
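
The worker wakeup path in gc_thread_mainloop()/wakeup_gc_threads() is the classic mutex-plus-condition-variable handshake with a flag to guard against spurious wakeups. The same pattern in bare pthreads, for reference (names hypothetical; the RTS wraps these in its own Mutex/Condition types):

    #include <pthread.h>
    #include <stdbool.h>

    typedef struct {
        pthread_mutex_t wake_mutex;
        pthread_cond_t  wake_cond;
        bool wakeup;
        bool exit;
    } worker;

    static void worker_mainloop (worker *w)
    {
        for (;;) {
            pthread_mutex_lock(&w->wake_mutex);
            while (!w->wakeup)   /* flag guards against spurious wakeups */
                pthread_cond_wait(&w->wake_cond, &w->wake_mutex);
            w->wakeup = false;
            pthread_mutex_unlock(&w->wake_mutex);
            if (w->exit) break;
            /* ... one GC's worth of work ... */
        }
    }

    static void wake (worker *w)
    {
        pthread_mutex_lock(&w->wake_mutex);
        w->wakeup = true;                    /* set the flag... */
        pthread_cond_signal(&w->wake_cond);  /* ...signal under the lock */
        pthread_mutex_unlock(&w->wake_mutex);
    }
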
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 69fdc10447..013a901329 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -110,6 +110,10 @@ typedef struct step_workspace_ {
typedef struct gc_thread_ {
#ifdef THREADED_RTS
OSThreadId id; // The OS thread that this struct belongs to
+ Mutex wake_mutex;
+ Condition wake_cond; // So we can go to sleep between GCs
+ rtsBool wakeup;
+ rtsBool exit;
#endif
nat thread_index; // a zero based index identifying the thread
@@ -148,7 +152,8 @@ extern nat N;
extern rtsBool major_gc;
extern gc_thread *gc_threads;
-extern gc_thread *gct; // this thread's gct TODO: make thread-local
+register gc_thread *gct __asm__("%rbx");
+// extern gc_thread *gct; // this thread's gct TODO: make thread-local
extern StgClosure* static_objects;
extern StgClosure* scavenged_static_objects;
@@ -165,6 +170,10 @@ extern StgPtr oldgen_scan;
extern long copied;
extern long scavd_copied;
+#ifdef THREADED_RTS
+extern SpinLock static_objects_sync;
+#endif
+
#ifdef DEBUG
extern nat mutlist_MUTVARS, mutlist_MUTARRS, mutlist_MVARS, mutlist_OTHERS;
#endif
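
Pinning gct to %rbx is a gcc global-register-variable trick: every access to the GC thread's state becomes a register read with no memory load, but it is x86-64/gcc specific and permanently steals a callee-saved register from the compiler. The commented-out extern shows the global this replaces; a thread-local variable is the other obvious fallback. A hypothetical sketch of the portable arrangement (the patch itself only supports the register form):

    #if defined(__x86_64__) && defined(__GNUC__)
    register gc_thread *gct __asm__("%rbx");  /* no load, not portable */
    #else
    extern __thread gc_thread *gct;           /* one copy per OS thread */
    #endif
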
diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c
index 2a6ea389e6..8d7e5825ee 100644
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -1494,7 +1494,7 @@ scavenge_mutable_list(generation *gen)
static void
scavenge_static(void)
{
- StgClosure* p = static_objects;
+ StgClosure* p;
const StgInfoTable *info;
/* Always evacuate straight to the oldest generation for static
@@ -1503,13 +1503,26 @@ scavenge_static(void)
/* keep going until we've scavenged all the objects on the linked
list... */
- while (p != END_OF_STATIC_LIST) {
+ while (1) {
+
+ ACQUIRE_SPIN_LOCK(&static_objects_sync);
+
+ /* get the next static object from the list. Remember, there might
+ * be more stuff on this list after each evacuation...
+ * (static_objects is a global)
+ */
+ p = static_objects;
+ if (p == END_OF_STATIC_LIST) {
+ RELEASE_SPIN_LOCK(&static_objects_sync);
+ break;
+ }
+
ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
info = get_itbl(p);
/*
- if (info->type==RBH)
- info = REVERT_INFOPTR(info); // if it's an RBH, look at the orig closure
+ if (info->type==RBH)
+ info = REVERT_INFOPTR(info); // if it's an RBH, look at the orig closure
*/
// make sure the info pointer is into text space
@@ -1520,6 +1533,8 @@ scavenge_static(void)
*STATIC_LINK(info,p) = scavenged_static_objects;
scavenged_static_objects = p;
+ RELEASE_SPIN_LOCK(&static_objects_sync);
+
switch (info -> type) {
case IND_STATIC:
@@ -1564,12 +1579,6 @@ scavenge_static(void)
}
ASSERT(gct->failed_to_evac == rtsFalse);
-
- /* get the next static object from the list. Remember, there might
- * be more stuff on this list now that we've done some evacuating!
- * (static_objects is a global)
- */
- p = static_objects;
}
}
@@ -1947,3 +1956,39 @@ loop:
if (work_to_do) goto loop;
}
+
+rtsBool
+any_work (void)
+{
+ int g, s;
+ step_workspace *ws;
+
+ write_barrier();
+
+ // scavenge static objects
+ if (major_gc && static_objects != END_OF_STATIC_LIST) {
+ return rtsTrue;
+ }
+
+ // scavenge objects in compacted generation
+ if (mark_stack_overflowed || oldgen_scan_bd != NULL ||
+ (mark_stack_bdescr != NULL && !mark_stack_empty())) {
+ return rtsTrue;
+ }
+
+ // Check for global work in any step. We don't need to check for
+ // local work, because we have already exited scavenge_loop(),
+ // which means there is no local work for this thread.
+ for (g = RtsFlags.GcFlags.generations; --g >= 0; ) {
+ for (s = generations[g].n_steps; --s >= 0; ) {
+ if (g == 0 && s == 0 && RtsFlags.GcFlags.generations > 1) {
+ continue;
+ }
+ ws = &gct->steps[g][s];
+ if (ws->todo_large_objects) return rtsTrue;
+ if (ws->stp->todos) return rtsTrue;
+ }
+ }
+
+ return rtsFalse;
+}
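
any_work() is the other half of the termination protocol begun by gc_thread_work() in GC.c: an idle thread decrements the running count and then polls; if it spots global work it re-registers with inc_running() before scavenging, so the count can only reach zero when every queue is empty and every thread is idle. A condensed standalone model of the loop (names hypothetical):

    #include <stdbool.h>

    extern int  running;                 /* gc_running_threads, mutex-protected */
    extern void inc_running (void);
    extern void dec_running (void);
    extern bool queues_nonempty (void);  /* plays the role of any_work() */
    extern void scavenge (void);         /* plays the role of scavenge_loop() */

    static void thread_work (void)
    {
        bool rejoined;
        for (;;) {
            scavenge();                  /* returns when local work is done */
            dec_running();
            rejoined = false;
            while (running != 0) {
                if (queues_nonempty()) { /* saw global work: rejoin first */
                    inc_running();
                    rejoined = true;
                    break;
                }
            }
            if (!rejoined) return;       /* count hit zero: all threads idle */
        }
    }
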
diff --git a/rts/sm/Scav.h b/rts/sm/Scav.h
index ab66775641..f149b667ac 100644
--- a/rts/sm/Scav.h
+++ b/rts/sm/Scav.h
@@ -11,5 +11,6 @@
*
* ---------------------------------------------------------------------------*/
-void scavenge_loop (void);
-void scavenge_mutable_list (generation *g);
+void scavenge_loop (void);
+rtsBool any_work (void);
+void scavenge_mutable_list (generation *g);