3 files changed, 1372 insertions, 1353 deletions
diff --git a/rts/RetainerProfile.c b/rts/RetainerProfile.c
index 301f712e59..6f053c09c4 100644
--- a/rts/RetainerProfile.c
+++ b/rts/RetainerProfile.c
@@ -66,41 +66,6 @@ static uint32_t numObjectVisited;    // total number of objects visited
 static uint32_t timesAnyObjectVisited;  // number of times any objects are
                                         // visited
 
-/** Note [Profiling heap traversal visited bit]
- *
- * If the RTS is compiled with profiling enabled StgProfHeader can be used by
- * profiling code to store per-heap object information.
- *
- * The generic heap traversal code reserves the least significant bit of the
- * largest members of the 'trav' union to decide whether we've already visited a
- * given closure in the current pass or not. The rest of the field is free to be
- * used by the calling profiler.
- *
- * By doing things this way we implicitly assume that the LSB of the largest
- * field in the 'trav' union is insignificant. This is true at least for the
- * word aligned pointers which the retainer profiler currently stores there and
- * should be maintained by new users of the 'trav' union for example by shifting
- * the real data up by one bit.
- *
- * Since we don't want to have to scan the entire heap a second time just to
- * reset the per-object visitied bit before/after the real traversal we make the
- * interpretation of this bit dependent on the value of a global variable,
- * 'flip'.
- *
- * When the 'trav' bit is equal to the value of 'flip' the closure data is
- * valid otherwise not (see isTravDataValid). We then invert the value of 'flip'
- * on each heap traversal (see traverseWorkStack), in effect marking all
- * closure's data as invalid at once.
- *
- * There are some complications with this approach, namely: static objects and
- * mutable data. There we do just go over all existing objects to reset the bit
- * manually. See 'resetStaticObjectForProfiling' and 'computeRetainerSet'.
- */
-StgWord flip = 0;
-
-#define setTravDataToZero(c) \
-  (c)->header.prof.hp.trav.lsb = flip
-
 /* -----------------------------------------------------------------------------
  * Retainer stack - header
  *   Note:
@@ -111,181 +76,8 @@ StgWord flip = 0;
  *     all.
  * -------------------------------------------------------------------------- */
 
-typedef enum {
-    // Object with fixed layout. Keeps an information about that
-    // element was processed. (stackPos.next.step)
-    posTypeStep,
-    // Description of the pointers-first heap object. Keeps information
-    // about layout. (stackPos.next.ptrs)
-    posTypePtrs,
-    // Keeps SRT bitmap (stackPos.next.srt)
-    posTypeSRT,
-    // Keeps a new object that was not inspected yet. Keeps a parent
-    // element (stackPos.next.parent)
-    posTypeFresh
-} nextPosType;
-
-typedef union {
-    // fixed layout or layout specified by a field in the closure
-    StgWord step;
-
-    // layout.payload
-    struct {
-        // See StgClosureInfo in InfoTables.h
-        StgHalfWord pos;
-        StgHalfWord ptrs;
-        StgPtr payload;
-    } ptrs;
-
-    // SRT
-    struct {
-        StgClosure *srt;
-    } srt;
-} nextPos;
-
-/**
- * Position pointer into a closure. Determines what the next element to return
- * for a stackElement is.
- */
-typedef struct {
-    nextPosType type;
-    nextPos next;
-} stackPos;
-
-/**
- * An element of the traversal work-stack. Besides the closure itself this also
- * stores it's parent and associated data.
- *
- * When 'info.type == posTypeFresh' a 'stackElement' represents just one
- * closure, namely 'c' and 'cp' being it's parent. Otherwise 'info' specifies an
- * offset into the children of 'c'. This is to support returning a closure's
- * children one-by-one without pushing one element per child onto the stack. See
- * traversePushChildren() and traversePop().
- *
- */
-typedef struct stackElement_ {
-    stackPos info;
-    StgClosure *c;
-    StgClosure *cp; // parent of 'c'. Only used when info.type == posTypeFresh.
-    stackData data;
-} stackElement;
-
 traverseState g_retainerTraverseState;
 
-
-#if defined(DEBUG)
-unsigned int g_traversalDebugLevel = 0;
-static inline void debug(const char *s, ...)
-{
-    va_list ap;
-
-    if(g_traversalDebugLevel == 0)
-        return;
-
-    va_start(ap,s);
-    vdebugBelch(s, ap);
-    va_end(ap);
-}
-#else
-#define debug(...)
-#endif
-
-// number of blocks allocated for one stack
-#define BLOCKS_IN_STACK 1
-
-/* -----------------------------------------------------------------------------
- * Add a new block group to the stack.
- * Invariants:
- *  currentStack->link == s.
- * -------------------------------------------------------------------------- */
-STATIC_INLINE void
-newStackBlock( traverseState *ts, bdescr *bd )
-{
-    ts->currentStack = bd;
-    ts->stackTop     = (stackElement *)(bd->start + BLOCK_SIZE_W * bd->blocks);
-    ts->stackBottom  = (stackElement *)bd->start;
-    ts->stackLimit   = (stackElement *)ts->stackTop;
-    bd->free     = (StgPtr)ts->stackLimit;
-}
-
-/* -----------------------------------------------------------------------------
- * Return to the previous block group.
- * Invariants:
- *   s->link == currentStack.
- * -------------------------------------------------------------------------- */
-STATIC_INLINE void
-returnToOldStack( traverseState *ts, bdescr *bd )
-{
-    ts->currentStack = bd;
-    ts->stackTop = (stackElement *)bd->free;
-    ts->stackBottom = (stackElement *)bd->start;
-    ts->stackLimit = (stackElement *)(bd->start + BLOCK_SIZE_W * bd->blocks);
-    bd->free = (StgPtr)ts->stackLimit;
-}
-
-/**
- *  Initializes the traversal work-stack.
- */
-void
-initializeTraverseStack( traverseState *ts )
-{
-    if (ts->firstStack != NULL) {
-        freeChain(ts->firstStack);
-    }
-
-    ts->firstStack = allocGroup(BLOCKS_IN_STACK);
-    ts->firstStack->link = NULL;
-    ts->firstStack->u.back = NULL;
-
-    ts->stackSize = 0;
-    ts->maxStackSize = 0;
-
-    newStackBlock(ts, ts->firstStack);
-}
-
-/**
- * Frees all the block groups in the traversal works-stack.
- *
- * Invariants:
- *   firstStack != NULL
- */
-void
-closeTraverseStack( traverseState *ts )
-{
-    freeChain(ts->firstStack);
-    ts->firstStack = NULL;
-}
-
-int
-getTraverseStackMaxSize(traverseState *ts)
-{
-    return ts->maxStackSize;
-}
-
-/* -----------------------------------------------------------------------------
- * Returns true if the whole stack is empty.
- * -------------------------------------------------------------------------- */
-STATIC_INLINE bool
-isEmptyWorkStack( traverseState *ts )
-{
-    return (ts->firstStack == ts->currentStack) && ts->stackTop == ts->stackLimit;
-}
-
-/* -----------------------------------------------------------------------------
- * Returns size of stack
- * -------------------------------------------------------------------------- */
-W_
-traverseWorkStackBlocks(traverseState *ts)
-{
-    bdescr* bd;
-    W_ res = 0;
-
-    for (bd = ts->firstStack; bd != NULL; bd = bd->link)
-      res += bd->blocks;
-
-    return res;
-}
-
 W_
 retainerStackBlocks(void)
 {
@@ -293,648 +85,6 @@ retainerStackBlocks(void)
 }
 
 /* -----------------------------------------------------------------------------
- * Initializes *info from ptrs and payload.
- * Invariants:
- *   payload[] begins with ptrs pointers followed by non-pointers.
- * -------------------------------------------------------------------------- */
-STATIC_INLINE void
-init_ptrs( stackPos *info, uint32_t ptrs, StgPtr payload )
-{
-    info->type              = posTypePtrs;
-    info->next.ptrs.pos     = 0;
-    info->next.ptrs.ptrs    = ptrs;
-    info->next.ptrs.payload = payload;
-}
-
-/* -----------------------------------------------------------------------------
- * Find the next object from *info.
- * -------------------------------------------------------------------------- */
-STATIC_INLINE StgClosure *
-find_ptrs( stackPos *info )
-{
-    if (info->next.ptrs.pos < info->next.ptrs.ptrs) {
-        return (StgClosure *)info->next.ptrs.payload[info->next.ptrs.pos++];
-    } else {
-        return NULL;
-    }
-}
-
-/* -----------------------------------------------------------------------------
- *  Initializes *info from SRT information stored in *infoTable.
- * -------------------------------------------------------------------------- */
-STATIC_INLINE void
-init_srt_fun( stackPos *info, const StgFunInfoTable *infoTable )
-{
-    info->type = posTypeSRT;
-    if (infoTable->i.srt) {
-        info->next.srt.srt = (StgClosure*)GET_FUN_SRT(infoTable);
-    } else {
-        info->next.srt.srt = NULL;
-    }
-}
-
-STATIC_INLINE void
-init_srt_thunk( stackPos *info, const StgThunkInfoTable *infoTable )
-{
-    info->type = posTypeSRT;
-    if (infoTable->i.srt) {
-        info->next.srt.srt = (StgClosure*)GET_SRT(infoTable);
-    } else {
-        info->next.srt.srt = NULL;
-    }
-}
-
-/* -----------------------------------------------------------------------------
- * Find the next object from *info.
- * -------------------------------------------------------------------------- */
-STATIC_INLINE StgClosure *
-find_srt( stackPos *info )
-{
-    StgClosure *c;
-    if (info->type == posTypeSRT) {
-        c = info->next.srt.srt;
-        info->next.srt.srt = NULL;
-        return c;
-    }
-
-    return NULL;
-}
-
-/**
- * Push a set of closures, represented by a single 'stackElement', onto the
- * traversal work-stack.
- */
-static void
-pushStackElement(traverseState *ts, stackElement *se)
-{
-    bdescr *nbd;      // Next Block Descriptor
-    if (ts->stackTop - 1 < ts->stackBottom) {
-        debug("pushStackElement() to the next stack.\n");
-
-        // currentStack->free is updated when the active stack is switched
-        // to the next stack.
-        ts->currentStack->free = (StgPtr)ts->stackTop;
-
-        if (ts->currentStack->link == NULL) {
-            nbd = allocGroup(BLOCKS_IN_STACK);
-            nbd->link = NULL;
-            nbd->u.back = ts->currentStack;
-            ts->currentStack->link = nbd;
-        } else
-            nbd = ts->currentStack->link;
-
-        newStackBlock(ts, nbd);
-    }
-
-    // adjust stackTop (acutal push)
-    ts->stackTop--;
-    // If the size of stackElement was huge, we would better replace the
-    // following statement by either a memcpy() call or a switch statement
-    // on the type of the element. Currently, the size of stackElement is
-    // small enough (5 words) that this direct assignment seems to be enough.
-    *ts->stackTop = *se;
-
-    ts->stackSize++;
-    if (ts->stackSize > ts->maxStackSize) ts->maxStackSize = ts->stackSize;
-    ASSERT(ts->stackSize >= 0);
-    debug("stackSize = %d\n", ts->stackSize);
-}
-
-/**
- * Push a single closure onto the traversal work-stack.
- *
- *  cp   - object's parent
- *  c    - closure
- *  data - data associated with closure.
- */
-inline void
-traversePushClosure(traverseState *ts, StgClosure *c, StgClosure *cp, stackData data) {
-    stackElement se;
-
-    se.c = c;
-    se.cp = cp;
-    se.data = data;
-    se.info.type = posTypeFresh;
-
-    pushStackElement(ts, &se);
-};
-
-/**
- * traversePushChildren() extracts the first child of 'c' in 'first_child' and
- * conceptually pushes all remaining children of 'c' onto the traversal stack
- * while associating 'data' with the pushed elements to be returned upon poping.
- *
- * If 'c' has no children, 'first_child' is set to NULL and nothing is pushed
- * onto the stack.
- *
- * If 'c' has only one child, 'first_child' is set to that child and nothing is
- * pushed onto the stack.
- *
- * Invariants:
- *
- *  - 'c' is not any of TSO, AP, PAP, AP_STACK, which means that there cannot
- *       be any stack objects.
- *
- * Note: SRTs are considered to be children as well.
- *
- * Note: When pushing onto the stack we only really push one 'stackElement'
- * representing all children onto the stack. See traversePop()
- */
-STATIC_INLINE void
-traversePushChildren(traverseState *ts, StgClosure *c, stackData data, StgClosure **first_child)
-{
-    stackElement se;
-
-    debug("traversePushChildren(): stackTop = 0x%x\n", ts->stackTop);
-
-    ASSERT(get_itbl(c)->type != TSO);
-    ASSERT(get_itbl(c)->type != AP_STACK);
-
-    //
-    // fill in se
-    //
-
-    se.c = c;
-    se.data = data;
-    // Note: se.cp ommitted on purpose, only traversePushClosure uses that.
-
-    // fill in se.info
-    switch (get_itbl(c)->type) {
-        // no child, no SRT
-    case CONSTR_0_1:
-    case CONSTR_0_2:
-    case ARR_WORDS:
-    case COMPACT_NFDATA:
-        *first_child = NULL;
-        return;
-
-        // one child (fixed), no SRT
-    case MUT_VAR_CLEAN:
-    case MUT_VAR_DIRTY:
-        *first_child = ((StgMutVar *)c)->var;
-        return;
-    case THUNK_SELECTOR:
-        *first_child = ((StgSelector *)c)->selectee;
-        return;
-    case BLACKHOLE:
-        *first_child = ((StgInd *)c)->indirectee;
-        return;
-    case CONSTR_1_0:
-    case CONSTR_1_1:
-        *first_child = c->payload[0];
-        return;
-
-        // For CONSTR_2_0 and MVAR, we use se.info.step to record the position
-        // of the next child. We do not write a separate initialization code.
-        // Also we do not have to initialize info.type;
-
-        // two children (fixed), no SRT
-        // need to push a stackElement, but nothing to store in se.info
-    case CONSTR_2_0:
-        *first_child = c->payload[0];         // return the first pointer
-        se.info.type = posTypeStep;
-        se.info.next.step = 2;            // 2 = second
-        break;
-
-        // three children (fixed), no SRT
-        // need to push a stackElement
-    case MVAR_CLEAN:
-    case MVAR_DIRTY:
-        // head must be TSO and the head of a linked list of TSOs.
-        // Shoule it be a child? Seems to be yes.
-        *first_child = (StgClosure *)((StgMVar *)c)->head;
-        se.info.type = posTypeStep;
-        se.info.next.step = 2;            // 2 = second
-        break;
-
-        // three children (fixed), no SRT
-    case WEAK:
-        *first_child = ((StgWeak *)c)->key;
-        se.info.type = posTypeStep;
-        se.info.next.step = 2;
-        break;
-
-        // layout.payload.ptrs, no SRT
-    case TVAR:
-    case CONSTR:
-    case CONSTR_NOCAF:
-    case PRIM:
-    case MUT_PRIM:
-    case BCO:
-        init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs,
-                  (StgPtr)c->payload);
-        *first_child = find_ptrs(&se.info);
-        if (*first_child == NULL)
-            return;   // no child
-        break;
-
-        // StgMutArrPtr.ptrs, no SRT
-    case MUT_ARR_PTRS_CLEAN:
-    case MUT_ARR_PTRS_DIRTY:
-    case MUT_ARR_PTRS_FROZEN_CLEAN:
-    case MUT_ARR_PTRS_FROZEN_DIRTY:
-        init_ptrs(&se.info, ((StgMutArrPtrs *)c)->ptrs,
-                  (StgPtr)(((StgMutArrPtrs *)c)->payload));
-        *first_child = find_ptrs(&se.info);
-        if (*first_child == NULL)
-            return;
-        break;
-
-        // StgMutArrPtr.ptrs, no SRT
-    case SMALL_MUT_ARR_PTRS_CLEAN:
-    case SMALL_MUT_ARR_PTRS_DIRTY:
-    case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
-    case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY:
-        init_ptrs(&se.info, ((StgSmallMutArrPtrs *)c)->ptrs,
-                  (StgPtr)(((StgSmallMutArrPtrs *)c)->payload));
-        *first_child = find_ptrs(&se.info);
-        if (*first_child == NULL)
-            return;
-        break;
-
-    // layout.payload.ptrs, SRT
-    case FUN_STATIC:
-    case FUN:           // *c is a heap object.
-    case FUN_2_0:
-        init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs, (StgPtr)c->payload);
-        *first_child = find_ptrs(&se.info);
-        if (*first_child == NULL)
-            // no child from ptrs, so check SRT
-            goto fun_srt_only;
-        break;
-
-    case THUNK:
-    case THUNK_2_0:
-        init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs,
-                  (StgPtr)((StgThunk *)c)->payload);
-        *first_child = find_ptrs(&se.info);
-        if (*first_child == NULL)
-            // no child from ptrs, so check SRT
-            goto thunk_srt_only;
-        break;
-
-        // 1 fixed child, SRT
-    case FUN_1_0:
-    case FUN_1_1:
-        *first_child = c->payload[0];
-        ASSERT(*first_child != NULL);
-        init_srt_fun(&se.info, get_fun_itbl(c));
-        break;
-
-    case THUNK_1_0:
-    case THUNK_1_1:
-        *first_child = ((StgThunk *)c)->payload[0];
-        ASSERT(*first_child != NULL);
-        init_srt_thunk(&se.info, get_thunk_itbl(c));
-        break;
-
-    case FUN_0_1:      // *c is a heap object.
-    case FUN_0_2:
-    fun_srt_only:
-        init_srt_fun(&se.info, get_fun_itbl(c));
-        *first_child = find_srt(&se.info);
-        if (*first_child == NULL)
-            return;     // no child
-        break;
-
-    // SRT only
-    case THUNK_STATIC:
-        ASSERT(get_itbl(c)->srt != 0);
-        /* fall-thru */
-    case THUNK_0_1:
-    case THUNK_0_2:
-    thunk_srt_only:
-        init_srt_thunk(&se.info, get_thunk_itbl(c));
-        *first_child = find_srt(&se.info);
-        if (*first_child == NULL)
-            return;     // no child
-        break;
-
-    case TREC_CHUNK:
-        *first_child = (StgClosure *)((StgTRecChunk *)c)->prev_chunk;
-        se.info.type = posTypeStep;
-        se.info.next.step = 0;  // entry no.
-        break;
-
-        // cannot appear
-    case PAP:
-    case AP:
-    case AP_STACK:
-    case TSO:
-    case STACK:
-    case IND_STATIC:
-        // stack objects
-    case UPDATE_FRAME:
-    case CATCH_FRAME:
-    case UNDERFLOW_FRAME:
-    case STOP_FRAME:
-    case RET_BCO:
-    case RET_SMALL:
-    case RET_BIG:
-        // invalid objects
-    case IND:
-    case INVALID_OBJECT:
-    default:
-        barf("Invalid object *c in push(): %d", get_itbl(c)->type);
-        return;
-    }
-
-    // se.cp has to be initialized when type==posTypeFresh. We don't do that
-    // here though. So type must be !=posTypeFresh.
-    ASSERT(se.info.type != posTypeFresh);
-
-    pushStackElement(ts, &se);
-}
-
-/**
- *  popStackElement(): Remove a depleted stackElement from the top of the
- *  traversal work-stack.
- *
- *  Invariants:
- *    stackTop cannot be equal to stackLimit unless the whole stack is
- *    empty, in which case popStackElement() is not allowed.
- */
-static void
-popStackElement(traverseState *ts) {
-    debug("popStackElement(): stackTop = 0x%x\n", ts->stackTop);
-
-    ASSERT(ts->stackTop != ts->stackLimit);
-    ASSERT(!isEmptyWorkStack(ts));
-
-    // <= (instead of <) is wrong!
-    if (ts->stackTop + 1 < ts->stackLimit) {
-        ts->stackTop++;
-
-        ts->stackSize--;
-        if (ts->stackSize > ts->maxStackSize) ts->maxStackSize = ts->stackSize;
-        ASSERT(ts->stackSize >= 0);
-        debug("stackSize = (--) %d\n", ts->stackSize);
-
-        return;
-    }
-
-    bdescr *pbd;    // Previous Block Descriptor
-
-    debug("popStackElement() to the previous stack.\n");
-
-    ASSERT(ts->stackTop + 1 == ts->stackLimit);
-    ASSERT(ts->stackBottom == (stackElement *)ts->currentStack->start);
-
-    if (ts->firstStack == ts->currentStack) {
-        // The stack is completely empty.
-        ts->stackTop++;
-        ASSERT(ts->stackTop == ts->stackLimit);
-
-        ts->stackSize--;
-        if (ts->stackSize > ts->maxStackSize) ts->maxStackSize = ts->stackSize;
-        ASSERT(ts->stackSize >= 0);
-        debug("stackSize = %d\n", ts->stackSize);
-
-        return;
-    }
-
-    // currentStack->free is updated when the active stack is switched back
-    // to the previous stack.
-    ts->currentStack->free = (StgPtr)ts->stackLimit;
-
-    // find the previous block descriptor
-    pbd = ts->currentStack->u.back;
-    ASSERT(pbd != NULL);
-
-    returnToOldStack(ts, pbd);
-
-    ts->stackSize--;
-    if (ts->stackSize > ts->maxStackSize) ts->maxStackSize = ts->stackSize;
-    ASSERT(ts->stackSize >= 0);
-    debug("stackSize = %d\n", ts->stackSize);
-}
-
-/**
- *  Finds the next object to be considered for retainer profiling and store
- *  its pointer to *c.
- *
- *  If the unprocessed object was stored in the stack (posTypeFresh), the
- *  this object is returned as-is. Otherwise Test if the topmost stack
- *  element indicates that more objects are left,
- *  and if so, retrieve the first object and store its pointer to *c. Also,
- *  set *cp and *data appropriately, both of which are stored in the stack
- *  element.  The topmost stack element then is overwritten so as for it to now
- *  denote the next object.
- *
- *  If the topmost stack element indicates no more objects are left, pop
- *  off the stack element until either an object can be retrieved or
- *  the work-stack becomes empty, indicated by true returned by
- *  isEmptyWorkStack(), in which case *c is set to NULL.
- *
- *  Note:
- *
- *    It is okay to call this function even when the work-stack is empty.
- */
-STATIC_INLINE void
-traversePop(traverseState *ts, StgClosure **c, StgClosure **cp, stackData *data)
-{
-    stackElement *se;
-
-    debug("traversePop(): stackTop = 0x%x\n", ts->stackTop);
-
-    // Is this the last internal element? If so instead of modifying the current
-    // stackElement in place we actually remove it from the stack.
-    bool last = false;
-
-    do {
-        if (isEmptyWorkStack(ts)) {
-            *c = NULL;
-            return;
-        }
-
-        // Note: Below every `break`, where the loop condition is true, must be
-        // accompanied by a popStackElement() otherwise this is an infinite
-        // loop.
-        se = ts->stackTop;
-
-        // If this is a top-level element, you should pop that out.
-        if (se->info.type == posTypeFresh) {
-            *cp = se->cp;
-            *c = se->c;
-            *data = se->data;
-            popStackElement(ts);
-            return;
-        }
-
-        // Note: The first ptr of all of these was already returned as
-        // *fist_child in push(), so we always start with the second field.
-        switch (get_itbl(se->c)->type) {
-            // two children (fixed), no SRT
-            // nothing in se.info
-        case CONSTR_2_0:
-            *c = se->c->payload[1];
-            last = true;
-            goto out;
-
-            // three children (fixed), no SRT
-            // need to push a stackElement
-        case MVAR_CLEAN:
-        case MVAR_DIRTY:
-            if (se->info.next.step == 2) {
-                *c = (StgClosure *)((StgMVar *)se->c)->tail;
-                se->info.next.step++;             // move to the next step
-                // no popStackElement
-            } else {
-                *c = ((StgMVar *)se->c)->value;
-                last = true;
-            }
-            goto out;
-
-            // three children (fixed), no SRT
-        case WEAK:
-            if (se->info.next.step == 2) {
-                *c = ((StgWeak *)se->c)->value;
-                se->info.next.step++;
-                // no popStackElement
-            } else {
-                *c = ((StgWeak *)se->c)->finalizer;
-                last = true;
-            }
-            goto out;
-
-        case TREC_CHUNK: {
-            // These are pretty complicated: we have N entries, each
-            // of which contains 3 fields that we want to follow.  So
-            // we divide the step counter: the 2 low bits indicate
-            // which field, and the rest of the bits indicate the
-            // entry number (starting from zero).
-            TRecEntry *entry;
-            uint32_t entry_no = se->info.next.step >> 2;
-            uint32_t field_no = se->info.next.step & 3;
-            if (entry_no == ((StgTRecChunk *)se->c)->next_entry_idx) {
-                *c = NULL;
-                popStackElement(ts);
-                break; // this breaks out of the switch not the loop
-            }
-            entry = &((StgTRecChunk *)se->c)->entries[entry_no];
-            if (field_no == 0) {
-                *c = (StgClosure *)entry->tvar;
-            } else if (field_no == 1) {
-                *c = entry->expected_value;
-            } else {
-                *c = entry->new_value;
-            }
-            se->info.next.step++;
-            goto out;
-        }
-
-        case TVAR:
-        case CONSTR:
-        case PRIM:
-        case MUT_PRIM:
-        case BCO:
-            // StgMutArrPtr.ptrs, no SRT
-        case MUT_ARR_PTRS_CLEAN:
-        case MUT_ARR_PTRS_DIRTY:
-        case MUT_ARR_PTRS_FROZEN_CLEAN:
-        case MUT_ARR_PTRS_FROZEN_DIRTY:
-        case SMALL_MUT_ARR_PTRS_CLEAN:
-        case SMALL_MUT_ARR_PTRS_DIRTY:
-        case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
-        case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY:
-            *c = find_ptrs(&se->info);
-            if (*c == NULL) {
-                popStackElement(ts);
-                break; // this breaks out of the switch not the loop
-            }
-            goto out;
-
-            // layout.payload.ptrs, SRT
-        case FUN:         // always a heap object
-        case FUN_STATIC:
-        case FUN_2_0:
-            if (se->info.type == posTypePtrs) {
-                *c = find_ptrs(&se->info);
-                if (*c != NULL) {
-                    goto out;
-                }
-                init_srt_fun(&se->info, get_fun_itbl(se->c));
-            }
-            goto do_srt;
-
-        case THUNK:
-        case THUNK_2_0:
-            if (se->info.type == posTypePtrs) {
-                *c = find_ptrs(&se->info);
-                if (*c != NULL) {
-                    goto out;
-                }
-                init_srt_thunk(&se->info, get_thunk_itbl(se->c));
-            }
-            goto do_srt;
-
-            // SRT
-        do_srt:
-        case THUNK_STATIC:
-        case FUN_0_1:
-        case FUN_0_2:
-        case THUNK_0_1:
-        case THUNK_0_2:
-        case FUN_1_0:
-        case FUN_1_1:
-        case THUNK_1_0:
-        case THUNK_1_1:
-            *c = find_srt(&se->info);
-            if(*c == NULL) {
-                popStackElement(ts);
-                break; // this breaks out of the switch not the loop
-            }
-            goto out;
-
-            // no child (fixed), no SRT
-        case CONSTR_0_1:
-        case CONSTR_0_2:
-        case ARR_WORDS:
-            // one child (fixed), no SRT
-        case MUT_VAR_CLEAN:
-        case MUT_VAR_DIRTY:
-        case THUNK_SELECTOR:
-        case CONSTR_1_1:
-            // cannot appear
-        case PAP:
-        case AP:
-        case AP_STACK:
-        case TSO:
-        case STACK:
-        case IND_STATIC:
-        case CONSTR_NOCAF:
-            // stack objects
-        case UPDATE_FRAME:
-        case CATCH_FRAME:
-        case UNDERFLOW_FRAME:
-        case STOP_FRAME:
-        case RET_BCO:
-        case RET_SMALL:
-        case RET_BIG:
-            // invalid objects
-        case IND:
-        case INVALID_OBJECT:
-        default:
-            barf("Invalid object *c in traversePop(): %d", get_itbl(se->c)->type);
-            return;
-        }
-    } while (*c == NULL);
-
-out:
-
-    ASSERT(*c != NULL);
-
-    *cp = se->c;
-    *data = se->data;
-
-    if(last)
-        popStackElement(ts);
-
-    return;
-
-}
-
-/* -----------------------------------------------------------------------------
  * RETAINER PROFILING ENGINE
  * -------------------------------------------------------------------------- */
 
@@ -954,22 +104,6 @@ endRetainerProfiling( void )
     outputAllRetainerSet(prof_file);
 }
 
-/**
- * Make sure a closure's profiling data is initialized to zero if it does not
- * conform to the current value of the flip bit, returns true in this case.
- *
- * See Note [Profiling heap traversal visited bit].
- */
-bool
-traverseMaybeInitClosureData(StgClosure *c)
-{
-    if (!isTravDataValid(c)) {
-        setTravDataToZero(c);
-        return true;
-    }
-    return false;
-}
-
 /* -----------------------------------------------------------------------------
  * Returns true if *c is a retainer.
  * In general the retainers are the objects that may be the roots of the
@@ -1120,214 +254,6 @@ associate( StgClosure *c, RetainerSet *s )
     RSET(c) = (RetainerSet *)((StgWord)s | flip);
 }
 
-/* -----------------------------------------------------------------------------
-   Call traversePushClosure for each of the closures covered by a large bitmap.
-   -------------------------------------------------------------------------- */
-
-static void
-traverseLargeBitmap (traverseState *ts, StgPtr p, StgLargeBitmap *large_bitmap,
-                     uint32_t size, StgClosure *c, stackData data)
-{
-    uint32_t i, b;
-    StgWord bitmap;
-
-    b = 0;
-    bitmap = large_bitmap->bitmap[b];
-    for (i = 0; i < size; ) {
-        if ((bitmap & 1) == 0) {
-            traversePushClosure(ts, (StgClosure *)*p, c, data);
-        }
-        i++;
-        p++;
-        if (i % BITS_IN(W_) == 0) {
-            b++;
-            bitmap = large_bitmap->bitmap[b];
-        } else {
-            bitmap = bitmap >> 1;
-        }
-    }
-}
-
-STATIC_INLINE StgPtr
-traverseSmallBitmap (traverseState *ts, StgPtr p, uint32_t size, StgWord bitmap,
-                     StgClosure *c, stackData data)
-{
-    while (size > 0) {
-        if ((bitmap & 1) == 0) {
-            traversePushClosure(ts, (StgClosure *)*p, c, data);
-        }
-        p++;
-        bitmap = bitmap >> 1;
-        size--;
-    }
-    return p;
-}
-
-/**
- *  traversePushStack(ts, cp, data, stackStart, stackEnd) pushes all the objects
- *  in the STG stack-chunk from stackStart to stackEnd onto the traversal
- *  work-stack with 'c' and 'data' being their parent and associated data,
- *  respectively.
- *
- *  Invariants:
- *
- *    *cp is one of the following: TSO, AP_STACK.
- *
- *    stackStart < stackEnd.
- *
- *    If *c is TSO, its state is not ThreadComplete,or ThreadKilled,
- *    which means that its stack is ready to process.
- *
- *  Note:
- *
- *    This code was almost plagiarzied from GC.c! For each pointer,
- *    traversePushClosure() is invoked instead of evacuate().
- */
-static void
-traversePushStack(traverseState *ts, StgClosure *cp, stackData data,
-                  StgPtr stackStart, StgPtr stackEnd)
-{
-    StgPtr p;
-    const StgRetInfoTable *info;
-    StgWord bitmap;
-    uint32_t size;
-
-    ASSERT(get_itbl(cp)->type == STACK);
-
-    p = stackStart;
-    while (p < stackEnd) {
-        info = get_ret_itbl((StgClosure *)p);
-
-        switch(info->i.type) {
-
-        case UPDATE_FRAME:
-            traversePushClosure(ts, ((StgUpdateFrame *)p)->updatee, cp, data);
-            p += sizeofW(StgUpdateFrame);
-            continue;
-
-        case UNDERFLOW_FRAME:
-        case STOP_FRAME:
-        case CATCH_FRAME:
-        case CATCH_STM_FRAME:
-        case CATCH_RETRY_FRAME:
-        case ATOMICALLY_FRAME:
-        case RET_SMALL:
-            bitmap = BITMAP_BITS(info->i.layout.bitmap);
-            size   = BITMAP_SIZE(info->i.layout.bitmap);
-            p++;
-            p = traverseSmallBitmap(ts, p, size, bitmap, cp, data);
-
-        follow_srt:
-            if (info->i.srt) {
-                traversePushClosure(ts, GET_SRT(info), cp, data);
-            }
-            continue;
-
-        case RET_BCO: {
-            StgBCO *bco;
-
-            p++;
-            traversePushClosure(ts, (StgClosure*)*p, cp, data);
-            bco = (StgBCO *)*p;
-            p++;
-            size = BCO_BITMAP_SIZE(bco);
-            traverseLargeBitmap(ts, p, BCO_BITMAP(bco), size, cp, data);
-            p += size;
-            continue;
-        }
-
-            // large bitmap (> 32 entries, or > 64 on a 64-bit machine)
-        case RET_BIG:
-            size = GET_LARGE_BITMAP(&info->i)->size;
-            p++;
-            traverseLargeBitmap(ts, p, GET_LARGE_BITMAP(&info->i),
-                                size, cp, data);
-            p += size;
-            // and don't forget to follow the SRT
-            goto follow_srt;
-
-        case RET_FUN: {
-            StgRetFun *ret_fun = (StgRetFun *)p;
-            const StgFunInfoTable *fun_info;
-
-            traversePushClosure(ts, ret_fun->fun, cp, data);
-            fun_info = get_fun_itbl(UNTAG_CONST_CLOSURE(ret_fun->fun));
-
-            p = (P_)&ret_fun->payload;
-            switch (fun_info->f.fun_type) {
-            case ARG_GEN:
-                bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
-                size = BITMAP_SIZE(fun_info->f.b.bitmap);
-                p = traverseSmallBitmap(ts, p, size, bitmap, cp, data);
-                break;
-            case ARG_GEN_BIG:
-                size = GET_FUN_LARGE_BITMAP(fun_info)->size;
-                traverseLargeBitmap(ts, p, GET_FUN_LARGE_BITMAP(fun_info),
-                                    size, cp, data);
-                p += size;
-                break;
-            default:
-                bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
-                size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
-                p = traverseSmallBitmap(ts, p, size, bitmap, cp, data);
-                break;
-            }
-            goto follow_srt;
-        }
-
-        default:
-            barf("Invalid object found in traversePushStack(): %d",
-                 (int)(info->i.type));
-        }
-    }
-}
-
-/* ----------------------------------------------------------------------------
- * Call traversePushClosure for each of the children of a PAP/AP
- * ------------------------------------------------------------------------- */
-
-STATIC_INLINE StgPtr
-traversePAP (traverseState *ts,
-                    StgClosure *pap,    /* NOT tagged */
-                    stackData data,
-                    StgClosure *fun,    /* tagged */
-                    StgClosure** payload, StgWord n_args)
-{
-    StgPtr p;
-    StgWord bitmap;
-    const StgFunInfoTable *fun_info;
-
-    traversePushClosure(ts, fun, pap, data);
-    fun = UNTAG_CLOSURE(fun);
-    fun_info = get_fun_itbl(fun);
-    ASSERT(fun_info->i.type != PAP);
-
-    p = (StgPtr)payload;
-
-    switch (fun_info->f.fun_type) {
-    case ARG_GEN:
-        bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
-        p = traverseSmallBitmap(ts, p, n_args, bitmap,
-                                pap, data);
-        break;
-    case ARG_GEN_BIG:
-        traverseLargeBitmap(ts, p, GET_FUN_LARGE_BITMAP(fun_info),
-                            n_args, pap, data);
-        p += n_args;
-        break;
-    case ARG_BCO:
-        traverseLargeBitmap(ts, (StgPtr)payload, BCO_BITMAP(fun),
-                            n_args, pap, data);
-        p += n_args;
-        break;
-    default:
-        bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
-        p = traverseSmallBitmap(ts, p, n_args, bitmap, pap, data);
-        break;
-    }
-    return p;
-}
-
 static bool
 retainVisitClosure( StgClosure *c, const StgClosure *cp, const stackData data, const bool first_visit, stackData *out_data )
 {
@@ -1408,219 +334,6 @@ retainVisitClosure( StgClosure *c, const StgClosure *cp, const stackData data, c
     return 1;
 }
 
-static void
-resetMutableObjects(void)
-{
-    uint32_t g, n;
-    bdescr *bd;
-    StgPtr ml;
-
-    // The following code resets the 'trav' field of each unvisited mutable
-    // object.
-    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
-        // NOT true: even G0 has a block on its mutable list
-        // ASSERT(g != 0 || (generations[g].mut_list == NULL));
-
-        // Traversing through mut_list is necessary
-        // because we can find MUT_VAR objects which have not been
-        // visited during heap traversal.
-        for (n = 0; n < n_capabilities; n++) {
-          for (bd = capabilities[n]->mut_lists[g]; bd != NULL; bd = bd->link) {
-            for (ml = bd->start; ml < bd->free; ml++) {
-
-                traverseMaybeInitClosureData((StgClosure *)*ml);
-            }
-          }
-        }
-    }
-}
-
-/**
- * Traverse all closures on the traversal work-stack, calling 'visit_cb' on each
- * closure. See 'visitClosure_cb' for details. This function flips the 'flip'
- * bit and hence every closure's profiling data will be reset to zero upon
- * visiting. See Note [Profiling heap traversal visited bit].
- */
-void
-traverseWorkStack(traverseState *ts, visitClosure_cb visit_cb)
-{
-    // first_child = first child of c
-    StgClosure *c, *cp, *first_child;
-    stackData data, child_data;
-    StgWord typeOfc;
-
-    // Now we flip the flip bit.
-    flip = flip ^ 1;
-
-    // c = Current closure                           (possibly tagged)
-    // cp = Current closure's Parent                 (NOT tagged)
-    // data = current closures' associated data      (NOT tagged)
-    // data_out = data to associate with current closure's children
-
-loop:
-    traversePop(ts, &c, &cp, &data);
-
-    if (c == NULL) {
-        debug("maxStackSize= %d\n", ts->maxStackSize);
-        resetMutableObjects();
-        return;
-    }
-inner_loop:
-    c = UNTAG_CLOSURE(c);
-
-    typeOfc = get_itbl(c)->type;
-
-    // special cases
-    switch (typeOfc) {
-    case TSO:
-        if (((StgTSO *)c)->what_next == ThreadComplete ||
-            ((StgTSO *)c)->what_next == ThreadKilled) {
-            debug("ThreadComplete or ThreadKilled encountered in traverseWorkStack()\n");
-            goto loop;
-        }
-        break;
-
-    case IND_STATIC:
-        // We just skip IND_STATIC, so it's never visited.
-        c = ((StgIndStatic *)c)->indirectee;
-        goto inner_loop;
-
-    case CONSTR_NOCAF:
-        // static objects with no pointers out, so goto loop.
-
-        // It is not just enough not to visit *c; it is
-        // mandatory because CONSTR_NOCAF are not reachable from
-        // scavenged_static_objects, the list from which is assumed to traverse
-        // all static objects after major garbage collections.
-        goto loop;
-
-    case THUNK_STATIC:
-        if (get_itbl(c)->srt == 0) {
-            // No need to visit *c; no dynamic objects are reachable from it.
-            //
-            // Static objects: if we traverse all the live closures,
-            // including static closures, during each heap census then
-            // we will observe that some static closures appear and
-            // disappear.  eg. a closure may contain a pointer to a
-            // static function 'f' which is not otherwise reachable
-            // (it doesn't indirectly point to any CAFs, so it doesn't
-            // appear in any SRTs), so we would find 'f' during
-            // traversal.  However on the next sweep there may be no
-            // closures pointing to 'f'.
-            //
-            // We must therefore ignore static closures whose SRT is
-            // empty, because these are exactly the closures that may
-            // "appear".  A closure with a non-empty SRT, and which is
-            // still required, will always be reachable.
-            //
-            // But what about CONSTR?  Surely these may be able
-            // to appear, and they don't have SRTs, so we can't
-            // check.  So for now, we're calling
-            // resetStaticObjectForProfiling() from the
-            // garbage collector to reset the retainer sets in all the
-            // reachable static objects.
-            goto loop;
-        }
-        /* fall-thru */
-
-    case FUN_STATIC: {
-        const StgInfoTable *info = get_itbl(c);
-        if (info->srt == 0 && info->layout.payload.ptrs == 0) {
-            goto loop;
-        } else {
-            break;
-        }
-    }
-
-    default:
-        break;
-    }
-
-    // If this is the first visit to c, initialize its data.
-    bool first_visit = traverseMaybeInitClosureData(c);
-    bool traverse_children
-        = visit_cb(c, cp, data, first_visit, (stackData*)&child_data);
-    if(!traverse_children)
-        goto loop;
-
-    // process child
-
-    // Special case closures: we process these all in one go rather
-    // than attempting to save the current position, because doing so
-    // would be hard.
-    switch (typeOfc) {
-    case STACK:
-        traversePushStack(ts, c, child_data,
-                    ((StgStack *)c)->sp,
-                    ((StgStack *)c)->stack + ((StgStack *)c)->stack_size);
-        goto loop;
-
-    case TSO:
-    {
-        StgTSO *tso = (StgTSO *)c;
-
-        traversePushClosure(ts, (StgClosure *) tso->stackobj, c, child_data);
-        traversePushClosure(ts, (StgClosure *) tso->blocked_exceptions, c, child_data);
-        traversePushClosure(ts, (StgClosure *) tso->bq, c, child_data);
-        traversePushClosure(ts, (StgClosure *) tso->trec, c, child_data);
-        if (   tso->why_blocked == BlockedOnMVar
-               || tso->why_blocked == BlockedOnMVarRead
-               || tso->why_blocked == BlockedOnBlackHole
-               || tso->why_blocked == BlockedOnMsgThrowTo
-            ) {
-            traversePushClosure(ts, tso->block_info.closure, c, child_data);
-        }
-        goto loop;
-    }
-
-    case BLOCKING_QUEUE:
-    {
-        StgBlockingQueue *bq = (StgBlockingQueue *)c;
-        traversePushClosure(ts, (StgClosure *) bq->link,  c, child_data);
-        traversePushClosure(ts, (StgClosure *) bq->bh,    c, child_data);
-        traversePushClosure(ts, (StgClosure *) bq->owner, c, child_data);
-        goto loop;
-    }
-
-    case PAP:
-    {
-        StgPAP *pap = (StgPAP *)c;
-        traversePAP(ts, c, child_data, pap->fun, pap->payload, pap->n_args);
-        goto loop;
-    }
-
-    case AP:
-    {
-        StgAP *ap = (StgAP *)c;
-        traversePAP(ts, c, child_data, ap->fun, ap->payload, ap->n_args);
-        goto loop;
-    }
-
-    case AP_STACK:
-        traversePushClosure(ts, ((StgAP_STACK *)c)->fun, c, child_data);
-        traversePushStack(ts, c, child_data,
-                    (StgPtr)((StgAP_STACK *)c)->payload,
-                    (StgPtr)((StgAP_STACK *)c)->payload +
-                             ((StgAP_STACK *)c)->size);
-        goto loop;
-    }
-
-    traversePushChildren(ts, c, child_data, &first_child);
-
-    // If first_child is null, c has no child.
-    // If first_child is not null, the top stack element points to the next
-    // object. traversePushChildren() may or may not push a stackElement on the
-    // stack.
-    if (first_child == NULL)
-        goto loop;
-
-    // (c, cp, data) = (first_child, c, child_data)
-    data = child_data;
-    cp = c;
-    c = first_child;
-    goto inner_loop;
-}
-
 /**
  *  Push every object reachable from *tl onto the traversal work stack.
  */
@@ -1684,72 +397,6 @@ computeRetainerSet( traverseState *ts )
 }
 
 /* -----------------------------------------------------------------------------
- *  Traverse all static objects for which we compute retainer sets,
- *  and reset their rs fields to NULL, which is accomplished by
- *  invoking traverseMaybeInitClosureData(). This function must be called
- *  before zeroing all objects reachable from scavenged_static_objects
- *  in the case of major garbage collections. See GarbageCollect() in
- *  GC.c.
- *  Note:
- *    The mut_once_list of the oldest generation must also be traversed?
- *    Why? Because if the evacuation of an object pointed to by a static
- *    indirection object fails, it is put back to the mut_once_list of
- *    the oldest generation.
- *    However, this is not necessary because any static indirection objects
- *    are just traversed through to reach dynamic objects. In other words,
- *    they are not taken into consideration in computing retainer sets.
- *
- * SDM (20/7/2011): I don't think this is doing anything sensible,
- * because it happens before retainerProfile() and at the beginning of
- * retainerProfil() we change the sense of 'flip'.  So all of the
- * calls to traverseMaybeInitClosureData() here are initialising retainer sets
- * with the wrong flip.  Also, I don't see why this is necessary.  I
- * added a traverseMaybeInitClosureData() call to retainRoot(), and that seems
- * to have fixed the assertion failure in retainerSetOf() I was
- * encountering.
- * -------------------------------------------------------------------------- */
-void
-resetStaticObjectForProfiling( StgClosure *static_objects )
-{
-    uint32_t count = 0;
-    StgClosure *p;
-
-    p = static_objects;
-    while (p != END_OF_STATIC_OBJECT_LIST) {
-        p = UNTAG_STATIC_LIST_PTR(p);
-        count++;
-
-        switch (get_itbl(p)->type) {
-        case IND_STATIC:
-            // Since we do not compute the retainer set of any
-            // IND_STATIC object, we don't have to reset its retainer
-            // field.
-            p = (StgClosure*)*IND_STATIC_LINK(p);
-            break;
-        case THUNK_STATIC:
-            traverseMaybeInitClosureData(p);
-            p = (StgClosure*)*THUNK_STATIC_LINK(p);
-            break;
-        case FUN_STATIC:
-        case CONSTR:
-        case CONSTR_1_0:
-        case CONSTR_2_0:
-        case CONSTR_1_1:
-        case CONSTR_NOCAF:
-            traverseMaybeInitClosureData(p);
-            p = (StgClosure*)*STATIC_LINK(get_itbl(p), p);
-            break;
-        default:
-            barf("resetStaticObjectForProfiling: %p (%lu)",
-                 p, (unsigned long)get_itbl(p)->type);
-            break;
-        }
-    }
-
-    debug("count in scavenged_static_objects = %d\n", count);
-}
-
-/* -----------------------------------------------------------------------------
  * Perform retainer profiling.
  * N is the oldest generation being profilied, where the generations are
  * numbered starting at 0.
diff --git a/rts/TraverseHeap.c b/rts/TraverseHeap.c
new file mode 100644
index 0000000000..bf2584cef4
--- /dev/null
+++ b/rts/TraverseHeap.c
@@ -0,0 +1,1371 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The GHC Team, 2001,2019
+ * Author: Sungwoo Park, Daniel Gröber
+ *
+ * Generalised profiling heap traversal.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#if defined(PROFILING)
+
+#include "PosixSource.h"
+#include "Rts.h"
+#include "sm/Storage.h"
+
+#include "TraverseHeap.h"
+
+/** Note [Profiling heap traversal visited bit]
+ *
+ * If the RTS is compiled with profiling enabled StgProfHeader can be used by
+ * profiling code to store per-heap object information.
+ *
+ * The generic heap traversal code reserves the least significant bit of the
+ * largest members of the 'trav' union to decide whether we've already visited a
+ * given closure in the current pass or not. The rest of the field is free to be
+ * used by the calling profiler.
+ *
+ * By doing things this way we implicitly assume that the LSB of the largest
+ * field in the 'trav' union is insignificant. This is true at least for the
+ * word aligned pointers which the retainer profiler currently stores there and
+ * should be maintained by new users of the 'trav' union for example by shifting
+ * the real data up by one bit.
+ *
+ * Since we don't want to have to scan the entire heap a second time just to
+ * reset the per-object visitied bit before/after the real traversal we make the
+ * interpretation of this bit dependent on the value of a global variable,
+ * 'flip'.
+ *
+ * When the 'trav' bit is equal to the value of 'flip' the closure data is
+ * valid otherwise not (see isTravDataValid). We then invert the value of 'flip'
+ * on each heap traversal (see traverseWorkStack), in effect marking all
+ * closure's data as invalid at once.
+ *
+ * There are some complications with this approach, namely: static objects and
+ * mutable data. There we do just go over all existing objects to reset the bit
+ * manually. See 'resetStaticObjectForProfiling' and 'resetMutableObjects'.
+ */
+StgWord flip = 0;
+
+#define setTravDataToZero(c) \
+  (c)->header.prof.hp.trav.lsb = flip
+
+typedef enum {
+    // Object with fixed layout. Keeps an information about that
+    // element was processed. (stackPos.next.step)
+    posTypeStep,
+    // Description of the pointers-first heap object. Keeps information
+    // about layout. (stackPos.next.ptrs)
+    posTypePtrs,
+    // Keeps SRT bitmap (stackPos.next.srt)
+    posTypeSRT,
+    // Keeps a new object that was not inspected yet. Keeps a parent
+    // element (stackPos.next.parent)
+    posTypeFresh
+} nextPosType;
+
+typedef union {
+    // fixed layout or layout specified by a field in the closure
+    StgWord step;
+
+    // layout.payload
+    struct {
+        // See StgClosureInfo in InfoTables.h
+        StgHalfWord pos;
+        StgHalfWord ptrs;
+        StgPtr payload;
+    } ptrs;
+
+    // SRT
+    struct {
+        StgClosure *srt;
+    } srt;
+} nextPos;
+
+/**
+ * Position pointer into a closure. Determines what the next element to return
+ * for a stackElement is.
+ */
+typedef struct {
+    nextPosType type;
+    nextPos next;
+} stackPos;
+
+/**
+ * An element of the traversal work-stack. Besides the closure itself this also
+ * stores it's parent and associated data.
+ *
+ * When 'info.type == posTypeFresh' a 'stackElement' represents just one
+ * closure, namely 'c' and 'cp' being it's parent. Otherwise 'info' specifies an
+ * offset into the children of 'c'. This is to support returning a closure's
+ * children one-by-one without pushing one element per child onto the stack. See
+ * traversePushChildren() and traversePop().
+ *
+ */
+typedef struct stackElement_ {
+    stackPos info;
+    StgClosure *c;
+    StgClosure *cp; // parent of 'c'. Only used when info.type == posTypeFresh.
+    stackData data;
+} stackElement;
+
+
+#if defined(DEBUG)
+unsigned int g_traversalDebugLevel = 0;
+static inline void debug(const char *s, ...)
+{
+    va_list ap;
+
+    if(g_traversalDebugLevel == 0)
+        return;
+
+    va_start(ap,s);
+    vdebugBelch(s, ap);
+    va_end(ap);
+}
+#else
+#define debug(...)
+#endif
+
+// number of blocks allocated for one stack
+#define BLOCKS_IN_STACK 1
+
+/* -----------------------------------------------------------------------------
+ * Add a new block group to the stack.
+ * Invariants:
+ *  currentStack->link == s.
+ * -------------------------------------------------------------------------- */
+STATIC_INLINE void
+newStackBlock( traverseState *ts, bdescr *bd )
+{
+    ts->currentStack = bd;
+    ts->stackTop     = (stackElement *)(bd->start + BLOCK_SIZE_W * bd->blocks);
+    ts->stackBottom  = (stackElement *)bd->start;
+    ts->stackLimit   = (stackElement *)ts->stackTop;
+    bd->free     = (StgPtr)ts->stackLimit;
+}
+
+/* -----------------------------------------------------------------------------
+ * Return to the previous block group.
+ * Invariants:
+ *   s->link == currentStack.
+ * -------------------------------------------------------------------------- */
+STATIC_INLINE void
+returnToOldStack( traverseState *ts, bdescr *bd )
+{
+    ts->currentStack = bd;
+    ts->stackTop = (stackElement *)bd->free;
+    ts->stackBottom = (stackElement *)bd->start;
+    ts->stackLimit = (stackElement *)(bd->start + BLOCK_SIZE_W * bd->blocks);
+    bd->free = (StgPtr)ts->stackLimit;
+}
+
+/**
+ *  Initializes the traversal work-stack.
+ */
+void
+initializeTraverseStack( traverseState *ts )
+{
+    if (ts->firstStack != NULL) {
+        freeChain(ts->firstStack);
+    }
+
+    ts->firstStack = allocGroup(BLOCKS_IN_STACK);
+    ts->firstStack->link = NULL;
+    ts->firstStack->u.back = NULL;
+
+    ts->stackSize = 0;
+    ts->maxStackSize = 0;
+
+    newStackBlock(ts, ts->firstStack);
+}
+
+/**
+ * Frees all the block groups in the traversal works-stack.
+ *
+ * Invariants:
+ *   firstStack != NULL
+ */
+void
+closeTraverseStack( traverseState *ts )
+{
+    freeChain(ts->firstStack);
+    ts->firstStack = NULL;
+}
+
+int
+getTraverseStackMaxSize(traverseState *ts)
+{
+    return ts->maxStackSize;
+}
+
+/* -----------------------------------------------------------------------------
+ * Returns true if the whole stack is empty.
+ * -------------------------------------------------------------------------- */
+STATIC_INLINE bool
+isEmptyWorkStack( traverseState *ts )
+{
+    return (ts->firstStack == ts->currentStack) && ts->stackTop == ts->stackLimit;
+}
+
+/* -----------------------------------------------------------------------------
+ * Returns size of stack
+ * -------------------------------------------------------------------------- */
+W_
+traverseWorkStackBlocks(traverseState *ts)
+{
+    bdescr* bd;
+    W_ res = 0;
+
+    for (bd = ts->firstStack; bd != NULL; bd = bd->link)
+      res += bd->blocks;
+
+    return res;
+}
+
+/* -----------------------------------------------------------------------------
+ * Initializes *info from ptrs and payload.
+ * Invariants:
+ *   payload[] begins with ptrs pointers followed by non-pointers.
+ * -------------------------------------------------------------------------- */
+STATIC_INLINE void
+init_ptrs( stackPos *info, uint32_t ptrs, StgPtr payload )
+{
+    info->type              = posTypePtrs;
+    info->next.ptrs.pos     = 0;
+    info->next.ptrs.ptrs    = ptrs;
+    info->next.ptrs.payload = payload;
+}
+
+/* -----------------------------------------------------------------------------
+ * Find the next object from *info.
+ * -------------------------------------------------------------------------- */
+STATIC_INLINE StgClosure *
+find_ptrs( stackPos *info )
+{
+    if (info->next.ptrs.pos < info->next.ptrs.ptrs) {
+        return (StgClosure *)info->next.ptrs.payload[info->next.ptrs.pos++];
+    } else {
+        return NULL;
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ *  Initializes *info from SRT information stored in *infoTable.
+ * -------------------------------------------------------------------------- */
+STATIC_INLINE void
+init_srt_fun( stackPos *info, const StgFunInfoTable *infoTable )
+{
+    info->type = posTypeSRT;
+    if (infoTable->i.srt) {
+        info->next.srt.srt = (StgClosure*)GET_FUN_SRT(infoTable);
+    } else {
+        info->next.srt.srt = NULL;
+    }
+}
+
+STATIC_INLINE void
+init_srt_thunk( stackPos *info, const StgThunkInfoTable *infoTable )
+{
+    info->type = posTypeSRT;
+    if (infoTable->i.srt) {
+        info->next.srt.srt = (StgClosure*)GET_SRT(infoTable);
+    } else {
+        info->next.srt.srt = NULL;
+    }
+}
+
+/* -----------------------------------------------------------------------------
+ * Find the next object from *info.
+ * -------------------------------------------------------------------------- */
+STATIC_INLINE StgClosure *
+find_srt( stackPos *info )
+{
+    StgClosure *c;
+    if (info->type == posTypeSRT) {
+        c = info->next.srt.srt;
+        info->next.srt.srt = NULL;
+        return c;
+    }
+
+    return NULL;
+}
+
+/**
+ * Push a set of closures, represented by a single 'stackElement', onto the
+ * traversal work-stack.
+ */
+static void
+pushStackElement(traverseState *ts, stackElement *se)
+{
+    bdescr *nbd;      // Next Block Descriptor
+    if (ts->stackTop - 1 < ts->stackBottom) {
+        debug("pushStackElement() to the next stack.\n");
+
+        // currentStack->free is updated when the active stack is switched
+        // to the next stack.
+        ts->currentStack->free = (StgPtr)ts->stackTop;
+
+        if (ts->currentStack->link == NULL) {
+            nbd = allocGroup(BLOCKS_IN_STACK);
+            nbd->link = NULL;
+            nbd->u.back = ts->currentStack;
+            ts->currentStack->link = nbd;
+        } else
+            nbd = ts->currentStack->link;
+
+        newStackBlock(ts, nbd);
+    }
+
+    // adjust stackTop (acutal push)
+    ts->stackTop--;
+    // If the size of stackElement was huge, we would better replace the
+    // following statement by either a memcpy() call or a switch statement
+    // on the type of the element. Currently, the size of stackElement is
+    // small enough (5 words) that this direct assignment seems to be enough.
+    *ts->stackTop = *se;
+
+    ts->stackSize++;
+    if (ts->stackSize > ts->maxStackSize) ts->maxStackSize = ts->stackSize;
+    ASSERT(ts->stackSize >= 0);
+    debug("stackSize = %d\n", ts->stackSize);
+}
+
+/**
+ * Push a single closure onto the traversal work-stack.
+ *
+ *  cp   - object's parent
+ *  c    - closure
+ *  data - data associated with closure.
+ */
+inline void
+traversePushClosure(traverseState *ts, StgClosure *c, StgClosure *cp, stackData data) {
+    stackElement se;
+
+    se.c = c;
+    se.cp = cp;
+    se.data = data;
+    se.info.type = posTypeFresh;
+
+    pushStackElement(ts, &se);
+};
+
+/**
+ * traversePushChildren() extracts the first child of 'c' in 'first_child' and
+ * conceptually pushes all remaining children of 'c' onto the traversal stack
+ * while associating 'data' with the pushed elements to be returned upon poping.
+ *
+ * If 'c' has no children, 'first_child' is set to NULL and nothing is pushed
+ * onto the stack.
+ *
+ * If 'c' has only one child, 'first_child' is set to that child and nothing is
+ * pushed onto the stack.
+ *
+ * Invariants:
+ *
+ *  - 'c' is not any of TSO, AP, PAP, AP_STACK, which means that there cannot
+ *       be any stack objects.
+ *
+ * Note: SRTs are considered to be children as well.
+ *
+ * Note: When pushing onto the stack we only really push one 'stackElement'
+ * representing all children onto the stack. See traversePop()
+ */
+STATIC_INLINE void
+traversePushChildren(traverseState *ts, StgClosure *c, stackData data, StgClosure **first_child)
+{
+    stackElement se;
+
+    debug("traversePushChildren(): stackTop = 0x%x\n", ts->stackTop);
+
+    ASSERT(get_itbl(c)->type != TSO);
+    ASSERT(get_itbl(c)->type != AP_STACK);
+
+    //
+    // fill in se
+    //
+
+    se.c = c;
+    se.data = data;
+    // Note: se.cp ommitted on purpose, only traversePushClosure uses that.
+
+    // fill in se.info
+    switch (get_itbl(c)->type) {
+        // no child, no SRT
+    case CONSTR_0_1:
+    case CONSTR_0_2:
+    case ARR_WORDS:
+    case COMPACT_NFDATA:
+        *first_child = NULL;
+        return;
+
+        // one child (fixed), no SRT
+    case MUT_VAR_CLEAN:
+    case MUT_VAR_DIRTY:
+        *first_child = ((StgMutVar *)c)->var;
+        return;
+    case THUNK_SELECTOR:
+        *first_child = ((StgSelector *)c)->selectee;
+        return;
+    case BLACKHOLE:
+        *first_child = ((StgInd *)c)->indirectee;
+        return;
+    case CONSTR_1_0:
+    case CONSTR_1_1:
+        *first_child = c->payload[0];
+        return;
+
+        // For CONSTR_2_0 and MVAR, we use se.info.step to record the position
+        // of the next child. We do not write a separate initialization code.
+        // Also we do not have to initialize info.type;
+
+        // two children (fixed), no SRT
+        // need to push a stackElement, but nothing to store in se.info
+    case CONSTR_2_0:
+        *first_child = c->payload[0];         // return the first pointer
+        se.info.type = posTypeStep;
+        se.info.next.step = 2;            // 2 = second
+        break;
+
+        // three children (fixed), no SRT
+        // need to push a stackElement
+    case MVAR_CLEAN:
+    case MVAR_DIRTY:
+        // head must be TSO and the head of a linked list of TSOs.
+        // Shoule it be a child? Seems to be yes.
+        *first_child = (StgClosure *)((StgMVar *)c)->head;
+        se.info.type = posTypeStep;
+        se.info.next.step = 2;            // 2 = second
+        break;
+
+        // three children (fixed), no SRT
+    case WEAK:
+        *first_child = ((StgWeak *)c)->key;
+        se.info.type = posTypeStep;
+        se.info.next.step = 2;
+        break;
+
+        // layout.payload.ptrs, no SRT
+    case TVAR:
+    case CONSTR:
+    case CONSTR_NOCAF:
+    case PRIM:
+    case MUT_PRIM:
+    case BCO:
+        init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs,
+                  (StgPtr)c->payload);
+        *first_child = find_ptrs(&se.info);
+        if (*first_child == NULL)
+            return;   // no child
+        break;
+
+        // StgMutArrPtr.ptrs, no SRT
+    case MUT_ARR_PTRS_CLEAN:
+    case MUT_ARR_PTRS_DIRTY:
+    case MUT_ARR_PTRS_FROZEN_CLEAN:
+    case MUT_ARR_PTRS_FROZEN_DIRTY:
+        init_ptrs(&se.info, ((StgMutArrPtrs *)c)->ptrs,
+                  (StgPtr)(((StgMutArrPtrs *)c)->payload));
+        *first_child = find_ptrs(&se.info);
+        if (*first_child == NULL)
+            return;
+        break;
+
+        // StgMutArrPtr.ptrs, no SRT
+    case SMALL_MUT_ARR_PTRS_CLEAN:
+    case SMALL_MUT_ARR_PTRS_DIRTY:
+    case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
+    case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY:
+        init_ptrs(&se.info, ((StgSmallMutArrPtrs *)c)->ptrs,
+                  (StgPtr)(((StgSmallMutArrPtrs *)c)->payload));
+        *first_child = find_ptrs(&se.info);
+        if (*first_child == NULL)
+            return;
+        break;
+
+    // layout.payload.ptrs, SRT
+    case FUN_STATIC:
+    case FUN:           // *c is a heap object.
+    case FUN_2_0:
+        init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs, (StgPtr)c->payload);
+        *first_child = find_ptrs(&se.info);
+        if (*first_child == NULL)
+            // no child from ptrs, so check SRT
+            goto fun_srt_only;
+        break;
+
+    case THUNK:
+    case THUNK_2_0:
+        init_ptrs(&se.info, get_itbl(c)->layout.payload.ptrs,
+                  (StgPtr)((StgThunk *)c)->payload);
+        *first_child = find_ptrs(&se.info);
+        if (*first_child == NULL)
+            // no child from ptrs, so check SRT
+            goto thunk_srt_only;
+        break;
+
+        // 1 fixed child, SRT
+    case FUN_1_0:
+    case FUN_1_1:
+        *first_child = c->payload[0];
+        ASSERT(*first_child != NULL);
+        init_srt_fun(&se.info, get_fun_itbl(c));
+        break;
+
+    case THUNK_1_0:
+    case THUNK_1_1:
+        *first_child = ((StgThunk *)c)->payload[0];
+        ASSERT(*first_child != NULL);
+        init_srt_thunk(&se.info, get_thunk_itbl(c));
+        break;
+
+    case FUN_0_1:      // *c is a heap object.
+    case FUN_0_2:
+    fun_srt_only:
+        init_srt_fun(&se.info, get_fun_itbl(c));
+        *first_child = find_srt(&se.info);
+        if (*first_child == NULL)
+            return;     // no child
+        break;
+
+    // SRT only
+    case THUNK_STATIC:
+        ASSERT(get_itbl(c)->srt != 0);
+        /* fall-thru */
+    case THUNK_0_1:
+    case THUNK_0_2:
+    thunk_srt_only:
+        init_srt_thunk(&se.info, get_thunk_itbl(c));
+        *first_child = find_srt(&se.info);
+        if (*first_child == NULL)
+            return;     // no child
+        break;
+
+    case TREC_CHUNK:
+        *first_child = (StgClosure *)((StgTRecChunk *)c)->prev_chunk;
+        se.info.type = posTypeStep;
+        se.info.next.step = 0;  // entry no.
+        break;
+
+        // cannot appear
+    case PAP:
+    case AP:
+    case AP_STACK:
+    case TSO:
+    case STACK:
+    case IND_STATIC:
+        // stack objects
+    case UPDATE_FRAME:
+    case CATCH_FRAME:
+    case UNDERFLOW_FRAME:
+    case STOP_FRAME:
+    case RET_BCO:
+    case RET_SMALL:
+    case RET_BIG:
+        // invalid objects
+    case IND:
+    case INVALID_OBJECT:
+    default:
+        barf("Invalid object *c in push(): %d", get_itbl(c)->type);
+        return;
+    }
+
+    // se.cp has to be initialized when type==posTypeFresh. We don't do that
+    // here though. So type must be !=posTypeFresh.
+    ASSERT(se.info.type != posTypeFresh);
+
+    pushStackElement(ts, &se);
+}
+
+/**
+ *  popStackElement(): Remove a depleted stackElement from the top of the
+ *  traversal work-stack.
+ *
+ *  Invariants:
+ *    stackTop cannot be equal to stackLimit unless the whole stack is
+ *    empty, in which case popStackElement() is not allowed.
+ */
+static void
+popStackElement(traverseState *ts) {
+    debug("popStackElement(): stackTop = 0x%x\n", ts->stackTop);
+
+    ASSERT(ts->stackTop != ts->stackLimit);
+    ASSERT(!isEmptyWorkStack(ts));
+
+    // <= (instead of <) is wrong!
+    if (ts->stackTop + 1 < ts->stackLimit) {
+        ts->stackTop++;
+
+        ts->stackSize--;
+        if (ts->stackSize > ts->maxStackSize) ts->maxStackSize = ts->stackSize;
+        ASSERT(ts->stackSize >= 0);
+        debug("stackSize = (--) %d\n", ts->stackSize);
+
+        return;
+    }
+
+    bdescr *pbd;    // Previous Block Descriptor
+
+    debug("popStackElement() to the previous stack.\n");
+
+    ASSERT(ts->stackTop + 1 == ts->stackLimit);
+    ASSERT(ts->stackBottom == (stackElement *)ts->currentStack->start);
+
+    if (ts->firstStack == ts->currentStack) {
+        // The stack is completely empty.
+        ts->stackTop++;
+        ASSERT(ts->stackTop == ts->stackLimit);
+
+        ts->stackSize--;
+        if (ts->stackSize > ts->maxStackSize) ts->maxStackSize = ts->stackSize;
+        ASSERT(ts->stackSize >= 0);
+        debug("stackSize = %d\n", ts->stackSize);
+
+        return;
+    }
+
+    // currentStack->free is updated when the active stack is switched back
+    // to the previous stack.
+    ts->currentStack->free = (StgPtr)ts->stackLimit;
+
+    // find the previous block descriptor
+    pbd = ts->currentStack->u.back;
+    ASSERT(pbd != NULL);
+
+    returnToOldStack(ts, pbd);
+
+    ts->stackSize--;
+    if (ts->stackSize > ts->maxStackSize) ts->maxStackSize = ts->stackSize;
+    ASSERT(ts->stackSize >= 0);
+    debug("stackSize = %d\n", ts->stackSize);
+}
+
+/**
+ *  Finds the next object to be considered for retainer profiling and store
+ *  its pointer to *c.
+ *
+ *  If the unprocessed object was stored in the stack (posTypeFresh), the
+ *  this object is returned as-is. Otherwise Test if the topmost stack
+ *  element indicates that more objects are left,
+ *  and if so, retrieve the first object and store its pointer to *c. Also,
+ *  set *cp and *data appropriately, both of which are stored in the stack
+ *  element.  The topmost stack element then is overwritten so as for it to now
+ *  denote the next object.
+ *
+ *  If the topmost stack element indicates no more objects are left, pop
+ *  off the stack element until either an object can be retrieved or
+ *  the work-stack becomes empty, indicated by true returned by
+ *  isEmptyWorkStack(), in which case *c is set to NULL.
+ *
+ *  Note:
+ *
+ *    It is okay to call this function even when the work-stack is empty.
+ */
+STATIC_INLINE void
+traversePop(traverseState *ts, StgClosure **c, StgClosure **cp, stackData *data)
+{
+    stackElement *se;
+
+    debug("traversePop(): stackTop = 0x%x\n", ts->stackTop);
+
+    // Is this the last internal element? If so instead of modifying the current
+    // stackElement in place we actually remove it from the stack.
+    bool last = false;
+
+    do {
+        if (isEmptyWorkStack(ts)) {
+            *c = NULL;
+            return;
+        }
+
+        // Note: Below every `break`, where the loop condition is true, must be
+        // accompanied by a popStackElement() otherwise this is an infinite
+        // loop.
+        se = ts->stackTop;
+
+        // If this is a top-level element, you should pop that out.
+        if (se->info.type == posTypeFresh) {
+            *cp = se->cp;
+            *c = se->c;
+            *data = se->data;
+            popStackElement(ts);
+            return;
+        }
+
+        // Note: The first ptr of all of these was already returned as
+        // *fist_child in push(), so we always start with the second field.
+        switch (get_itbl(se->c)->type) {
+            // two children (fixed), no SRT
+            // nothing in se.info
+        case CONSTR_2_0:
+            *c = se->c->payload[1];
+            last = true;
+            goto out;
+
+            // three children (fixed), no SRT
+            // need to push a stackElement
+        case MVAR_CLEAN:
+        case MVAR_DIRTY:
+            if (se->info.next.step == 2) {
+                *c = (StgClosure *)((StgMVar *)se->c)->tail;
+                se->info.next.step++;             // move to the next step
+                // no popStackElement
+            } else {
+                *c = ((StgMVar *)se->c)->value;
+                last = true;
+            }
+            goto out;
+
+            // three children (fixed), no SRT
+        case WEAK:
+            if (se->info.next.step == 2) {
+                *c = ((StgWeak *)se->c)->value;
+                se->info.next.step++;
+                // no popStackElement
+            } else {
+                *c = ((StgWeak *)se->c)->finalizer;
+                last = true;
+            }
+            goto out;
+
+        case TREC_CHUNK: {
+            // These are pretty complicated: we have N entries, each
+            // of which contains 3 fields that we want to follow.  So
+            // we divide the step counter: the 2 low bits indicate
+            // which field, and the rest of the bits indicate the
+            // entry number (starting from zero).
+            TRecEntry *entry;
+            uint32_t entry_no = se->info.next.step >> 2;
+            uint32_t field_no = se->info.next.step & 3;
+            if (entry_no == ((StgTRecChunk *)se->c)->next_entry_idx) {
+                *c = NULL;
+                popStackElement(ts);
+                break; // this breaks out of the switch not the loop
+            }
+            entry = &((StgTRecChunk *)se->c)->entries[entry_no];
+            if (field_no == 0) {
+                *c = (StgClosure *)entry->tvar;
+            } else if (field_no == 1) {
+                *c = entry->expected_value;
+            } else {
+                *c = entry->new_value;
+            }
+            se->info.next.step++;
+            goto out;
+        }
+
+        case TVAR:
+        case CONSTR:
+        case PRIM:
+        case MUT_PRIM:
+        case BCO:
+            // StgMutArrPtr.ptrs, no SRT
+        case MUT_ARR_PTRS_CLEAN:
+        case MUT_ARR_PTRS_DIRTY:
+        case MUT_ARR_PTRS_FROZEN_CLEAN:
+        case MUT_ARR_PTRS_FROZEN_DIRTY:
+        case SMALL_MUT_ARR_PTRS_CLEAN:
+        case SMALL_MUT_ARR_PTRS_DIRTY:
+        case SMALL_MUT_ARR_PTRS_FROZEN_CLEAN:
+        case SMALL_MUT_ARR_PTRS_FROZEN_DIRTY:
+            *c = find_ptrs(&se->info);
+            if (*c == NULL) {
+                popStackElement(ts);
+                break; // this breaks out of the switch not the loop
+            }
+            goto out;
+
+            // layout.payload.ptrs, SRT
+        case FUN:         // always a heap object
+        case FUN_STATIC:
+        case FUN_2_0:
+            if (se->info.type == posTypePtrs) {
+                *c = find_ptrs(&se->info);
+                if (*c != NULL) {
+                    goto out;
+                }
+                init_srt_fun(&se->info, get_fun_itbl(se->c));
+            }
+            goto do_srt;
+
+        case THUNK:
+        case THUNK_2_0:
+            if (se->info.type == posTypePtrs) {
+                *c = find_ptrs(&se->info);
+                if (*c != NULL) {
+                    goto out;
+                }
+                init_srt_thunk(&se->info, get_thunk_itbl(se->c));
+            }
+            goto do_srt;
+
+            // SRT
+        do_srt:
+        case THUNK_STATIC:
+        case FUN_0_1:
+        case FUN_0_2:
+        case THUNK_0_1:
+        case THUNK_0_2:
+        case FUN_1_0:
+        case FUN_1_1:
+        case THUNK_1_0:
+        case THUNK_1_1:
+            *c = find_srt(&se->info);
+            if(*c == NULL) {
+                popStackElement(ts);
+                break; // this breaks out of the switch not the loop
+            }
+            goto out;
+
+            // no child (fixed), no SRT
+        case CONSTR_0_1:
+        case CONSTR_0_2:
+        case ARR_WORDS:
+            // one child (fixed), no SRT
+        case MUT_VAR_CLEAN:
+        case MUT_VAR_DIRTY:
+        case THUNK_SELECTOR:
+        case CONSTR_1_1:
+            // cannot appear
+        case PAP:
+        case AP:
+        case AP_STACK:
+        case TSO:
+        case STACK:
+        case IND_STATIC:
+        case CONSTR_NOCAF:
+            // stack objects
+        case UPDATE_FRAME:
+        case CATCH_FRAME:
+        case UNDERFLOW_FRAME:
+        case STOP_FRAME:
+        case RET_BCO:
+        case RET_SMALL:
+        case RET_BIG:
+            // invalid objects
+        case IND:
+        case INVALID_OBJECT:
+        default:
+            barf("Invalid object *c in traversePop(): %d", get_itbl(se->c)->type);
+            return;
+        }
+    } while (*c == NULL);
+
+out:
+
+    ASSERT(*c != NULL);
+
+    *cp = se->c;
+    *data = se->data;
+
+    if(last)
+        popStackElement(ts);
+
+    return;
+
+}
+
+/**
+ * Make sure a closure's profiling data is initialized to zero if it does not
+ * conform to the current value of the flip bit, returns true in this case.
+ *
+ * See Note [Profiling heap traversal visited bit].
+ */
+bool
+traverseMaybeInitClosureData(StgClosure *c)
+{
+    if (!isTravDataValid(c)) {
+        setTravDataToZero(c);
+        return true;
+    }
+    return false;
+}
+
+/* -----------------------------------------------------------------------------
+   Call traversePushClosure for each of the closures covered by a large bitmap.
+   -------------------------------------------------------------------------- */
+
+static void
+traverseLargeBitmap (traverseState *ts, StgPtr p, StgLargeBitmap *large_bitmap,
+                     uint32_t size, StgClosure *c, stackData data)
+{
+    uint32_t i, b;
+    StgWord bitmap;
+
+    b = 0;
+    bitmap = large_bitmap->bitmap[b];
+    for (i = 0; i < size; ) {
+        if ((bitmap & 1) == 0) {
+            traversePushClosure(ts, (StgClosure *)*p, c, data);
+        }
+        i++;
+        p++;
+        if (i % BITS_IN(W_) == 0) {
+            b++;
+            bitmap = large_bitmap->bitmap[b];
+        } else {
+            bitmap = bitmap >> 1;
+        }
+    }
+}
+
+STATIC_INLINE StgPtr
+traverseSmallBitmap (traverseState *ts, StgPtr p, uint32_t size, StgWord bitmap,
+                     StgClosure *c, stackData data)
+{
+    while (size > 0) {
+        if ((bitmap & 1) == 0) {
+            traversePushClosure(ts, (StgClosure *)*p, c, data);
+        }
+        p++;
+        bitmap = bitmap >> 1;
+        size--;
+    }
+    return p;
+}
+
+/**
+ *  traversePushStack(ts, cp, data, stackStart, stackEnd) pushes all the objects
+ *  in the STG stack-chunk from stackStart to stackEnd onto the traversal
+ *  work-stack with 'c' and 'data' being their parent and associated data,
+ *  respectively.
+ *
+ *  Invariants:
+ *
+ *    *cp is one of the following: TSO, AP_STACK.
+ *
+ *    stackStart < stackEnd.
+ *
+ *    If *c is TSO, its state is not ThreadComplete,or ThreadKilled,
+ *    which means that its stack is ready to process.
+ *
+ *  Note:
+ *
+ *    This code was almost plagiarzied from GC.c! For each pointer,
+ *    traversePushClosure() is invoked instead of evacuate().
+ */
+static void
+traversePushStack(traverseState *ts, StgClosure *cp, stackData data,
+                  StgPtr stackStart, StgPtr stackEnd)
+{
+    StgPtr p;
+    const StgRetInfoTable *info;
+    StgWord bitmap;
+    uint32_t size;
+
+    ASSERT(get_itbl(cp)->type == STACK);
+
+    p = stackStart;
+    while (p < stackEnd) {
+        info = get_ret_itbl((StgClosure *)p);
+
+        switch(info->i.type) {
+
+        case UPDATE_FRAME:
+            traversePushClosure(ts, ((StgUpdateFrame *)p)->updatee, cp, data);
+            p += sizeofW(StgUpdateFrame);
+            continue;
+
+        case UNDERFLOW_FRAME:
+        case STOP_FRAME:
+        case CATCH_FRAME:
+        case CATCH_STM_FRAME:
+        case CATCH_RETRY_FRAME:
+        case ATOMICALLY_FRAME:
+        case RET_SMALL:
+            bitmap = BITMAP_BITS(info->i.layout.bitmap);
+            size   = BITMAP_SIZE(info->i.layout.bitmap);
+            p++;
+            p = traverseSmallBitmap(ts, p, size, bitmap, cp, data);
+
+        follow_srt:
+            if (info->i.srt) {
+                traversePushClosure(ts, GET_SRT(info), cp, data);
+            }
+            continue;
+
+        case RET_BCO: {
+            StgBCO *bco;
+
+            p++;
+            traversePushClosure(ts, (StgClosure*)*p, cp, data);
+            bco = (StgBCO *)*p;
+            p++;
+            size = BCO_BITMAP_SIZE(bco);
+            traverseLargeBitmap(ts, p, BCO_BITMAP(bco), size, cp, data);
+            p += size;
+            continue;
+        }
+
+            // large bitmap (> 32 entries, or > 64 on a 64-bit machine)
+        case RET_BIG:
+            size = GET_LARGE_BITMAP(&info->i)->size;
+            p++;
+            traverseLargeBitmap(ts, p, GET_LARGE_BITMAP(&info->i),
+                                size, cp, data);
+            p += size;
+            // and don't forget to follow the SRT
+            goto follow_srt;
+
+        case RET_FUN: {
+            StgRetFun *ret_fun = (StgRetFun *)p;
+            const StgFunInfoTable *fun_info;
+
+            traversePushClosure(ts, ret_fun->fun, cp, data);
+            fun_info = get_fun_itbl(UNTAG_CONST_CLOSURE(ret_fun->fun));
+
+            p = (P_)&ret_fun->payload;
+            switch (fun_info->f.fun_type) {
+            case ARG_GEN:
+                bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+                size = BITMAP_SIZE(fun_info->f.b.bitmap);
+                p = traverseSmallBitmap(ts, p, size, bitmap, cp, data);
+                break;
+            case ARG_GEN_BIG:
+                size = GET_FUN_LARGE_BITMAP(fun_info)->size;
+                traverseLargeBitmap(ts, p, GET_FUN_LARGE_BITMAP(fun_info),
+                                    size, cp, data);
+                p += size;
+                break;
+            default:
+                bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+                size = BITMAP_SIZE(stg_arg_bitmaps[fun_info->f.fun_type]);
+                p = traverseSmallBitmap(ts, p, size, bitmap, cp, data);
+                break;
+            }
+            goto follow_srt;
+        }
+
+        default:
+            barf("Invalid object found in traversePushStack(): %d",
+                 (int)(info->i.type));
+        }
+    }
+}
+
+/* ----------------------------------------------------------------------------
+ * Call traversePushClosure for each of the children of a PAP/AP
+ * ------------------------------------------------------------------------- */
+
+STATIC_INLINE StgPtr
+traversePAP (traverseState *ts,
+                    StgClosure *pap,    /* NOT tagged */
+                    stackData data,
+                    StgClosure *fun,    /* tagged */
+                    StgClosure** payload, StgWord n_args)
+{
+    StgPtr p;
+    StgWord bitmap;
+    const StgFunInfoTable *fun_info;
+
+    traversePushClosure(ts, fun, pap, data);
+    fun = UNTAG_CLOSURE(fun);
+    fun_info = get_fun_itbl(fun);
+    ASSERT(fun_info->i.type != PAP);
+
+    p = (StgPtr)payload;
+
+    switch (fun_info->f.fun_type) {
+    case ARG_GEN:
+        bitmap = BITMAP_BITS(fun_info->f.b.bitmap);
+        p = traverseSmallBitmap(ts, p, n_args, bitmap,
+                                pap, data);
+        break;
+    case ARG_GEN_BIG:
+        traverseLargeBitmap(ts, p, GET_FUN_LARGE_BITMAP(fun_info),
+                            n_args, pap, data);
+        p += n_args;
+        break;
+    case ARG_BCO:
+        traverseLargeBitmap(ts, (StgPtr)payload, BCO_BITMAP(fun),
+                            n_args, pap, data);
+        p += n_args;
+        break;
+    default:
+        bitmap = BITMAP_BITS(stg_arg_bitmaps[fun_info->f.fun_type]);
+        p = traverseSmallBitmap(ts, p, n_args, bitmap, pap, data);
+        break;
+    }
+    return p;
+}
+
+static void
+resetMutableObjects(void)
+{
+    uint32_t g, n;
+    bdescr *bd;
+    StgPtr ml;
+
+    // The following code resets the 'trav' field of each unvisited mutable
+    // object.
+    for (g = 0; g < RtsFlags.GcFlags.generations; g++) {
+        // NOT true: even G0 has a block on its mutable list
+        // ASSERT(g != 0 || (generations[g].mut_list == NULL));
+
+        // Traversing through mut_list is necessary
+        // because we can find MUT_VAR objects which have not been
+        // visited during heap traversal.
+        for (n = 0; n < n_capabilities; n++) {
+          for (bd = capabilities[n]->mut_lists[g]; bd != NULL; bd = bd->link) {
+            for (ml = bd->start; ml < bd->free; ml++) {
+
+                traverseMaybeInitClosureData((StgClosure *)*ml);
+            }
+          }
+        }
+    }
+}
+
+/**
+ * Traverse all closures on the traversal work-stack, calling 'visit_cb' on each
+ * closure. See 'visitClosure_cb' for details. This function flips the 'flip'
+ * bit and hence every closure's profiling data will be reset to zero upon
+ * visiting. See Note [Profiling heap traversal visited bit].
+ */
+void
+traverseWorkStack(traverseState *ts, visitClosure_cb visit_cb)
+{
+    // first_child = first child of c
+    StgClosure *c, *cp, *first_child;
+    stackData data, child_data;
+    StgWord typeOfc;
+
+    // Now we flip the flip bit.
+    flip = flip ^ 1;
+
+    // c = Current closure                           (possibly tagged)
+    // cp = Current closure's Parent                 (NOT tagged)
+    // data = current closures' associated data      (NOT tagged)
+    // data_out = data to associate with current closure's children
+
+loop:
+    traversePop(ts, &c, &cp, &data);
+
+    if (c == NULL) {
+        debug("maxStackSize= %d\n", ts->maxStackSize);
+        resetMutableObjects();
+        return;
+    }
+inner_loop:
+    c = UNTAG_CLOSURE(c);
+
+    typeOfc = get_itbl(c)->type;
+
+    // special cases
+    switch (typeOfc) {
+    case TSO:
+        if (((StgTSO *)c)->what_next == ThreadComplete ||
+            ((StgTSO *)c)->what_next == ThreadKilled) {
+            debug("ThreadComplete or ThreadKilled encountered in traverseWorkStack()\n");
+            goto loop;
+        }
+        break;
+
+    case IND_STATIC:
+        // We just skip IND_STATIC, so it's never visited.
+        c = ((StgIndStatic *)c)->indirectee;
+        goto inner_loop;
+
+    case CONSTR_NOCAF:
+        // static objects with no pointers out, so goto loop.
+
+        // It is not just enough not to visit *c; it is
+        // mandatory because CONSTR_NOCAF are not reachable from
+        // scavenged_static_objects, the list from which is assumed to traverse
+        // all static objects after major garbage collections.
+        goto loop;
+
+    case THUNK_STATIC:
+        if (get_itbl(c)->srt == 0) {
+            // No need to visit *c; no dynamic objects are reachable from it.
+            //
+            // Static objects: if we traverse all the live closures,
+            // including static closures, during each heap census then
+            // we will observe that some static closures appear and
+            // disappear.  eg. a closure may contain a pointer to a
+            // static function 'f' which is not otherwise reachable
+            // (it doesn't indirectly point to any CAFs, so it doesn't
+            // appear in any SRTs), so we would find 'f' during
+            // traversal.  However on the next sweep there may be no
+            // closures pointing to 'f'.
+            //
+            // We must therefore ignore static closures whose SRT is
+            // empty, because these are exactly the closures that may
+            // "appear".  A closure with a non-empty SRT, and which is
+            // still required, will always be reachable.
+            //
+            // But what about CONSTR?  Surely these may be able
+            // to appear, and they don't have SRTs, so we can't
+            // check.  So for now, we're calling
+            // resetStaticObjectForProfiling() from the
+            // garbage collector to reset the retainer sets in all the
+            // reachable static objects.
+            goto loop;
+        }
+        /* fall-thru */
+
+    case FUN_STATIC: {
+        const StgInfoTable *info = get_itbl(c);
+        if (info->srt == 0 && info->layout.payload.ptrs == 0) {
+            goto loop;
+        } else {
+            break;
+        }
+    }
+
+    default:
+        break;
+    }
+
+    // If this is the first visit to c, initialize its data.
+    bool first_visit = traverseMaybeInitClosureData(c);
+    bool traverse_children
+        = visit_cb(c, cp, data, first_visit, (stackData*)&child_data);
+    if(!traverse_children)
+        goto loop;
+
+    // process child
+
+    // Special case closures: we process these all in one go rather
+    // than attempting to save the current position, because doing so
+    // would be hard.
+    switch (typeOfc) {
+    case STACK:
+        traversePushStack(ts, c, child_data,
+                    ((StgStack *)c)->sp,
+                    ((StgStack *)c)->stack + ((StgStack *)c)->stack_size);
+        goto loop;
+
+    case TSO:
+    {
+        StgTSO *tso = (StgTSO *)c;
+
+        traversePushClosure(ts, (StgClosure *) tso->stackobj, c, child_data);
+        traversePushClosure(ts, (StgClosure *) tso->blocked_exceptions, c, child_data);
+        traversePushClosure(ts, (StgClosure *) tso->bq, c, child_data);
+        traversePushClosure(ts, (StgClosure *) tso->trec, c, child_data);
+        if (   tso->why_blocked == BlockedOnMVar
+               || tso->why_blocked == BlockedOnMVarRead
+               || tso->why_blocked == BlockedOnBlackHole
+               || tso->why_blocked == BlockedOnMsgThrowTo
+            ) {
+            traversePushClosure(ts, tso->block_info.closure, c, child_data);
+        }
+        goto loop;
+    }
+
+    case BLOCKING_QUEUE:
+    {
+        StgBlockingQueue *bq = (StgBlockingQueue *)c;
+        traversePushClosure(ts, (StgClosure *) bq->link,  c, child_data);
+        traversePushClosure(ts, (StgClosure *) bq->bh,    c, child_data);
+        traversePushClosure(ts, (StgClosure *) bq->owner, c, child_data);
+        goto loop;
+    }
+
+    case PAP:
+    {
+        StgPAP *pap = (StgPAP *)c;
+        traversePAP(ts, c, child_data, pap->fun, pap->payload, pap->n_args);
+        goto loop;
+    }
+
+    case AP:
+    {
+        StgAP *ap = (StgAP *)c;
+        traversePAP(ts, c, child_data, ap->fun, ap->payload, ap->n_args);
+        goto loop;
+    }
+
+    case AP_STACK:
+        traversePushClosure(ts, ((StgAP_STACK *)c)->fun, c, child_data);
+        traversePushStack(ts, c, child_data,
+                    (StgPtr)((StgAP_STACK *)c)->payload,
+                    (StgPtr)((StgAP_STACK *)c)->payload +
+                             ((StgAP_STACK *)c)->size);
+        goto loop;
+    }
+
+    traversePushChildren(ts, c, child_data, &first_child);
+
+    // If first_child is null, c has no child.
+    // If first_child is not null, the top stack element points to the next
+    // object. traversePushChildren() may or may not push a stackElement on the
+    // stack.
+    if (first_child == NULL)
+        goto loop;
+
+    // (c, cp, data) = (first_child, c, child_data)
+    data = child_data;
+    cp = c;
+    c = first_child;
+    goto inner_loop;
+}
+
+/* -----------------------------------------------------------------------------
+ *  Traverse all static objects for which we compute retainer sets,
+ *  and reset their rs fields to NULL, which is accomplished by
+ *  invoking traverseMaybeInitClosureData(). This function must be called
+ *  before zeroing all objects reachable from scavenged_static_objects
+ *  in the case of major garbage collections. See GarbageCollect() in
+ *  GC.c.
+ *  Note:
+ *    The mut_once_list of the oldest generation must also be traversed?
+ *    Why? Because if the evacuation of an object pointed to by a static
+ *    indirection object fails, it is put back to the mut_once_list of
+ *    the oldest generation.
+ *    However, this is not necessary because any static indirection objects
+ *    are just traversed through to reach dynamic objects. In other words,
+ *    they are not taken into consideration in computing retainer sets.
+ *
+ * SDM (20/7/2011): I don't think this is doing anything sensible,
+ * because it happens before retainerProfile() and at the beginning of
+ * retainerProfil() we change the sense of 'flip'.  So all of the
+ * calls to traverseMaybeInitClosureData() here are initialising retainer sets
+ * with the wrong flip.  Also, I don't see why this is necessary.  I
+ * added a traverseMaybeInitClosureData() call to retainRoot(), and that seems
+ * to have fixed the assertion failure in retainerSetOf() I was
+ * encountering.
+ * -------------------------------------------------------------------------- */
+void
+resetStaticObjectForProfiling( StgClosure *static_objects )
+{
+    uint32_t count = 0;
+    StgClosure *p;
+
+    p = static_objects;
+    while (p != END_OF_STATIC_OBJECT_LIST) {
+        p = UNTAG_STATIC_LIST_PTR(p);
+        count++;
+
+        switch (get_itbl(p)->type) {
+        case IND_STATIC:
+            // Since we do not compute the retainer set of any
+            // IND_STATIC object, we don't have to reset its retainer
+            // field.
+            p = (StgClosure*)*IND_STATIC_LINK(p);
+            break;
+        case THUNK_STATIC:
+            traverseMaybeInitClosureData(p);
+            p = (StgClosure*)*THUNK_STATIC_LINK(p);
+            break;
+        case FUN_STATIC:
+        case CONSTR:
+        case CONSTR_1_0:
+        case CONSTR_2_0:
+        case CONSTR_1_1:
+        case CONSTR_NOCAF:
+            traverseMaybeInitClosureData(p);
+            p = (StgClosure*)*STATIC_LINK(get_itbl(p), p);
+            break;
+        default:
+            barf("resetStaticObjectForProfiling: %p (%lu)",
+                 p, (unsigned long)get_itbl(p)->type);
+            break;
+        }
+    }
+
+    debug("count in scavenged_static_objects = %d\n", count);
+}
+
+#endif /* PROFILING */
diff --git a/rts/rts.cabal.in b/rts/rts.cabal.in
index 7ce3c7f5aa..674566c0ad 100644
--- a/rts/rts.cabal.in
+++ b/rts/rts.cabal.in
@@ -430,6 +430,7 @@ library
                Timer.c
                TopHandler.c
                Trace.c
+               TraverseHeap.c
                WSDeque.c
                Weak.c
                eventlog/EventLog.c