19 files changed, 1040 insertions, 172 deletions
diff --git a/rts/CheckUnload.c b/rts/CheckUnload.c
index 2c01113f63..34f976db4d 100644
--- a/rts/CheckUnload.c
+++ b/rts/CheckUnload.c
@@ -271,7 +271,8 @@ void checkUnload (StgClosure *static_objects)
 
   addrs = allocHashTable();
 
-  for (p = static_objects; p != END_OF_STATIC_LIST; p = link) {
+  for (p = static_objects; p != END_OF_STATIC_OBJECT_LIST; p = link) {
+      p = UNTAG_STATIC_LIST_PTR(p);
       checkAddress(addrs, p);
       info = get_itbl(p);
       link = *STATIC_LINK(info, p);
@@ -279,8 +280,9 @@ void checkUnload (StgClosure *static_objects)
 
   // CAFs on revertible_caf_list are not on static_objects
   for (p = (StgClosure*)revertible_caf_list;
-       p != END_OF_STATIC_LIST;
+       p != END_OF_CAF_LIST;
        p = ((StgIndStatic *)p)->static_link) {
+      p = UNTAG_STATIC_LIST_PTR(p);
       checkAddress(addrs, p);
   }
 
diff --git a/rts/RetainerProfile.c b/rts/RetainerProfile.c
index 78daa892ad..ba58c199f0 100644
--- a/rts/RetainerProfile.c
+++ b/rts/RetainerProfile.c
@@ -1881,7 +1881,8 @@ resetStaticObjectForRetainerProfiling( StgClosure *static_objects )
     count = 0;
 #endif
     p = static_objects;
-    while (p != END_OF_STATIC_LIST) {
+    while (p != END_OF_STATIC_OBJECT_LIST) {
+        p = UNTAG_STATIC_LIST_PTR(p);
 #ifdef DEBUG_RETAINER
         count++;
 #endif
diff --git a/rts/Sparks.c b/rts/Sparks.c
index ada2adfd3a..ec075805bf 100644
--- a/rts/Sparks.c
+++ b/rts/Sparks.c
@@ -14,6 +14,7 @@
 #include "Trace.h"
 #include "Prelude.h"
 #include "Sparks.h"
+#include "sm/HeapAlloc.h"
 
 #if defined(THREADED_RTS)
 
diff --git a/rts/posix/OSMem.c b/rts/posix/OSMem.c
index 359df7022b..125ae10367 100644
--- a/rts/posix/OSMem.c
+++ b/rts/posix/OSMem.c
@@ -13,6 +13,7 @@
 
 #include "RtsUtils.h"
 #include "sm/OSMem.h"
+#include "sm/HeapAlloc.h"
 
 #ifdef HAVE_UNISTD_H
 #include <unistd.h>
@@ -72,23 +73,67 @@ void osMemInit(void)
 
    -------------------------------------------------------------------------- */
 
-// A wrapper around mmap(), to abstract away from OS differences in
-// the mmap() interface.
+/*
+ A wrapper around mmap(), to abstract away from OS differences in
+ the mmap() interface.
+
+ It supports the following operations:
+ - reserve: find a new chunk of available address space, and make it so
+            that we own it (no other library will get it), but don't actually
+            allocate memory for it
+            the addr is a hint for where to place the memory (and most
+            of the time the OS happily ignores!)
+ - commit: given a chunk of address space that we know we own, make sure
+           there is some memory backing it
+           the addr is not a hint, it must point into previously reserved
+           address space, or bad things happen
+ - reserve&commit: do both at the same time
+
+ The naming is chosen from the Win32 API (VirtualAlloc) which does the
+ same thing and has done so forever, while support for this in Unix systems
+ has only been added recently and is hidden in the posix portability mess.
+ It is confusing because to get the reserve behavior we need MAP_NORESERVE
+ (which tells the kernel not to allocate backing space), but heh...
+*/
+enum
+{
+    MEM_RESERVE = 1,
+    MEM_COMMIT = 2,
+    MEM_RESERVE_AND_COMMIT = MEM_RESERVE | MEM_COMMIT
+};
 
 static void *
-my_mmap (void *addr, W_ size)
+my_mmap (void *addr, W_ size, int operation)
 {
     void *ret;
+    int prot, flags;
+
+    if (operation & MEM_COMMIT)
+        prot = PROT_READ | PROT_WRITE;
+    else
+        prot = PROT_NONE;
+    if (operation == MEM_RESERVE)
+        flags = MAP_NORESERVE;
+    else if (operation == MEM_COMMIT)
+        flags = MAP_FIXED;
+    else
+        flags = 0;
 
 #if defined(solaris2_HOST_OS) || defined(irix_HOST_OS)
     {
-        int fd = open("/dev/zero",O_RDONLY);
-        ret = mmap(addr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
-        close(fd);
+        if (operation & MEM_RESERVE)
+        {
+            int fd = open("/dev/zero",O_RDONLY);
+            ret = mmap(addr, size, prot, flags | MAP_PRIVATE, fd, 0);
+            close(fd);
+        }
+        else
+        {
+            ret = mmap(addr, size, prot, flags | MAP_PRIVATE, -1, 0);
+        }
     }
 #elif hpux_HOST_OS
-    ret = mmap(addr, size, PROT_READ | PROT_WRITE,
-               MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+    ret = mmap(addr, size, prot, flags | MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 #elif darwin_HOST_OS
     // Without MAP_FIXED, Apple's mmap ignores addr.
     // With MAP_FIXED, it overwrites already mapped regions, whic
@@ -100,10 +145,16 @@ my_mmap (void *addr, W_ size)
 
     kern_return_t err = 0;
     ret = addr;
-    if(addr)    // try to allocate at address
-        err = vm_allocate(mach_task_self(),(vm_address_t*) &ret, size, FALSE);
-    if(!addr || err)    // try to allocate anywhere
-        err = vm_allocate(mach_task_self(),(vm_address_t*) &ret, size, TRUE);
+
+    if(operation & MEM_RESERVE)
+    {
+        if(addr)    // try to allocate at address
+            err = vm_allocate(mach_task_self(),(vm_address_t*) &ret,
+                              size, FALSE);
+        if(!addr || err)    // try to allocate anywhere
+            err = vm_allocate(mach_task_self(),(vm_address_t*) &ret,
+                              size, TRUE);
+    }
 
     if(err) {
         // don't know what the error codes mean exactly, assume it's
@@ -111,23 +162,24 @@ my_mmap (void *addr, W_ size)
         errorBelch("memory allocation failed (requested %" FMT_Word " bytes)",
                    size);
         stg_exit(EXIT_FAILURE);
-    } else {
+    }
+
+    if(operation & MEM_COMMIT) {
         vm_protect(mach_task_self(), (vm_address_t)ret, size, FALSE,
                    VM_PROT_READ|VM_PROT_WRITE);
     }
+
 #elif linux_HOST_OS
-    ret = mmap(addr, size, PROT_READ | PROT_WRITE,
-               MAP_ANON | MAP_PRIVATE, -1, 0);
+    ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
     if (ret == (void *)-1 && errno == EPERM) {
         // Linux may return EPERM if it tried to give us
         // a chunk of address space below mmap_min_addr,
         // See Trac #7500.
-        if (addr != 0) {
+        if (addr != 0 && (operation & MEM_RESERVE)) {
             // Try again with no hint address.
             // It's not clear that this can ever actually help,
             // but since our alternative is to abort, we may as well try.
-            ret = mmap(0, size, PROT_READ | PROT_WRITE,
-                       MAP_ANON | MAP_PRIVATE, -1, 0);
+            ret = mmap(0, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
         }
         if (ret == (void *)-1 && errno == EPERM) {
             // Linux is not willing to give us any mapping,
@@ -137,8 +189,7 @@ my_mmap (void *addr, W_ size)
         }
     }
 #else
-    ret = mmap(addr, size, PROT_READ | PROT_WRITE,
-               MAP_ANON | MAP_PRIVATE, -1, 0);
+    ret = mmap(addr, size, prot, flags | MAP_ANON | MAP_PRIVATE, -1, 0);
 #endif
 
     if (ret == (void *)-1) {
@@ -168,7 +219,7 @@ gen_map_mblocks (W_ size)
     // Try to map a larger block, and take the aligned portion from
     // it (unmap the rest).
     size += MBLOCK_SIZE;
-    ret = my_mmap(0, size);
+    ret = my_mmap(0, size, MEM_RESERVE_AND_COMMIT);
 
     // unmap the slop bits around the chunk we allocated
     slop = (W_)ret & MBLOCK_MASK;
@@ -207,7 +258,7 @@ osGetMBlocks(nat n)
       // use gen_map_mblocks the first time.
       ret = gen_map_mblocks(size);
   } else {
-      ret = my_mmap(next_request, size);
+      ret = my_mmap(next_request, size, MEM_RESERVE_AND_COMMIT);
 
       if (((W_)ret & MBLOCK_MASK) != 0) {
           // misaligned block!
@@ -244,10 +295,11 @@ void osReleaseFreeMemory(void) {
 void osFreeAllMBlocks(void)
 {
     void *mblock;
+    void *state;
 
-    for (mblock = getFirstMBlock();
+    for (mblock = getFirstMBlock(&state);
          mblock != NULL;
-         mblock = getNextMBlock(mblock)) {
+         mblock = getNextMBlock(&state, mblock)) {
         munmap(mblock, MBLOCK_SIZE);
     }
 }
@@ -318,3 +370,103 @@ void setExecutable (void *p, W_ len, rtsBool exec)
         barf("setExecutable: failed to protect 0x%p\n", p);
     }
 }
+
+#ifdef USE_LARGE_ADDRESS_SPACE
+
+static void *
+osTryReserveHeapMemory (void *hint)
+{
+    void *base, *top;
+    void *start, *end;
+
+    /* We try to allocate MBLOCK_SPACE_SIZE + MBLOCK_SIZE,
+       because we need memory which is MBLOCK_SIZE aligned,
+       and then we discard what we don't need */
+
+    base = my_mmap(hint, MBLOCK_SPACE_SIZE + MBLOCK_SIZE, MEM_RESERVE);
+    top = (void*)((W_)base + MBLOCK_SPACE_SIZE + MBLOCK_SIZE);
+
+    if (((W_)base & MBLOCK_MASK) != 0) {
+        start = MBLOCK_ROUND_UP(base);
+        end = MBLOCK_ROUND_DOWN(top);
+        ASSERT(((W_)end - (W_)start) == MBLOCK_SPACE_SIZE);
+
+        if (munmap(base, (W_)start-(W_)base) < 0) {
+            sysErrorBelch("unable to release slop before heap");
+        }
+        if (munmap(end, (W_)top-(W_)end) < 0) {
+            sysErrorBelch("unable to release slop after heap");
+        }
+    } else {
+        start = base;
+    }
+
+    return start;
+}
+
+void *osReserveHeapMemory(void)
+{
+    int attempt;
+    void *at;
+
+    /* We want to ensure the heap starts at least 8 GB inside the address space,
+       to make sure that any dynamically loaded code will be close enough to the
+       original code so that short relocations will work. This is in particular
+       important on Darwin/Mach-O, because object files not compiled as shared
+       libraries are position independent but cannot be loaded about 4GB.
+
+       We do so with a hint to the mmap, and we verify the OS satisfied our
+       hint. We loop a few times in case there is already something allocated
+       there, but we bail if we cannot allocate at all.
+    */
+
+    attempt = 0;
+    do {
+        at = osTryReserveHeapMemory((void*)((W_)8 * (1 << 30) +
+                                            attempt * BLOCK_SIZE));
+    } while ((W_)at < ((W_)8 * (1 << 30)));
+
+    return at;
+}
+
+void osCommitMemory(void *at, W_ size)
+{
+    my_mmap(at, size, MEM_COMMIT);
+}
+
+void osDecommitMemory(void *at, W_ size)
+{
+    int r;
+
+    // First make the memory unaccessible (so that we get a segfault
+    // at the next attempt to touch it)
+    // We only do this in DEBUG because it forces the OS to remove
+    // all MMU entries for this page range, and there is no reason
+    // to do so unless there is memory pressure
+#ifdef DEBUG
+    r = mprotect(at, size, PROT_NONE);
+    if(r < 0)
+        sysErrorBelch("unable to make released memory unaccessible");
+#endif
+
+#ifdef MADV_FREE
+    // Try MADV_FREE first, FreeBSD has both and MADV_DONTNEED
+    // just swaps memory out
+    r = madvise(at, size, MADV_FREE);
+#else
+    r = madvise(at, size, MADV_DONTNEED);
+#endif
+    if(r < 0)
+        sysErrorBelch("unable to decommit memory");
+}
+
+void osReleaseHeapMemory(void)
+{
+    int r;
+
+    r = munmap((void*)mblock_address_space_begin, MBLOCK_SPACE_SIZE);
+    if(r < 0)
+        sysErrorBelch("unable to release address space");
+}
+
+#endif
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c
index c2a5913963..e721fb13b6 100644
--- a/rts/sm/BlockAlloc.c
+++ b/rts/sm/BlockAlloc.c
@@ -736,7 +736,14 @@ void returnMemoryToOS(nat n /* megablocks */)
     }
     free_mblock_list = bd;
 
-    osReleaseFreeMemory();
+    // Ask the OS to release any address space portion
+    // that was associated with the just released MBlocks
+    //
+    // Historically, we used to ask the OS directly (via
+    // osReleaseFreeMemory()) - now the MBlock layer might
+    // have a reason to preserve the address space range,
+    // so we keep it
+    releaseFreeMemory();
 
     IF_DEBUG(gc,
         if (n != 0) {
@@ -869,11 +876,12 @@ void
 reportUnmarkedBlocks (void)
 {
     void *mblock;
+    void *state;
     bdescr *bd;
 
     debugBelch("Unreachable blocks:\n");
-    for (mblock = getFirstMBlock(); mblock != NULL;
-         mblock = getNextMBlock(mblock)) {
+    for (mblock = getFirstMBlock(&state); mblock != NULL;
+         mblock = getNextMBlock(&state, mblock)) {
         for (bd = FIRST_BDESCR(mblock); bd <= LAST_BDESCR(mblock); ) {
             if (!(bd->flags & BF_KNOWN) && bd->free != (P_)-1) {
                 debugBelch("  %p\n",bd);
diff --git a/rts/sm/Compact.c b/rts/sm/Compact.c
index a053dc3b4e..4ee88da11c 100644
--- a/rts/sm/Compact.c
+++ b/rts/sm/Compact.c
@@ -197,8 +197,8 @@ thread_static( StgClosure* p )
 
   // keep going until we've threaded all the objects on the linked
   // list...
-  while (p != END_OF_STATIC_LIST) {
-
+  while (p != END_OF_STATIC_OBJECT_LIST) {
+    p = UNTAG_STATIC_LIST_PTR(p);
     info = get_itbl(p);
     switch (info->type) {
 
diff --git a/rts/sm/Evac.c b/rts/sm/Evac.c
index b0ef807768..bc8cb9ad13 100644
--- a/rts/sm/Evac.c
+++ b/rts/sm/Evac.c
@@ -324,6 +324,38 @@ evacuate_large(StgPtr p)
 }
 
 /* ----------------------------------------------------------------------------
+   Evacuate static objects
+
+   When a static object is visited for the first time in this GC, it
+   is chained on to the gct->static_objects list.
+
+   evacuate_static_object (link_field, q)
+     - link_field must be STATIC_LINK(q)
+   ------------------------------------------------------------------------- */
+
+STATIC_INLINE void
+evacuate_static_object (StgClosure **link_field, StgClosure *q)
+{
+    StgWord link = (StgWord)*link_field;
+
+    // See Note [STATIC_LINK fields] for how the link field bits work
+    if ((((StgWord)(link)&STATIC_BITS) | prev_static_flag) != 3) {
+        StgWord new_list_head = (StgWord)q | static_flag;
+#ifndef THREADED_RTS
+        *link_field = gct->static_objects;
+        gct->static_objects = (StgClosure *)new_list_head;
+#else
+        StgWord prev;
+        prev = cas((StgVolatilePtr)link_field, link,
+                   (StgWord)gct->static_objects);
+        if (prev == link) {
+            gct->static_objects = (StgClosure *)new_list_head;
+        }
+#endif
+    }
+}
+
+/* ----------------------------------------------------------------------------
    Evacuate
 
    This is called (eventually) for every live object in the system.
@@ -392,38 +424,13 @@ loop:
 
       case THUNK_STATIC:
           if (info->srt_bitmap != 0) {
-              if (*THUNK_STATIC_LINK((StgClosure *)q) == NULL) {
-#ifndef THREADED_RTS
-                  *THUNK_STATIC_LINK((StgClosure *)q) = gct->static_objects;
-                  gct->static_objects = (StgClosure *)q;
-#else
-                  StgPtr link;
-                  link = (StgPtr)cas((StgPtr)THUNK_STATIC_LINK((StgClosure *)q),
-                                     (StgWord)NULL,
-                                     (StgWord)gct->static_objects);
-                  if (link == NULL) {
-                      gct->static_objects = (StgClosure *)q;
-                  }
-#endif
-              }
+              evacuate_static_object(THUNK_STATIC_LINK((StgClosure *)q), q);
           }
           return;
 
       case FUN_STATIC:
-          if (info->srt_bitmap != 0 &&
-              *FUN_STATIC_LINK((StgClosure *)q) == NULL) {
-#ifndef THREADED_RTS
-              *FUN_STATIC_LINK((StgClosure *)q) = gct->static_objects;
-              gct->static_objects = (StgClosure *)q;
-#else
-              StgPtr link;
-              link = (StgPtr)cas((StgPtr)FUN_STATIC_LINK((StgClosure *)q),
-                                 (StgWord)NULL,
-                                 (StgWord)gct->static_objects);
-              if (link == NULL) {
-                  gct->static_objects = (StgClosure *)q;
-              }
-#endif
+          if (info->srt_bitmap != 0) {
+              evacuate_static_object(FUN_STATIC_LINK((StgClosure *)q), q);
           }
           return;
 
@@ -432,39 +439,11 @@ loop:
            * on the CAF list, so don't do anything with it here (we'll
            * scavenge it later).
            */
-          if (*IND_STATIC_LINK((StgClosure *)q) == NULL) {
-#ifndef THREADED_RTS
-                  *IND_STATIC_LINK((StgClosure *)q) = gct->static_objects;
-                  gct->static_objects = (StgClosure *)q;
-#else
-                  StgPtr link;
-                  link = (StgPtr)cas((StgPtr)IND_STATIC_LINK((StgClosure *)q),
-                                     (StgWord)NULL,
-                                     (StgWord)gct->static_objects);
-                  if (link == NULL) {
-                      gct->static_objects = (StgClosure *)q;
-                  }
-#endif
-          }
+          evacuate_static_object(IND_STATIC_LINK((StgClosure *)q), q);
           return;
 
       case CONSTR_STATIC:
-          if (*STATIC_LINK(info,(StgClosure *)q) == NULL) {
-#ifndef THREADED_RTS
-              *STATIC_LINK(info,(StgClosure *)q) = gct->static_objects;
-              gct->static_objects = (StgClosure *)q;
-#else
-              StgPtr link;
-              link = (StgPtr)cas((StgPtr)STATIC_LINK(info,(StgClosure *)q),
-                                 (StgWord)NULL,
-                                 (StgWord)gct->static_objects);
-              if (link == NULL) {
-                  gct->static_objects = (StgClosure *)q;
-              }
-#endif
-          }
-          /* I am assuming that static_objects pointers are not
-           * written to other objects, and thus, no need to retag. */
+          evacuate_static_object(STATIC_LINK(info,(StgClosure *)q), q);
           return;
 
       case CONSTR_NOCAF_STATIC:
diff --git a/rts/sm/GC.c b/rts/sm/GC.c
index 52d7f98fa0..e6a23395eb 100644
--- a/rts/sm/GC.c
+++ b/rts/sm/GC.c
@@ -134,6 +134,9 @@ long copied;        // *words* copied & scavenged during this GC
 
 rtsBool work_stealing;
 
+nat static_flag = STATIC_FLAG_B;
+nat prev_static_flag = STATIC_FLAG_A;
+
 DECLARE_GCT
 
 /* -----------------------------------------------------------------------------
@@ -141,7 +144,6 @@ DECLARE_GCT
    -------------------------------------------------------------------------- */
 
 static void mark_root               (void *user, StgClosure **root);
-static void zero_static_object_list (StgClosure* first_static);
 static void prepare_collected_gen   (generation *gen);
 static void prepare_uncollected_gen (generation *gen);
 static void init_gc_thread          (gc_thread *t);
@@ -246,6 +248,12 @@ GarbageCollect (nat collect_gen,
   N = collect_gen;
   major_gc = (N == RtsFlags.GcFlags.generations-1);
 
+  if (major_gc) {
+      prev_static_flag = static_flag;
+      static_flag =
+          static_flag == STATIC_FLAG_A ? STATIC_FLAG_B : STATIC_FLAG_A;
+  }
+
 #if defined(THREADED_RTS)
   work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled &&
                   N >= RtsFlags.ParFlags.parGcLoadBalancingGen;
@@ -672,20 +680,6 @@ GarbageCollect (nat collect_gen,
   resetStaticObjectForRetainerProfiling(gct->scavenged_static_objects);
 #endif
 
-  // zero the scavenged static object list
-  if (major_gc) {
-      nat i;
-      if (n_gc_threads == 1) {
-          zero_static_object_list(gct->scavenged_static_objects);
-      } else {
-          for (i = 0; i < n_gc_threads; i++) {
-              if (!gc_threads[i]->idle) {
-                  zero_static_object_list(gc_threads[i]->scavenged_static_objects);
-              }
-          }
-      }
-  }
-
   // Start any pending finalizers.  Must be after
   // updateStableTables() and stableUnlock() (see #4221).
   RELEASE_SM_LOCK;
@@ -1427,8 +1421,8 @@ collect_pinned_object_blocks (void)
 static void
 init_gc_thread (gc_thread *t)
 {
-    t->static_objects = END_OF_STATIC_LIST;
-    t->scavenged_static_objects = END_OF_STATIC_LIST;
+    t->static_objects = END_OF_STATIC_OBJECT_LIST;
+    t->scavenged_static_objects = END_OF_STATIC_OBJECT_LIST;
     t->scan_bd = NULL;
     t->mut_lists = t->cap->mut_lists;
     t->evac_gen_no = 0;
@@ -1465,24 +1459,6 @@ mark_root(void *user USED_IF_THREADS, StgClosure **root)
     SET_GCT(saved_gct);
 }
 
-/* -----------------------------------------------------------------------------
-   Initialising the static object & mutable lists
-   -------------------------------------------------------------------------- */
-
-static void
-zero_static_object_list(StgClosure* first_static)
-{
-  StgClosure* p;
-  StgClosure* link;
-  const StgInfoTable *info;
-
-  for (p = first_static; p != END_OF_STATIC_LIST; p = link) {
-    info = get_itbl(p);
-    link = *STATIC_LINK(info, p);
-    *STATIC_LINK(info,p) = NULL;
-  }
-}
-
 /* ----------------------------------------------------------------------------
    Reset the sizes of the older generations when we do a major
    collection.
@@ -1728,7 +1704,7 @@ static void gcCAFs(void)
     p = debug_caf_list;
     prev = NULL;
 
-    for (p = debug_caf_list; p != (StgIndStatic*)END_OF_STATIC_LIST;
+    for (p = debug_caf_list; p != (StgIndStatic*)END_OF_CAF_LIST;
          p = (StgIndStatic*)p->saved_info) {
 
         info = get_itbl((StgClosure*)p);
diff --git a/rts/sm/GC.h b/rts/sm/GC.h
index 571aa07110..5744eb95a8 100644
--- a/rts/sm/GC.h
+++ b/rts/sm/GC.h
@@ -16,6 +16,8 @@
 
 #include "BeginPrivate.h"
 
+#include "HeapAlloc.h"
+
 void GarbageCollect (rtsBool force_major_gc,
                      rtsBool do_heap_census,
                      nat gc_type, Capability *cap);
diff --git a/rts/sm/GCAux.c b/rts/sm/GCAux.c
index 13316e4d29..d3cbdaefb4 100644
--- a/rts/sm/GCAux.c
+++ b/rts/sm/GCAux.c
@@ -118,14 +118,14 @@ revertCAFs( void )
     StgIndStatic *c;
 
     for (c = revertible_caf_list;
-         c != (StgIndStatic *)END_OF_STATIC_LIST;
+         c != (StgIndStatic *)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
         SET_INFO((StgClosure *)c, c->saved_info);
         c->saved_info = NULL;
         // could, but not necessary: c->static_link = NULL;
     }
-    revertible_caf_list = (StgIndStatic*)END_OF_STATIC_LIST;
+    revertible_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
 }
 
 void
@@ -134,15 +134,17 @@ markCAFs (evac_fn evac, void *user)
     StgIndStatic *c;
 
     for (c = dyn_caf_list;
-         c != (StgIndStatic*)END_OF_STATIC_LIST;
+         c != (StgIndStatic*)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
+        c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
         evac(user, &c->indirectee);
     }
     for (c = revertible_caf_list;
-         c != (StgIndStatic*)END_OF_STATIC_LIST;
+         c != (StgIndStatic*)END_OF_CAF_LIST;
          c = (StgIndStatic *)c->static_link)
     {
+        c = (StgIndStatic *)UNTAG_STATIC_LIST_PTR(c);
         evac(user, &c->indirectee);
     }
 }
diff --git a/rts/sm/GCThread.h b/rts/sm/GCThread.h
index cbe4346afe..d42b89f973 100644
--- a/rts/sm/GCThread.h
+++ b/rts/sm/GCThread.h
@@ -131,8 +131,11 @@ typedef struct gc_thread_ {
                                    //  during GC without accessing the block
                                    //   allocators spin lock. 
 
-    StgClosure* static_objects;      // live static objects
-    StgClosure* scavenged_static_objects;   // static objects scavenged so far
+    // These two lists are chained through the STATIC_LINK() fields of static
+    // objects.  Pointers are tagged with the current static_flag, so before
+    // following a pointer, untag it with UNTAG_STATIC_LIST_PTR().
+    StgClosure* static_objects;            // live static objects
+    StgClosure* scavenged_static_objects;  // static objects scavenged so far
 
     W_ gc_count;                 // number of GCs this thread has done
 
diff --git a/rts/sm/HeapAlloc.h b/rts/sm/HeapAlloc.h
new file mode 100644
index 0000000000..c914b5db40
--- /dev/null
+++ b/rts/sm/HeapAlloc.h
@@ -0,0 +1,224 @@
+/* -----------------------------------------------------------------------------
+ *
+ * (c) The University of Glasgow 2006-2008
+ *
+ * The HEAP_ALLOCED() test.
+ *
+ * ---------------------------------------------------------------------------*/
+
+#ifndef SM_HEAP_ALLOC_H
+#define SM_HEAP_ALLOC_H
+
+#include "BeginPrivate.h"
+
+/* -----------------------------------------------------------------------------
+   The HEAP_ALLOCED() test.
+
+   HEAP_ALLOCED is called FOR EVERY SINGLE CLOSURE during GC.
+   It needs to be FAST.
+
+   See wiki commentary at
+     http://ghc.haskell.org/trac/ghc/wiki/Commentary/HeapAlloced
+
+   Implementation of HEAP_ALLOCED
+   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+   Since heap is allocated in chunks of megablocks (MBLOCK_SIZE), we
+   can just use a table to record which megablocks in the address
+   space belong to the heap.  On a 32-bit machine, with 1Mb
+   megablocks, using 8 bits for each entry in the table, the table
+   requires 4k.  Lookups during GC will be fast, because the table
+   will be quickly cached (indeed, performance measurements showed no
+   measurable difference between doing the table lookup and using a
+   constant comparison).
+
+   On 64-bit machines, we have two possibilities. One is to request
+   a single chunk of address space that we deem "large enough"
+   (currently 1TB, could easily be extended to, say 16TB or more).
+   Memory from that chunk is GC memory, everything else is not. This
+   case is tricky in that it requires support from the OS to allocate
+   address space without allocating memory (in practice, all modern
+   OSes do this). It's also tricky in that it is the only case where
+   a successful HEAP_ALLOCED(p) check can trigger a segfault when
+   accessing p (and for debugging purposes, it will).
+
+   Alternatively, the older implementation caches one 12-bit block map
+   that describes 4096 megablocks or 4GB of memory. If HEAP_ALLOCED is
+   called for an address that is not in the cache, it calls
+   slowIsHeapAlloced (see MBlock.c) which will find the block map for
+   the 4GB block in question.
+   -------------------------------------------------------------------------- */
+
+#ifdef USE_LARGE_ADDRESS_SPACE
+
+extern W_ mblock_address_space_begin;
+# define MBLOCK_SPACE_SIZE      ((StgWord)1 << 40) /* 1 TB */
+# define HEAP_ALLOCED(p)        ((W_)(p) >= mblock_address_space_begin && \
+                                 (W_)(p) < (mblock_address_space_begin +  \
+                                            MBLOCK_SPACE_SIZE))
+# define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)
+
+#elif SIZEOF_VOID_P == 4
+extern StgWord8 mblock_map[];
+
+/* On a 32-bit machine a 4KB table is always sufficient */
+# define MBLOCK_MAP_SIZE        4096
+# define MBLOCK_MAP_ENTRY(p)    ((StgWord)(p) >> MBLOCK_SHIFT)
+# define HEAP_ALLOCED(p)        mblock_map[MBLOCK_MAP_ENTRY(p)]
+# define HEAP_ALLOCED_GC(p)     HEAP_ALLOCED(p)
+
+/* -----------------------------------------------------------------------------
+   HEAP_ALLOCED for 64-bit machines (without LARGE_ADDRESS_SPACE).
+
+ Here are some cache layout options:
+
+ [1]
+ 16KB cache of 16-bit entries, 1MB lines (capacity 8GB)
+  mblock size =          20 bits
+  entries   =     8192   13 bits
+  line size =             0 bits (1 bit of value)
+  tag size  =            15 bits
+                       = 48 bits
+
+ [2]
+ 32KB cache of 16-bit entries, 4MB lines (capacity 32GB)
+  mblock size =          20 bits
+  entries   =    16384   14 bits
+  line size =             2 bits (4 bits of value)
+  tag size  =            12 bits
+                       = 48 bits
+
+ [3]
+ 16KB cache of 16-bit entries, 2MB lines (capacity 16GB)
+  mblock size =          20 bits
+  entries   =    8192    13 bits
+  line size =             1 bits (2 bits of value)
+  tag size  =            14 bits
+                       = 48 bits
+
+ [4]
+ 4KB cache of 32-bit entries, 16MB lines (capacity 16GB)
+  mblock size =          20 bits
+  entries   =     1024   10 bits
+  line size =             4 bits (16 bits of value)
+  tag size  =            14 bits
+                       = 48 bits
+
+ [5]
+ 4KB cache of 64-bit entries, 32MB lines (capacity 16GB)
+  mblock size =          20 bits
+  entries   =     512     9 bits
+  line size =             5 bits (32 bits of value)
+  tag size  =            14 bits
+                       = 48 bits
+
+ We actually use none of the above.  After much experimentation it was
+ found that optimising the lookup is the most important factor,
+ followed by reducing the number of misses.  To that end, we use a
+ variant of [1] in which each cache entry is ((mblock << 1) + value)
+ where value is 0 for non-heap and 1 for heap.  The cache entries can
+ be 32 bits, since the mblock number is 48-20 = 28 bits, and we need
+ 1 bit for the value.  The cache can be as big as we like, but
+ currently we use 8k entries, giving us 8GB capacity.
+
+ ---------------------------------------------------------------------------- */
+
+#elif SIZEOF_VOID_P == 8
+
+#define MBC_LINE_BITS 0
+#define MBC_TAG_BITS 15
+
+#if x86_64_HOST_ARCH
+// 32bits are enough for 'entry' as modern amd64 boxes have
+// only 48bit sized virtual addres.
+typedef StgWord32 MbcCacheLine;
+#else
+// 32bits is not enough here as some arches (like ia64) use
+// upper address bits to distinct memory areas.
+typedef StgWord64 MbcCacheLine;
+#endif
+
+typedef StgWord8  MBlockMapLine;
+
+#define MBLOCK_MAP_LINE(p)  (((StgWord)p & 0xffffffff) >> (MBLOCK_SHIFT + MBC_LINE_BITS))
+
+#define MBC_LINE_SIZE  (1<<MBC_LINE_BITS)
+#define MBC_SHIFT      (48 - MBLOCK_SHIFT - MBC_LINE_BITS - MBC_TAG_BITS)
+#define MBC_ENTRIES    (1<<MBC_SHIFT)
+
+extern MbcCacheLine mblock_cache[];
+
+#define MBC_LINE(p) ((StgWord)p >> (MBLOCK_SHIFT + MBC_LINE_BITS))
+
+#define MBLOCK_MAP_ENTRIES  (1 << (32 - MBLOCK_SHIFT - MBC_LINE_BITS))
+
+typedef struct {
+    StgWord32    addrHigh32;
+    MBlockMapLine lines[MBLOCK_MAP_ENTRIES];
+} MBlockMap;
+
+extern W_ mpc_misses;
+
+StgBool HEAP_ALLOCED_miss(StgWord mblock, void *p);
+
+INLINE_HEADER
+StgBool HEAP_ALLOCED(void *p)
+{
+    StgWord mblock;
+    nat entry_no;
+    MbcCacheLine entry, value;
+
+    mblock   = (StgWord)p >> MBLOCK_SHIFT;
+    entry_no = mblock & (MBC_ENTRIES-1);
+    entry    = mblock_cache[entry_no];
+    value    = entry ^ (mblock << 1);
+    // this formulation coaxes gcc into prioritising the value==1
+    // case, which we expect to be the most common.
+    // __builtin_expect() didn't have any useful effect (gcc-4.3.0).
+    if (value == 1) {
+        return 1;
+    } else if (value == 0) {
+        return 0;
+    } else {
+        // putting the rest out of line turned out to be a slight
+        // performance improvement:
+        return HEAP_ALLOCED_miss(mblock,p);
+    }
+}
+
+// In the parallel GC, the cache itself is safe to *read*, and can be
+// updated atomically, but we need to place a lock around operations
+// that touch the MBlock map.
+INLINE_HEADER
+StgBool HEAP_ALLOCED_GC(void *p)
+{
+    StgWord mblock;
+    nat entry_no;
+    MbcCacheLine entry, value;
+    StgBool b;
+
+    mblock   = (StgWord)p >> MBLOCK_SHIFT;
+    entry_no = mblock & (MBC_ENTRIES-1);
+    entry    = mblock_cache[entry_no];
+    value    = entry ^ (mblock << 1);
+    if (value == 1) {
+        return 1;
+    } else if (value == 0) {
+        return 0;
+    } else {
+        // putting the rest out of line turned out to be a slight
+        // performance improvement:
+        ACQUIRE_SPIN_LOCK(&gc_alloc_block_sync);
+        b = HEAP_ALLOCED_miss(mblock,p);
+        RELEASE_SPIN_LOCK(&gc_alloc_block_sync);
+        return b;
+    }
+}
+
+#else
+# error HEAP_ALLOCED not defined
+#endif
+
+#include "EndPrivate.h"
+
+#endif /* SM_HEAP_ALLOC_H */
diff --git a/rts/sm/MBlock.c b/rts/sm/MBlock.c
index f626e1f43b..35a11bf589 100644
--- a/rts/sm/MBlock.c
+++ b/rts/sm/MBlock.c
@@ -23,9 +23,320 @@ W_ mblocks_allocated = 0;
 W_ mpc_misses = 0;
 
 /* -----------------------------------------------------------------------------
-   The MBlock Map: provides our implementation of HEAP_ALLOCED()
+   The MBlock Map: provides our implementation of HEAP_ALLOCED() and the
+   utilities to walk the really allocated (thus accessible without risk of
+   segfault) heap
    -------------------------------------------------------------------------- */
 
+/*
+  There are two different cases here: either we use "large address
+  space" (which really means two-step allocation), so we have to
+  manage which memory is good (= accessible without fear of segfault)
+  and which is not owned by us, or we use the older method and get
+  good memory straight from the system.
+
+  Both code paths need to provide:
+
+  void *getFirstMBlock(void ** state)
+      return the first (lowest address) mblock
+      that was actually committed
+
+  void *getNextMBlock(void ** state, void * mblock)
+      return the first (lowest address) mblock
+      that was committed, after the given one
+
+  For both these calls, @state is an in-out parameter that points to
+  an opaque state threading the calls togheter. The calls should only
+  be used in an interation fashion. Pass NULL if @state is not
+  interesting,or pass a pointer to NULL if you don't have a state.
+
+  void *getCommittedMBlocks(nat n)
+      return @n new mblocks, ready to be used (reserved and committed)
+
+  void *decommitMBlocks(char *addr, nat n)
+      release memory for @n mblocks, starting at the given address
+
+  void releaseFreeMemory()
+      potentially release any address space that was associated
+      with recently decommitted blocks
+*/
+
+#ifdef USE_LARGE_ADDRESS_SPACE
+
+// Large address space means we use two-step allocation: reserve
+// something large upfront, and then commit as needed
+// (This is normally only useful on 64-bit, where we can assume
+// that reserving 1TB is possible)
+//
+// There is no block map in this case, but there is a free list
+// of blocks that were committed and decommitted at least once,
+// which we use to choose which block to commit next in the already
+// reserved space.
+//
+// We cannot let the OS choose it as we do in the
+// non large address space case, because the committing wants to
+// know the exact address upfront.
+//
+// The free list is coalesced and ordered, which means that
+// allocate and free are worst-case O(n), but benchmarks have shown
+// that this is not a significant problem, because large (>=2MB)
+// allocations are infrequent and their time is mostly insignificant
+// compared to the time to use that memory.
+//
+// The free list is stored in malloc()'d memory, unlike the other free
+// lists in BlockAlloc.c which are stored in block descriptors,
+// because we cannot touch the contents of decommitted mblocks.
+
+typedef struct free_list {
+    struct free_list *prev;
+    struct free_list *next;
+    W_ address;
+    W_ size;
+} free_list;
+
+static free_list *free_list_head;
+static W_ mblock_high_watermark;
+W_ mblock_address_space_begin = 0;
+
+static void *getAllocatedMBlock(free_list **start_iter, W_ startingAt)
+{
+    free_list *iter;
+    W_ p = startingAt;
+
+    for (iter = *start_iter; iter != NULL; iter = iter->next)
+    {
+        if (p < iter->address)
+            break;
+
+        if (p == iter->address)
+            p += iter->size;
+    }
+
+    *start_iter = iter;
+
+    if (p >= mblock_high_watermark)
+        return NULL;
+
+    return (void*)p;
+}
+
+void * getFirstMBlock(void **state STG_UNUSED)
+{
+    free_list *fake_state;
+    free_list **casted_state;
+
+    if (state)
+        casted_state = (free_list**)state;
+    else
+        casted_state = &fake_state;
+
+    *casted_state = free_list_head;
+    return getAllocatedMBlock(casted_state, mblock_address_space_begin);
+}
+
+void * getNextMBlock(void **state STG_UNUSED, void *mblock)
+{
+    free_list *fake_state = free_list_head;
+    free_list **casted_state;
+
+    if (state)
+        casted_state = (free_list**)state;
+    else
+        casted_state = &fake_state;
+
+    return getAllocatedMBlock(casted_state, (W_)mblock + MBLOCK_SIZE);
+}
+
+static void *getReusableMBlocks(nat n)
+{
+    struct free_list *iter;
+    W_ size = MBLOCK_SIZE * (W_)n;
+
+    for (iter = free_list_head; iter != NULL; iter = iter->next) {
+        void *addr;
+
+        if (iter->size < size)
+            continue;
+
+        addr = (void*)iter->address;
+        iter->address += size;
+        iter->size -= size;
+        if (iter->size == 0) {
+            struct free_list *prev, *next;
+
+            prev = iter->prev;
+            next = iter->next;
+            if (prev == NULL) {
+                ASSERT(free_list_head == iter);
+                free_list_head = next;
+            } else {
+                prev->next = next;
+            }
+            if (next != NULL) {
+                next->prev = prev;
+            }
+            stgFree(iter);
+        }
+
+        osCommitMemory(addr, size);
+        return addr;
+    }
+
+    return NULL;
+}
+
+static void *getFreshMBlocks(nat n)
+{
+    W_ size = MBLOCK_SIZE * (W_)n;
+    void *addr = (void*)mblock_high_watermark;
+
+    if (mblock_high_watermark + size >
+        mblock_address_space_begin + MBLOCK_SPACE_SIZE)
+    {
+        // whoa, 1 TB of heap?
+        errorBelch("out of memory");
+        stg_exit(EXIT_HEAPOVERFLOW);
+    }
+
+    osCommitMemory(addr, size);
+    mblock_high_watermark += size;
+    return addr;
+}
+
+static void *getCommittedMBlocks(nat n)
+{
+    void *p;
+
+    p = getReusableMBlocks(n);
+    if (p == NULL) {
+        p = getFreshMBlocks(n);
+    }
+
+    ASSERT(p != NULL && p != (void*)-1);
+    return p;
+}
+
+static void decommitMBlocks(char *addr, nat n)
+{
+    struct free_list *iter, *prev;
+    W_ size = MBLOCK_SIZE * (W_)n;
+    W_ address = (W_)addr;
+
+    osDecommitMemory(addr, size);
+
+    prev = NULL;
+    for (iter = free_list_head; iter != NULL; iter = iter->next)
+    {
+        prev = iter;
+
+        if (iter->address + iter->size < address)
+            continue;
+
+        if (iter->address + iter->size == address) {
+            iter->size += size;
+
+            if (address + size == mblock_high_watermark) {
+                mblock_high_watermark -= iter->size;
+                if (iter->prev) {
+                    iter->prev->next = NULL;
+                } else {
+                    ASSERT(iter == free_list_head);
+                    free_list_head = NULL;
+                }
+                stgFree(iter);
+                return;
+            }
+
+            if (iter->next &&
+                iter->next->address == iter->address + iter->size) {
+                struct free_list *next;
+
+                next = iter->next;
+                iter->size += next->size;
+                iter->next = next->next;
+
+                if (iter->next) {
+                    iter->next->prev = iter;
+
+                    /* We don't need to consolidate more */
+                    ASSERT(iter->next->address > iter->address + iter->size);
+                }
+
+                stgFree(next);
+            }
+            return;
+        } else if (address + size == iter->address) {
+            iter->address = address;
+            iter->size += size;
+
+            /* We don't need to consolidate backwards
+               (because otherwise it would have been handled by
+               the previous iteration) */
+            if (iter->prev) {
+                ASSERT(iter->prev->address + iter->prev->size < iter->address);
+            }
+            return;
+        } else {
+            struct free_list *new_iter;
+
+            /* All other cases have been handled */
+            ASSERT(iter->address > address + size);
+
+            new_iter = stgMallocBytes(sizeof(struct free_list), "freeMBlocks");
+            new_iter->address = address;
+            new_iter->size = size;
+            new_iter->next = iter;
+            new_iter->prev = iter->prev;
+            if (new_iter->prev) {
+                new_iter->prev->next = new_iter;
+            } else {
+                ASSERT(iter == free_list_head);
+                free_list_head = new_iter;
+            }
+            iter->prev = new_iter;
+            return;
+        }
+    }
+
+    /* We're past the last free list entry, so we must
+       be the highest allocation so far
+    */
+    ASSERT(address + size <= mblock_high_watermark);
+
+    /* Fast path the case of releasing high or all memory */
+    if (address + size == mblock_high_watermark) {
+        mblock_high_watermark -= size;
+    } else {
+        struct free_list *new_iter;
+
+        new_iter = stgMallocBytes(sizeof(struct free_list), "freeMBlocks");
+        new_iter->address = address;
+        new_iter->size = size;
+        new_iter->next = NULL;
+        new_iter->prev = prev;
+        if (new_iter->prev) {
+            ASSERT(new_iter->prev->next == NULL);
+            new_iter->prev->next = new_iter;
+        } else {
+            ASSERT(free_list_head == NULL);
+            free_list_head = new_iter;
+        }
+    }
+}
+
+void releaseFreeMemory(void)
+{
+    // This function exists for releasing address space
+    // on Windows 32 bit
+    //
+    // Do nothing if USE_LARGE_ADDRESS_SPACE, we never want
+    // to release address space
+
+    debugTrace(DEBUG_gc, "mblock_high_watermark: %p\n", mblock_high_watermark);
+}
+
+#else // !USE_LARGE_ADDRESS_SPACE
+
 #if SIZEOF_VOID_P == 4
 StgWord8 mblock_map[MBLOCK_MAP_SIZE]; // initially all zeros
 
@@ -108,6 +419,7 @@ setHeapAlloced(void *p, StgWord8 i)
         mblock_cache[entry_no] = (mblock << 1) + i;
     }
 }
+
 #endif
 
 static void
@@ -130,7 +442,7 @@ void * mapEntryToMBlock(nat i)
     return (void *)((StgWord)i << MBLOCK_SHIFT);
 }
 
-void * getFirstMBlock(void)
+void * getFirstMBlock(void **state STG_UNUSED)
 {
     nat i;
 
@@ -140,7 +452,7 @@ void * getFirstMBlock(void)
     return NULL;
 }
 
-void * getNextMBlock(void *mblock)
+void * getNextMBlock(void **state STG_UNUSED, void *mblock)
 {
     nat i;
 
@@ -152,7 +464,7 @@ void * getNextMBlock(void *mblock)
 
 #elif SIZEOF_VOID_P == 8
 
-void * getNextMBlock(void *p)
+void * getNextMBlock(void **state STG_UNUSED, void *p)
 {
     MBlockMap *map;
     nat off, j;
@@ -189,7 +501,7 @@ void * getNextMBlock(void *p)
     return NULL;
 }
 
-void * getFirstMBlock(void)
+void * getFirstMBlock(void **state STG_UNUSED)
 {
     MBlockMap *map = mblock_maps[0];
     nat line_no, off;
@@ -210,7 +522,38 @@ void * getFirstMBlock(void)
     return NULL;
 }
 
-#endif // SIZEOF_VOID_P
+#endif // SIZEOF_VOID_P == 8
+
+static void *getCommittedMBlocks(nat n)
+{
+    // The OS layer returns committed memory directly
+    void *ret = osGetMBlocks(n);
+    nat i;
+
+    // fill in the table
+    for (i = 0; i < n; i++) {
+        markHeapAlloced( (StgWord8*)ret + i * MBLOCK_SIZE );
+    }
+
+    return ret;
+}
+
+static void decommitMBlocks(void *p, nat n)
+{
+    osFreeMBlocks(p, n);
+    nat i;
+
+    for (i = 0; i < n; i++) {
+        markHeapUnalloced( (StgWord8*)p + i * MBLOCK_SIZE );
+    }
+}
+
+void releaseFreeMemory(void)
+{
+    osReleaseFreeMemory();
+}
+
+#endif /* !USE_LARGE_ADDRESS_SPACE */
 
 /* -----------------------------------------------------------------------------
    Allocate new mblock(s)
@@ -228,18 +571,12 @@ getMBlock(void)
 void *
 getMBlocks(nat n)
 {
-    nat i;
     void *ret;
 
-    ret = osGetMBlocks(n);
+    ret = getCommittedMBlocks(n);
 
     debugTrace(DEBUG_gc, "allocated %d megablock(s) at %p",n,ret);
 
-    // fill in the table
-    for (i = 0; i < n; i++) {
-        markHeapAlloced( (StgWord8*)ret + i * MBLOCK_SIZE );
-    }
-
     mblocks_allocated += n;
     peak_mblocks_allocated = stg_max(peak_mblocks_allocated, mblocks_allocated);
 
@@ -249,17 +586,11 @@ getMBlocks(nat n)
 void
 freeMBlocks(void *addr, nat n)
 {
-    nat i;
-
     debugTrace(DEBUG_gc, "freeing %d megablock(s) at %p",n,addr);
 
     mblocks_allocated -= n;
 
-    for (i = 0; i < n; i++) {
-        markHeapUnalloced( (StgWord8*)addr + i * MBLOCK_SIZE );
-    }
-
-    osFreeMBlocks(addr, n);
+    decommitMBlocks(addr, n);
 }
 
 void
@@ -267,6 +598,22 @@ freeAllMBlocks(void)
 {
     debugTrace(DEBUG_gc, "freeing all megablocks");
 
+#ifdef USE_LARGE_ADDRESS_SPACE
+    {
+        struct free_list *iter, *next;
+
+        for (iter = free_list_head; iter != NULL; iter = next)
+        {
+            next = iter->next;
+            stgFree(iter);
+        }
+    }
+
+    osReleaseHeapMemory();
+
+    mblock_address_space_begin = (W_)-1;
+    mblock_high_watermark = (W_)-1;
+#else
     osFreeAllMBlocks();
 
 #if SIZEOF_VOID_P == 8
@@ -276,13 +623,23 @@ freeAllMBlocks(void)
     }
     stgFree(mblock_maps);
 #endif
+
+#endif
 }
 
 void
 initMBlocks(void)
 {
     osMemInit();
-#if SIZEOF_VOID_P == 8
+
+#ifdef USE_LARGE_ADDRESS_SPACE
+    {
+        void *addr = osReserveHeapMemory();
+
+        mblock_address_space_begin = (W_)addr;
+        mblock_high_watermark = (W_)addr;
+    }
+#elif SIZEOF_VOID_P == 8
     memset(mblock_cache,0xff,sizeof(mblock_cache));
 #endif
 }
diff --git a/rts/sm/OSMem.h b/rts/sm/OSMem.h
index db704fc78b..9a6ccdd7ec 100644
--- a/rts/sm/OSMem.h
+++ b/rts/sm/OSMem.h
@@ -20,6 +20,47 @@ W_ getPageSize (void);
 StgWord64 getPhysicalMemorySize (void);
 void setExecutable (void *p, W_ len, rtsBool exec);
 
+#ifdef USE_LARGE_ADDRESS_SPACE
+
+/*
+  If "large address space" is enabled, we allocate memory in two
+  steps: first we request some address space, and then we request some
+  memory in it. This allows us to ask for much more address space that
+  we will ever need, which keeps everything nice and consecutive.
+*/
+
+// Reserve the large address space blob, and return the address that
+// the OS has chosen for it.  It is not safe to access the memory
+// pointed to by the return value, until that memory is committed
+// using osCommitMemory().
+//
+// This function is called once when the block allocator is initialized.
+void *osReserveHeapMemory(void);
+
+// Commit (allocate memory for) a piece of address space, which must
+// be within the previously reserved space After this call, it is safe
+// to access @p up to @len bytes.
+//
+// There is no guarantee on the contents of the memory pointed to by
+// @p, in particular it must not be assumed to contain all zeros.
+void osCommitMemory(void *p, W_ len);
+
+// Decommit (release backing memory for) a piece of address space,
+// which must be within the previously reserve space and must have
+// been previously committed After this call, it is again unsafe to
+// access @p (up to @len bytes), but there is no guarantee that the
+// memory will be released to the system (as far as eg. RSS statistics
+// from top are concerned).
+void osDecommitMemory(void *p, W_ len);
+
+// Release the address space previously obtained and undo the effects of
+// osReserveHeapMemory
+//
+// This function is called once, when the block allocator is deinitialized
+// before the program terminates.
+void osReleaseHeapMemory(void);
+#endif
+
 #include "EndPrivate.h"
 
 #endif /* SM_OSMEM_H */
diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c
index c4a699e59a..e7a8401145 100644
--- a/rts/sm/Sanity.c
+++ b/rts/sm/Sanity.c
@@ -637,7 +637,8 @@ checkStaticObjects ( StgClosure* static_objects )
   StgClosure *p = static_objects;
   StgInfoTable *info;
 
-  while (p != END_OF_STATIC_LIST) {
+  while (p != END_OF_STATIC_OBJECT_LIST) {
+    p = UNTAG_STATIC_LIST_PTR(p);
     checkClosure(p);
     info = get_itbl(p);
     switch (info->type) {
diff --git a/rts/sm/Scav.c b/rts/sm/Scav.c
index a8f0ab037f..dfad0bef58 100644
--- a/rts/sm/Scav.c
+++ b/rts/sm/Scav.c
@@ -1672,7 +1672,7 @@ scavenge_capability_mut_lists (Capability *cap)
 static void
 scavenge_static(void)
 {
-  StgClosure* p;
+  StgClosure *flagged_p, *p;
   const StgInfoTable *info;
 
   debugTrace(DEBUG_gc, "scavenging static objects");
@@ -1690,10 +1690,11 @@ scavenge_static(void)
      * be more stuff on this list after each evacuation...
      * (static_objects is a global)
      */
-    p = gct->static_objects;
-    if (p == END_OF_STATIC_LIST) {
+    flagged_p = gct->static_objects;
+    if (flagged_p == END_OF_STATIC_OBJECT_LIST) {
           break;
     }
+    p = UNTAG_STATIC_LIST_PTR(flagged_p);
 
     ASSERT(LOOKS_LIKE_CLOSURE_PTR(p));
     info = get_itbl(p);
@@ -1708,7 +1709,7 @@ scavenge_static(void)
      */
     gct->static_objects = *STATIC_LINK(info,p);
     *STATIC_LINK(info,p) = gct->scavenged_static_objects;
-    gct->scavenged_static_objects = p;
+    gct->scavenged_static_objects = flagged_p;
 
     switch (info -> type) {
 
@@ -2066,7 +2067,7 @@ loop:
     work_to_do = rtsFalse;
 
     // scavenge static objects
-    if (major_gc && gct->static_objects != END_OF_STATIC_LIST) {
+    if (major_gc && gct->static_objects != END_OF_STATIC_OBJECT_LIST) {
         IF_DEBUG(sanity, checkStaticObjects(gct->static_objects));
         scavenge_static();
     }
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index 6e9b0634b7..65f5b70c21 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -175,9 +175,9 @@ initStorage (void)
 
   generations[0].max_blocks = 0;
 
-  dyn_caf_list = (StgIndStatic*)END_OF_STATIC_LIST;
-  debug_caf_list = (StgIndStatic*)END_OF_STATIC_LIST;
-  revertible_caf_list = (StgIndStatic*)END_OF_STATIC_LIST;
+  dyn_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
+  debug_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
+  revertible_caf_list = (StgIndStatic*)END_OF_CAF_LIST;
    
   /* initialise the allocate() interface */
   large_alloc_lim = RtsFlags.GcFlags.minAllocAreaSize * BLOCK_SIZE_W;
@@ -427,7 +427,7 @@ newCAF(StgRegTable *reg, StgIndStatic *caf)
 
         ACQUIRE_SM_LOCK; // dyn_caf_list is global, locked by sm_mutex
         caf->static_link = (StgClosure*)dyn_caf_list;
-        dyn_caf_list = caf;
+        dyn_caf_list = (StgIndStatic*)((StgWord)caf | STATIC_FLAG_LIST);
         RELEASE_SM_LOCK;
     }
     else
@@ -484,7 +484,7 @@ StgInd* newRetainedCAF (StgRegTable *reg, StgIndStatic *caf)
     ACQUIRE_SM_LOCK;
 
     caf->static_link = (StgClosure*)revertible_caf_list;
-    revertible_caf_list = caf;
+    revertible_caf_list = (StgIndStatic*)((StgWord)caf | STATIC_FLAG_LIST);
 
     RELEASE_SM_LOCK;
 
diff --git a/rts/sm/Storage.h b/rts/sm/Storage.h
index a4421db3f2..d0094b60fb 100644
--- a/rts/sm/Storage.h
+++ b/rts/sm/Storage.h
@@ -133,12 +133,59 @@ W_    calcLiveWords  (void);
 
 extern bdescr *exec_block;
 
-#define END_OF_STATIC_LIST ((StgClosure*)1)
-
 void move_STACK  (StgStack *src, StgStack *dest);
 
 /* -----------------------------------------------------------------------------
-   CAF lists
+   Note [STATIC_LINK fields]
+
+   The low 2 bits of the static link field have the following meaning:
+
+   00     we haven't seen this static object before
+
+   01/10  if it equals static_flag, then we saw it in this GC, otherwise
+          we saw it in the previous GC.
+
+   11     ignore during GC.  This value is used in two ways
+          - When we put CAFs on a list (see Note [CAF lists])
+          - a static constructor that was determined to have no CAF
+            references at compile time is given this value, so we
+            don't traverse it during GC
+
+  This choice of values is quite deliberate, because it means we can
+  decide whether a static object should be traversed during GC using a
+  single test:
+
+  bits = link_field & 3;
+  if ((bits | prev_static_flag) != 3) { ... }
+
+  -------------------------------------------------------------------------- */
+
+#define STATIC_BITS 3
+
+#define STATIC_FLAG_A 1
+#define STATIC_FLAG_B 2
+#define STATIC_FLAG_LIST 3
+
+#define END_OF_CAF_LIST ((StgClosure*)STATIC_FLAG_LIST)
+
+// The previous and current values of the static flag.  These flip
+// between STATIC_FLAG_A and STATIC_FLAG_B at each major GC.
+extern nat prev_static_flag, static_flag;
+
+// In the chain of static objects built up during GC, all the link
+// fields are tagged with the current static_flag value.  How to mark
+// the end of the chain?  It must be a special value so that we can
+// tell it is the end of the chain, but note that we're going to store
+// this value in the link field of a static object, which means that
+// during the NEXT GC we should treat it like any other object that
+// has not been visited during this GC.  Therefore, we use static_flag
+// as the sentinel value.
+#define END_OF_STATIC_OBJECT_LIST ((StgClosure*)(StgWord)static_flag)
+
+#define UNTAG_STATIC_LIST_PTR(p) ((StgClosure*)((StgWord)(p) & ~STATIC_BITS))
+
+/* -----------------------------------------------------------------------------
+   Note [CAF lists]
 
    dyn_caf_list  (CAFs chained through static_link)
       This is a chain of all CAFs in the program, used for
@@ -154,6 +201,10 @@ void move_STACK  (StgStack *src, StgStack *dest);
       A chain of CAFs in object code loaded with the RTS linker.
       These CAFs can be reverted to their unevaluated state using
       revertCAFs.
+
+ Pointers in these lists are tagged with STATIC_FLAG_LIST, so when
+ traversing the list remember to untag each pointer with
+ UNTAG_STATIC_LIST_PTR().
  --------------------------------------------------------------------------- */
 
 extern StgIndStatic * dyn_caf_list;
diff --git a/rts/win32/OSMem.c b/rts/win32/OSMem.c
index afa5113638..716171b3fc 100644
--- a/rts/win32/OSMem.c
+++ b/rts/win32/OSMem.c
@@ -8,6 +8,7 @@
 
 #include "Rts.h"
 #include "sm/OSMem.h"
+#include "sm/HeapAlloc.h"
 #include "RtsUtils.h"
 
 #if HAVE_WINDOWS_H
@@ -28,7 +29,11 @@ typedef struct block_rec_ {
 
 /* allocs are kept in ascending order, and are the memory regions as
    returned by the OS as we need to have matching VirtualAlloc and
-   VirtualFree calls. */
+   VirtualFree calls.
+
+   If USE_LARGE_ADDRESS_SPACE is defined, this list will contain only
+   one element.
+*/
 static alloc_rec* allocs = NULL;
 
 /* free_blocks are kept in ascending order, and adjacent blocks are merged */
@@ -207,12 +212,9 @@ osGetMBlocks(nat n) {
     return ret;
 }
 
-void osFreeMBlocks(char *addr, nat n)
+static void decommitBlocks(char *addr, W_ nBytes)
 {
     alloc_rec *p;
-    W_ nBytes = (W_)n * MBLOCK_SIZE;
-
-    insertFree(addr, nBytes);
 
     p = allocs;
     while ((p != NULL) && (addr >= (p->base + p->size))) {
@@ -243,6 +245,14 @@ void osFreeMBlocks(char *addr, nat n)
     }
 }
 
+void osFreeMBlocks(char *addr, nat n)
+{
+    W_ nBytes = (W_)n * MBLOCK_SIZE;
+
+    insertFree(addr, nBytes);
+    decommitBlocks(addr, nBytes);
+}
+
 void osReleaseFreeMemory(void)
 {
     alloc_rec *prev_a, *a;
@@ -414,3 +424,60 @@ void setExecutable (void *p, W_ len, rtsBool exec)
         stg_exit(EXIT_FAILURE);
     }
 }
+
+#ifdef USE_LARGE_ADDRESS_SPACE
+
+static void* heap_base = NULL;
+
+void *osReserveHeapMemory (void)
+{
+    void *start;
+
+    heap_base = VirtualAlloc(NULL, MBLOCK_SPACE_SIZE + MBLOCK_SIZE,
+                              MEM_RESERVE, PAGE_READWRITE);
+    if (heap_base == NULL) {
+        if (GetLastError() == ERROR_NOT_ENOUGH_MEMORY) {
+            errorBelch("out of memory");
+        } else {
+            sysErrorBelch(
+                "osReserveHeapMemory: VirtualAlloc MEM_RESERVE %llu bytes failed",
+                MBLOCK_SPACE_SIZE + MBLOCK_SIZE);
+        }
+        stg_exit(EXIT_FAILURE);
+    }
+
+    // VirtualFree MEM_RELEASE must always match a
+    // previous MEM_RESERVE call, in address and size
+    // so we necessarily leak some address space here,
+    // before and after the aligned area
+    // It is not a huge problem because we never commit
+    // that memory
+    start = MBLOCK_ROUND_UP(heap_base);
+
+    return start;
+}
+
+void osCommitMemory (void *at, W_ size)
+{
+    void *temp;
+    temp = VirtualAlloc(at, size, MEM_COMMIT, PAGE_READWRITE);
+    if (temp == NULL) {
+        sysErrorBelch("osCommitMemory: VirtualAlloc MEM_COMMIT failed");
+        stg_exit(EXIT_FAILURE);
+    }
+}
+
+void osDecommitMemory (void *at, W_ size)
+{
+    if (!VirtualFree(at, size, MEM_DECOMMIT)) {
+        sysErrorBelch("osDecommitMemory: VirtualFree MEM_DECOMMIT failed");
+        stg_exit(EXIT_FAILURE);
+    }
+}
+
+void osReleaseHeapMemory (void)
+{
+    VirtualFree(heap_base, 0, MEM_RELEASE);
+}
+
+#endif