8 files changed, 68 insertions, 58 deletions
diff --git a/rts/Capability.c b/rts/Capability.c
index 411e64dc7a..7ca220fbd9 100644
--- a/rts/Capability.c
+++ b/rts/Capability.c
@@ -26,6 +26,7 @@
 #include "sm/GC.h" // for gcWorkerThread()
 #include "STM.h"
 #include "RtsUtils.h"
+#include "sm/OSMem.h"
 
 #if !defined(mingw32_HOST_OS)
 #include "rts/IOManager.h" // for setIOManagerControlFd()
@@ -59,6 +60,12 @@ static Capability *last_free_capability[MAX_NUMA_NODES];
  */
 PendingSync * volatile pending_sync = 0;
 
+// Number of logical NUMA nodes
+uint32_t n_numa_nodes;
+
+// Map logical NUMA node to OS node numbers
+uint32_t numa_map[MAX_NUMA_NODES];
+
 /* Let foreign code get the current Capability -- assuming there is one!
  * This is useful for unsafe foreign calls because they are called with
  * the current Capability held, but they are not passed it. For example,
@@ -326,6 +333,31 @@ void initCapabilities (void)
     traceCapsetCreate(CAPSET_OSPROCESS_DEFAULT, CapsetTypeOsProcess);
     traceCapsetCreate(CAPSET_CLOCKDOMAIN_DEFAULT, CapsetTypeClockdomain);
 
+    // Initialise NUMA: derive n_numa_nodes and numa_map[] from the RTS flags
+    if (!RtsFlags.GcFlags.numa) {
+        n_numa_nodes = 1; // NUMA off: one logical node, every entry maps to 0
+        for (i = 0; i < MAX_NUMA_NODES; i++) {
+            numa_map[i] = 0;
+        }
+    } else {
+        uint32_t nNodes = osNumaNodes();
+        if (nNodes > MAX_NUMA_NODES) {
+            barf("Too many NUMA nodes (max %d)", MAX_NUMA_NODES);
+        }
+        StgWord mask = RtsFlags.GcFlags.numaMask & osNumaMask();
+        uint32_t logical = 0, physical = 0;
+        for (; physical < MAX_NUMA_NODES; physical++) {
+            if (mask & 1) { // bit set in both numaMask and osNumaMask()
+                numa_map[logical++] = physical;
+            }
+            mask = mask >> 1;
+        }
+        n_numa_nodes = logical;
+        if (logical == 0) { // no usable node left after masking
+            barf("available NUMA node set is empty");
+        }
+    }
+
 #if defined(THREADED_RTS)
 
 #ifndef REG_Base
@@ -355,7 +387,7 @@ void initCapabilities (void)
     // There are no free capabilities to begin with.  We will start
     // a worker Task to each Capability, which will quickly put the
     // Capability on the free list when it finds nothing to do.
-    for (i = 0; i < RtsFlags.GcFlags.nNumaNodes; i++) {
+    for (i = 0; i < n_numa_nodes; i++) {
         last_free_capability[i] = capabilities[0];
     }
 }
@@ -730,9 +762,9 @@ void waitForCapability (Capability **pCap, Task *task)
                 // Otherwise, search for a free capability on this node.
                 cap = NULL;
                 for (i = task->node; i < enabled_capabilities;
-                     i += RtsFlags.GcFlags.nNumaNodes) {
+                     i += n_numa_nodes) {
                     // visits all the capabilities on this node, because
-                    // cap[i]->node == i % RtsFlags.GcFlags.nNumaNodes
+                    // cap[i]->node == i % n_numa_nodes
                     if (!capabilities[i]->running_task) {
                         cap = capabilities[i];
                         break;
diff --git a/rts/Capability.h b/rts/Capability.h
index 6874379c5f..67b43280eb 100644
--- a/rts/Capability.h
+++ b/rts/Capability.h
@@ -39,7 +39,7 @@ struct Capability_ {
     // The NUMA node on which this capability resides.  This is used to allocate
     // node-local memory in allocate().
     //
-    // Note: this is always equal to cap->no % RtsFlags.ParFlags.nNumaNodes.
+    // Note: this is always equal to cap->no % n_numa_nodes.
     // The reason we slice it this way is that if we add or remove capabilities
     // via setNumCapabilities(), then we keep the number of capabilities on each
     // NUMA node balanced.
@@ -159,9 +159,6 @@ struct Capability_ {
 #endif
   ;
 
-
-#define capNoToNumaNode(n) ((n) % RtsFlags.GcFlags.nNumaNodes)
-
 #if defined(THREADED_RTS)
 #define ASSERT_TASK_ID(task) ASSERT(task->id == osThreadId())
 #else
@@ -350,6 +347,18 @@ void markCapabilities (evac_fn evac, void *user);
 void traverseSparkQueues (evac_fn evac, void *user);
 
 /* -----------------------------------------------------------------------------
+   NUMA
+   -------------------------------------------------------------------------- */
+
+/* Number of logical NUMA nodes */
+extern uint32_t n_numa_nodes;
+
+/* Map logical NUMA node to OS node numbers */
+extern uint32_t numa_map[MAX_NUMA_NODES];
+
+#define capNoToNumaNode(n) ((n) % n_numa_nodes)
+
+/* -----------------------------------------------------------------------------
    Messages
    -------------------------------------------------------------------------- */
 
diff --git a/rts/RtsFlags.c b/rts/RtsFlags.c
index 25345bf57b..e23f760f43 100644
--- a/rts/RtsFlags.c
+++ b/rts/RtsFlags.c
@@ -123,7 +123,6 @@ static void errorRtsOptsDisabled (const char *s);
 
 void initRtsFlagsDefaults(void)
 {
-    uint32_t i;
     StgWord64 maxStkSize = 8 * getPhysicalMemorySize() / 10;
     // if getPhysicalMemorySize fails just move along with an 8MB limit
     if (maxStkSize == 0)
@@ -160,10 +159,7 @@ void initRtsFlagsDefaults(void)
     RtsFlags.GcFlags.heapBase           = 0;   /* means don't care */
     RtsFlags.GcFlags.allocLimitGrace    = (100*1024) / BLOCK_SIZE;
     RtsFlags.GcFlags.numa               = rtsFalse;
-    RtsFlags.GcFlags.nNumaNodes         = 1;
-    for (i = 0; i < MAX_NUMA_NODES; i++) {
-        RtsFlags.GcFlags.numaMap[i] = 0;
-    }
+    RtsFlags.GcFlags.numaMask           = 1;
 
     RtsFlags.DebugFlags.scheduler       = rtsFalse;
     RtsFlags.DebugFlags.interpreter     = rtsFalse;
@@ -776,28 +772,8 @@ error = rtsTrue;
                           break;
                       }
 
-                      uint32_t nNodes = osNumaNodes();
-                      if (nNodes > MAX_NUMA_NODES) {
-                          errorBelch("%s: Too many NUMA nodes (max %d)",
-                                     rts_argv[arg], MAX_NUMA_NODES);
-                          error = rtsTrue;
-                      } else {
-                          RtsFlags.GcFlags.numa = rtsTrue;
-                          mask = mask & osNumaMask();
-                          uint32_t logical = 0, physical = 0;
-                          for (; physical < MAX_NUMA_NODES; physical++) {
-                              if (mask & 1) {
-                                  RtsFlags.GcFlags.numaMap[logical++] = physical;
-                              }
-                              mask = mask >> 1;
-                          }
-                          RtsFlags.GcFlags.nNumaNodes = logical;
-                          if (logical == 0) {
-                              errorBelch("%s: available node set is empty",
-                                         rts_argv[arg]);
-                              error = rtsTrue;
-                          }
-                      }
+                      RtsFlags.GcFlags.numa = rtsTrue;
+                      RtsFlags.GcFlags.numaMask = mask;
                   }
 #endif
 #if defined(DEBUG) && defined(THREADED_RTS)
@@ -821,11 +797,7 @@ error = rtsTrue;
                       } else {
                           RtsFlags.GcFlags.numa = rtsTrue;
                           RtsFlags.DebugFlags.numa = rtsTrue;
-                          RtsFlags.GcFlags.nNumaNodes = nNodes;
-                          uint32_t physical = 0;
-                          for (; physical < MAX_NUMA_NODES; physical++) {
-                              RtsFlags.GcFlags.numaMap[physical] = physical;
-                          }
+                          RtsFlags.GcFlags.numaMask = ((StgWord)1<<nNodes) - 1;
                       }
                   }
 #endif
diff --git a/rts/Task.c b/rts/Task.c
index 9a827745ba..9a658e019c 100644
--- a/rts/Task.c
+++ b/rts/Task.c
@@ -429,7 +429,7 @@ workerStart(Task *task)
         setThreadAffinity(cap->no, n_capabilities);
     }
     if (RtsFlags.GcFlags.numa && !RtsFlags.DebugFlags.numa) {
-        setThreadNode(RtsFlags.GcFlags.numaMap[task->node]);
+        setThreadNode(numa_map[task->node]);
     }
 
     // set the thread-local pointer to the Task:
@@ -510,7 +510,7 @@ void rts_setInCallCapability (
         if (RtsFlags.GcFlags.numa) {
             task->node = capNoToNumaNode(preferred_capability);
             if (!DEBUG_IS_ON || !RtsFlags.DebugFlags.numa) { // faking NUMA
-                setThreadNode(RtsFlags.GcFlags.numaMap[task->node]);
+                setThreadNode(numa_map[task->node]);
             }
         }
     }
diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c
index 35ea2bde21..112a311f79 100644
--- a/rts/posix/OSThreads.c
+++ b/rts/posix/OSThreads.c
@@ -321,7 +321,6 @@ setThreadAffinity (uint32_t n STG_UNUSED,
 #if HAVE_LIBNUMA
 void setThreadNode (uint32_t node)
 {
-    ASSERT(node < RtsFlags.GcFlags.nNumaNodes);
     if (numa_run_on_node(node) == -1) {
         sysErrorBelch("numa_run_on_node");
         stg_exit(1);
diff --git a/rts/sm/BlockAlloc.c b/rts/sm/BlockAlloc.c
index c2859b0c15..6c2e96414e 100644
--- a/rts/sm/BlockAlloc.c
+++ b/rts/sm/BlockAlloc.c
@@ -467,7 +467,7 @@ uint32_t nodeWithLeastBlocks (void)
 {
     uint32_t node = 0, i;
     uint32_t min_blocks = n_alloc_blocks_by_node[0];
-    for (i = 1; i < RtsFlags.GcFlags.nNumaNodes; i++) {
+    for (i = 1; i < n_numa_nodes; i++) {
         if (n_alloc_blocks_by_node[i] < min_blocks) {
             min_blocks = n_alloc_blocks_by_node[i];
             node = i;
@@ -504,7 +504,7 @@ bdescr* allocLargeChunkOnNode (uint32_t node, W_ min, W_ max)
     StgWord ln, lnmax;
 
     if (min >= BLOCKS_PER_MBLOCK) {
-        return allocGroup(max);
+        return allocGroupOnNode(node,max);
     }
 
     ln = log_2_ceil(min);
@@ -811,7 +811,7 @@ void returnMemoryToOS(uint32_t n /* megablocks */)
     StgWord size;
 
     // ToDo: not fair, we free all the memory starting with node 0.
-    for (node = 0; n > 0 && node < RtsFlags.GcFlags.nNumaNodes; node++) {
+    for (node = 0; n > 0 && node < n_numa_nodes; node++) {
         bd = free_mblock_list[node];
         while ((n > 0) && (bd != NULL)) {
             size = BLOCKS_TO_MBLOCKS(bd->blocks);
@@ -875,7 +875,7 @@ checkFreeListSanity(void)
     StgWord ln, min;
     uint32_t node;
 
-    for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) {
+    for (node = 0; node < n_numa_nodes; node++) {
         min = 1;
         for (ln = 0; ln < NUM_FREE_LISTS; ln++) {
             IF_DEBUG(block_alloc,
@@ -950,7 +950,7 @@ countFreeList(void)
   StgWord ln;
   uint32_t node;
 
-  for (node = 0; node < RtsFlags.GcFlags.nNumaNodes; node++) {
+  for (node = 0; node < n_numa_nodes; node++) {
       for (ln=0; ln < NUM_FREE_LISTS; ln++) {
           for (bd = free_list[node][ln]; bd != NULL; bd = bd->link) {
               total_blocks += bd->blocks;
diff --git a/rts/sm/MBlock.c b/rts/sm/MBlock.c
index 53999d2c4b..4be7fd4356 100644
--- a/rts/sm/MBlock.c
+++ b/rts/sm/MBlock.c
@@ -594,7 +594,7 @@ getMBlocksOnNode(uint32_t node, uint32_t n)
 #ifdef DEBUG
     if (RtsFlags.DebugFlags.numa) return addr; // faking NUMA
 #endif
-    osBindMBlocksToNode(addr, n * MBLOCK_SIZE, RtsFlags.GcFlags.numaMap[node]);
+    osBindMBlocksToNode(addr, n * MBLOCK_SIZE, numa_map[node]);
     return addr;
 }
 
diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c
index a9a7857d43..7c41f8c64b 100644
--- a/rts/sm/Storage.c
+++ b/rts/sm/Storage.c
@@ -57,7 +57,7 @@ generation *oldest_gen  = NULL; /* oldest generation, for convenience */
 /*
  * Array of nurseries, size == n_capabilities
  *
- * nursery[i] belongs to NUMA node (i % RtsFlags.GcFlags.nNumaNodes)
+ * nursery[i] belongs to NUMA node (i % n_numa_nodes)
  * This is chosen to be the same convention as capabilities[i], so
  * that when not using nursery chunks (+RTS -n), we just map
  * capabilities to nurseries 1:1.
@@ -209,7 +209,7 @@ initStorage (void)
 
   N = 0;
 
-  for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
+  for (n = 0; n < n_numa_nodes; n++) {
       next_nursery[n] = n;
   }
   storageAddCapabilities(0, n_capabilities);
@@ -615,7 +615,7 @@ assignNurseriesToCapabilities (uint32_t from, uint32_t to)
     for (i = from; i < to; i++) {
         node = capabilities[i]->node;
         assignNurseryToCapability(capabilities[i], next_nursery[node]);
-        next_nursery[node] += RtsFlags.GcFlags.nNumaNodes;
+        next_nursery[node] += n_numa_nodes;
     }
 }
 
@@ -642,7 +642,7 @@ resetNurseries (void)
 {
     uint32_t n;
 
-    for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
+    for (n = 0; n < n_numa_nodes; n++) {
         next_nursery[n] = n;
     }
     assignNurseriesToCapabilities(0, n_capabilities);
@@ -758,22 +758,20 @@ getNewNursery (Capability *cap)
     for(;;) {
         i = next_nursery[node];
         if (i < n_nurseries) {
-            if (cas(&next_nursery[node], i,
-                    i+RtsFlags.GcFlags.nNumaNodes) == i) {
+            if (cas(&next_nursery[node], i, i+n_numa_nodes) == i) {
                 assignNurseryToCapability(cap, i);
                 return rtsTrue;
             }
-        } else if (RtsFlags.GcFlags.nNumaNodes > 1) {
+        } else if (n_numa_nodes > 1) {
             // Try to find an unused nursery chunk on other nodes.  We'll get
             // remote memory, but the rationale is that avoiding GC is better
             // than avoiding remote memory access.
             rtsBool lost = rtsFalse;
-            for (n = 0; n < RtsFlags.GcFlags.nNumaNodes; n++) {
+            for (n = 0; n < n_numa_nodes; n++) {
                 if (n == node) continue;
                 i = next_nursery[n];
                 if (i < n_nurseries) {
-                    if (cas(&next_nursery[n], i,
-                            i+RtsFlags.GcFlags.nNumaNodes) == i) {
+                    if (cas(&next_nursery[n], i, i+n_numa_nodes) == i) {
                         assignNurseryToCapability(cap, i);
                         return rtsTrue;
                     } else {