-rw-r--r-- | rts/Capability.c          | 18
-rw-r--r-- | rts/Messages.h            |  2
-rw-r--r-- | rts/PrimOps.cmm           |  4
-rw-r--r-- | rts/Printer.c             |  8
-rw-r--r-- | rts/ProfHeap.c            |  2
-rw-r--r-- | rts/ProfilerReport.c      |  4
-rw-r--r-- | rts/ProfilerReportJson.c  |  2
-rw-r--r-- | rts/Profiling.c           |  2
-rw-r--r-- | rts/Proftimer.c           |  2
-rw-r--r-- | rts/RetainerProfile.c     |  2
-rw-r--r-- | rts/RtsAPI.c              |  4
-rw-r--r-- | rts/RtsStartup.c          |  2
-rw-r--r-- | rts/SMPClosureOps.h       |  8
-rw-r--r-- | rts/STM.c                 |  2
-rw-r--r-- | rts/Schedule.c            | 20
-rw-r--r-- | rts/Stats.c               |  8
-rw-r--r-- | rts/Task.c                |  2
-rw-r--r-- | rts/Threads.c             |  2
-rw-r--r-- | rts/TraverseHeap.c        |  2
-rw-r--r-- | rts/eventlog/EventLog.c   |  9
-rw-r--r-- | rts/hooks/LongGCSync.c    |  2
-rw-r--r-- | rts/include/rts/Threads.h |  7
-rw-r--r-- | rts/posix/Signals.c       |  2
-rw-r--r-- | rts/sm/Compact.c          |  4
-rw-r--r-- | rts/sm/GC.c               | 62
-rw-r--r-- | rts/sm/MarkWeak.c         |  2
-rw-r--r-- | rts/sm/NonMoving.c        | 18
-rw-r--r-- | rts/sm/NonMovingCensus.c  |  2
-rw-r--r-- | rts/sm/NonMovingMark.c    | 10
-rw-r--r-- | rts/sm/NonMovingSweep.c   |  2
-rw-r--r-- | rts/sm/Sanity.c           | 22
-rw-r--r-- | rts/sm/Storage.c          | 14
32 files changed, 130 insertions(+), 122 deletions(-)
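
Throughout the diff below, direct reads of the global n_capabilities are replaced by the new getNumCapabilities() accessor added in rts/include/rts/Threads.h, which performs a relaxed atomic load; the remaining direct write in Schedule.c becomes a relaxed store, and the Cmm call sites use %relaxed loads. The following sketch shows the pattern in isolation. It assumes the RTS's RELAXED_LOAD/RELAXED_STORE macros map onto the GCC/Clang __atomic builtins with __ATOMIC_RELAXED, and setNumCapabilitiesSketch is an illustrative stand-in, not the real scheduler code.

#include <stdio.h>

/* Global capability count: written only while all capabilities are held
 * (e.g. by setNumCapabilities), but read concurrently by many threads. */
static unsigned int n_capabilities = 1;

/* Assumed shape of the RTS's relaxed-atomic macros. */
#define RELAXED_LOAD(ptr)       __atomic_load_n(ptr, __ATOMIC_RELAXED)
#define RELAXED_STORE(ptr, val) __atomic_store_n(ptr, val, __ATOMIC_RELAXED)

/* The accessor this patch introduces: a relaxed load, so concurrent readers
 * no longer constitute a data race with the writer in setNumCapabilities. */
static inline unsigned int getNumCapabilities(void)
{
    return RELAXED_LOAD(&n_capabilities);
}

/* Illustrative counterpart of the write side in Schedule.c. */
static void setNumCapabilitiesSketch(unsigned int new_n)
{
    RELAXED_STORE(&n_capabilities, new_n);
}

int main(void)
{
    setNumCapabilitiesSketch(4);
    printf("capabilities: %u\n", getNumCapabilities());
    return 0;
}
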
diff --git a/rts/Capability.c b/rts/Capability.c index afe21fcb46..bf20d991e8 100644 --- a/rts/Capability.c +++ b/rts/Capability.c @@ -131,7 +131,7 @@ findSpark (Capability *cap) retry = true; } - if (n_capabilities == 1) { return NULL; } // makes no sense... + if (getNumCapabilities() == 1) { return NULL; } // makes no sense... debugTrace(DEBUG_sched, "cap %d: Trying to steal work from other capabilities", @@ -139,7 +139,7 @@ findSpark (Capability *cap) /* visit cap.s 0..n-1 in sequence until a theft succeeds. We could start at a random place instead of 0 as well. */ - for ( i=0 ; i < n_capabilities ; i++ ) { + for ( i=0 ; i < getNumCapabilities() ; i++ ) { robbed = capabilities[i]; if (cap == robbed) // ourselves... continue; @@ -182,7 +182,7 @@ anySparks (void) { uint32_t i; - for (i=0; i < n_capabilities; i++) { + for (i=0; i < getNumCapabilities(); i++) { if (!emptySparkPoolCap(capabilities[i])) { return true; } @@ -464,7 +464,7 @@ moreCapabilities (uint32_t from USED_IF_THREADS, uint32_t to USED_IF_THREADS) void contextSwitchAllCapabilities(void) { uint32_t i; - for (i=0; i < n_capabilities; i++) { + for (i=0; i < getNumCapabilities(); i++) { contextSwitchCapability(capabilities[i], true); } } @@ -472,7 +472,7 @@ void contextSwitchAllCapabilities(void) void interruptAllCapabilities(void) { uint32_t i; - for (i=0; i < n_capabilities; i++) { + for (i=0; i < getNumCapabilities(); i++) { interruptCapability(capabilities[i]); } } @@ -1249,7 +1249,7 @@ void shutdownCapabilities(Task *task, bool safe) { uint32_t i; - for (i=0; i < n_capabilities; i++) { + for (i=0; i < getNumCapabilities(); i++) { ASSERT(task->incall->tso == NULL); shutdownCapability(capabilities[i], task, safe); } @@ -1276,7 +1276,7 @@ freeCapabilities (void) { #if defined(THREADED_RTS) uint32_t i; - for (i=0; i < n_capabilities; i++) { + for (i=0; i < getNumCapabilities(); i++) { freeCapability(capabilities[i]); if (capabilities[i] != &MainCapability) stgFree(capabilities[i]); @@ -1332,7 +1332,7 @@ void markCapabilities (evac_fn evac, void *user) { uint32_t n; - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { markCapability(evac, user, capabilities[n], false); } } @@ -1344,7 +1344,7 @@ bool checkSparkCountInvariant (void) StgWord64 remaining = 0; uint32_t i; - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { sparks.created += capabilities[i]->spark_stats.created; sparks.dud += capabilities[i]->spark_stats.dud; sparks.overflowed+= capabilities[i]->spark_stats.overflowed; diff --git a/rts/Messages.h b/rts/Messages.h index ecae7e6365..c95532b7e6 100644 --- a/rts/Messages.h +++ b/rts/Messages.h @@ -27,7 +27,7 @@ doneWithMsgThrowTo (Capability *cap, MessageThrowTo *m) { // The message better be locked (unless we are running single-threaded, // where we are a bit more lenient (#19075). - ASSERT(n_capabilities == 1 || m->header.info == &stg_WHITEHOLE_info); + ASSERT(getNumCapabilities() == 1 || m->header.info == &stg_WHITEHOLE_info); IF_NONMOVING_WRITE_BARRIER_ENABLED { updateRemembSetPushMessageThrowTo(cap, m); } diff --git a/rts/PrimOps.cmm b/rts/PrimOps.cmm index 2f7f12df84..38845f1631 100644 --- a/rts/PrimOps.cmm +++ b/rts/PrimOps.cmm @@ -2782,7 +2782,9 @@ INFO_TABLE_RET(stg_noDuplicate, RET_SMALL, W_ info_ptr) stg_noDuplicatezh /* no arg list: explicit stack layout */ { // With a single capability there's no chance of work duplication. 
- if (CInt[n_capabilities] == 1 :: CInt) { + CInt n_caps; + n_caps = %relaxed CInt[n_capabilities]; + if (n_caps == 1 :: CInt) { jump %ENTRY_CODE(Sp(0)) []; } diff --git a/rts/Printer.c b/rts/Printer.c index 45b4a606cf..3dbb371a0b 100644 --- a/rts/Printer.c +++ b/rts/Printer.c @@ -714,7 +714,7 @@ void printWeakLists() { debugBelch("======= WEAK LISTS =======\n"); - for (uint32_t cap_idx = 0; cap_idx < n_capabilities; ++cap_idx) { + for (uint32_t cap_idx = 0; cap_idx < getNumCapabilities(); ++cap_idx) { debugBelch("Capability %d:\n", cap_idx); Capability *cap = capabilities[cap_idx]; for (StgWeak *weak = cap->weak_ptr_list_hd; weak; weak = weak->link) { @@ -741,7 +741,7 @@ void printLargeAndPinnedObjects() { debugBelch("====== PINNED OBJECTS ======\n"); - for (uint32_t cap_idx = 0; cap_idx < n_capabilities; ++cap_idx) { + for (uint32_t cap_idx = 0; cap_idx < getNumCapabilities(); ++cap_idx) { Capability *cap = capabilities[cap_idx]; debugBelch("Capability %d: Current pinned object block: %p\n", @@ -954,7 +954,7 @@ findPtr(P_ p, int follow) // We can't search the nursery, because we don't know which blocks contain // valid data, because the bd->free pointers in the nursery are only reset // just before a block is used. - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { bd = nurseries[i].blocks; i = findPtrBlocks(p,bd,arr,arr_size,i); if (i >= arr_size) return; @@ -967,7 +967,7 @@ findPtr(P_ p, int follow) bd = generations[g].large_objects; i = findPtrBlocks(p,bd,arr,arr_size,i); if (i >= arr_size) return; - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { i = findPtrBlocks(p, gc_threads[n]->gens[g].part_list, arr, arr_size, i); i = findPtrBlocks(p, gc_threads[n]->gens[g].todo_bd, diff --git a/rts/ProfHeap.c b/rts/ProfHeap.c index 41c8ab2508..288c5c8a54 100644 --- a/rts/ProfHeap.c +++ b/rts/ProfHeap.c @@ -1314,7 +1314,7 @@ void heapCensus (Time t) heapCensusChain( census, generations[g].large_objects ); heapCensusCompactList ( census, generations[g].compact_objects ); - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { ws = &gc_threads[n]->gens[g]; heapCensusChain(census, ws->todo_bd); heapCensusChain(census, ws->part_list); diff --git a/rts/ProfilerReport.c b/rts/ProfilerReport.c index 436667066c..28a8f42f0d 100644 --- a/rts/ProfilerReport.c +++ b/rts/ProfilerReport.c @@ -296,10 +296,10 @@ writeCCSReport( FILE *prof_file, CostCentreStack const *stack, fprintf(prof_file, "\ttotal time = %11.2f secs (%lu ticks @ %d us, %d processor%s)\n", ((double) totals.total_prof_ticks * - (double) RtsFlags.MiscFlags.tickInterval) / (TIME_RESOLUTION * n_capabilities), + (double) RtsFlags.MiscFlags.tickInterval) / (TIME_RESOLUTION * getNumCapabilities()), (unsigned long) totals.total_prof_ticks, (int) TimeToUS(RtsFlags.MiscFlags.tickInterval), - n_capabilities, n_capabilities > 1 ? "s" : ""); + getNumCapabilities(), getNumCapabilities() > 1 ? 
"s" : ""); fprintf(prof_file, "\ttotal alloc = %11s bytes", showStgWord64(totals.total_alloc * sizeof(W_), diff --git a/rts/ProfilerReportJson.c b/rts/ProfilerReportJson.c index 71a557b8de..2cc0163a16 100644 --- a/rts/ProfilerReportJson.c +++ b/rts/ProfilerReportJson.c @@ -133,7 +133,7 @@ writeCCSReportJson(FILE *prof_file, RtsFlags.ParFlags.nCapabilities); fprintf(prof_file, "\"total_time\": %11.2f,\n", ((double) totals.total_prof_ticks * - (double) RtsFlags.MiscFlags.tickInterval) / (TIME_RESOLUTION * n_capabilities)); + (double) RtsFlags.MiscFlags.tickInterval) / (TIME_RESOLUTION * getNumCapabilities())); fprintf(prof_file, "\"total_ticks\": %lu,\n", (unsigned long) totals.total_prof_ticks); fprintf(prof_file, "\"tick_interval\": %d,\n", diff --git a/rts/Profiling.c b/rts/Profiling.c index c04cb3ae6b..dd0f4bc915 100644 --- a/rts/Profiling.c +++ b/rts/Profiling.c @@ -153,7 +153,7 @@ void initProfiling (void) /* for the benefit of allocate()... */ { uint32_t n; - for (n=0; n < n_capabilities; n++) { + for (n=0; n < getNumCapabilities(); n++) { capabilities[n]->r.rCCCS = CCS_SYSTEM; } } diff --git a/rts/Proftimer.c b/rts/Proftimer.c index 7bc5e865b4..6717731a6b 100644 --- a/rts/Proftimer.c +++ b/rts/Proftimer.c @@ -117,7 +117,7 @@ handleProfTick(void) total_ticks++; if (RELAXED_LOAD(&do_prof_ticks)) { uint32_t n; - for (n=0; n < n_capabilities; n++) { + for (n=0; n < getNumCapabilities(); n++) { capabilities[n]->r.rCCCS->time_ticks++; traceProfSampleCostCentre(capabilities[n], capabilities[n]->r.rCCCS, total_ticks); } diff --git a/rts/RetainerProfile.c b/rts/RetainerProfile.c index 6e2692f8cd..a7e0cb0501 100644 --- a/rts/RetainerProfile.c +++ b/rts/RetainerProfile.c @@ -391,7 +391,7 @@ computeRetainerSet( traverseState *ts ) // // The following code assumes that WEAK objects are considered to be roots // for retainer profiling. - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { // NB: after a GC, all nursery weak_ptr_lists have been migrated // to the global lists living in the generations ASSERT(capabilities[n]->weak_ptr_list_hd == NULL); diff --git a/rts/RtsAPI.c b/rts/RtsAPI.c index 5cbea01496..74e6924ae3 100644 --- a/rts/RtsAPI.c +++ b/rts/RtsAPI.c @@ -768,7 +768,7 @@ void rts_resume (PauseToken *pauseToken) // releaseAllCapabilities will not block because the current task owns all // capabilities. - releaseAllCapabilities(n_capabilities, NULL, task); + releaseAllCapabilities(getNumCapabilities(), NULL, task); exitMyTask(); stgFree(pauseToken); } @@ -806,7 +806,7 @@ static void assert_isPausedOnMyTask(const char *functionName) } // Check that we own all capabilities. - for (unsigned int i = 0; i < n_capabilities; i++) + for (unsigned int i = 0; i < getNumCapabilities(); i++) { Capability *cap = capabilities[i]; if (cap->running_task != task) diff --git a/rts/RtsStartup.c b/rts/RtsStartup.c index 7e0afe5f92..f4ad851e12 100644 --- a/rts/RtsStartup.c +++ b/rts/RtsStartup.c @@ -482,7 +482,7 @@ hs_exit_(bool wait_foreign) exitScheduler(wait_foreign); /* run C finalizers for all active weak pointers */ - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { runAllCFinalizers(capabilities[i]->weak_ptr_list_hd); } for (g = 0; g < RtsFlags.GcFlags.generations; g++) { diff --git a/rts/SMPClosureOps.h b/rts/SMPClosureOps.h index f5242a9cf1..d0b9e9ce5b 100644 --- a/rts/SMPClosureOps.h +++ b/rts/SMPClosureOps.h @@ -16,7 +16,9 @@ * Arguments are swapped for uniformity with unlockClosure. 
*/ #if defined(THREADED_RTS) #define LOCK_CLOSURE(closure, info) \ - if (CInt[n_capabilities] == 1 :: CInt) { \ + CInt _n_caps; \ + _n_caps = %relaxed CInt[n_capabilities]; \ + if (_n_caps == 1 :: CInt) { \ info = GET_INFO(closure); \ } else { \ ("ptr" info) = ccall reallyLockClosure(closure "ptr"); \ @@ -74,7 +76,7 @@ EXTERN_INLINE StgInfoTable *reallyLockClosure(StgClosure *p) INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p) { - if (n_capabilities == 1) { + if (getNumCapabilities() == 1) { return (StgInfoTable *)p->header.info; } else { @@ -87,7 +89,7 @@ INLINE_HEADER StgInfoTable *lockClosure(StgClosure *p) EXTERN_INLINE StgInfoTable *tryLockClosure(StgClosure *p) { StgWord info; - if (n_capabilities == 1) { + if (RELAXED_LOAD(&n_capabilities) == 1) { return (StgInfoTable *)p->header.info; } else { @@ -1107,7 +1107,7 @@ StgBool stmCommitTransaction(Capability *cap, StgTRecHeader *trec) { max_commits_at_end = getMaxCommits(); max_concurrent_commits = ((max_commits_at_end - max_commits_at_start) + - (n_capabilities * TOKEN_BATCH_SIZE)); + (getNumCapabilities() * TOKEN_BATCH_SIZE)); if (((max_concurrent_commits >> 32) > 0) || shake()) { result = false; } diff --git a/rts/Schedule.c b/rts/Schedule.c index 306a709883..75d9cffef4 100644 --- a/rts/Schedule.c +++ b/rts/Schedule.c @@ -724,7 +724,7 @@ schedulePushWork(Capability *cap USED_IF_THREADS, { #if defined(THREADED_RTS) - Capability *free_caps[n_capabilities], *cap0; + Capability *free_caps[getNumCapabilities()], *cap0; uint32_t i, n_wanted_caps, n_free_caps; uint32_t spare_threads = cap->n_run_queue > 0 ? cap->n_run_queue - 1 : 0; @@ -741,9 +741,9 @@ schedulePushWork(Capability *cap USED_IF_THREADS, // First grab as many free Capabilities as we can. ToDo: we should use // capabilities on the same NUMA node preferably, but not exclusively. - for (i = (cap->no + 1) % n_capabilities, n_free_caps=0; + for (i = (cap->no + 1) % getNumCapabilities(), n_free_caps=0; n_free_caps < n_wanted_caps && i != cap->no; - i = (i + 1) % n_capabilities) { + i = (i + 1) % getNumCapabilities()) { cap0 = capabilities[i]; if (cap != cap0 && !cap0->disabled && tryGrabCapability(cap0,task)) { if (!emptyRunQueue(cap0) @@ -1531,9 +1531,9 @@ static void acquireAllCapabilities(Capability *cap, Task *task) uint32_t i; ASSERT(SEQ_CST_LOAD(&pending_sync) != NULL); - for (i=0; i < n_capabilities; i++) { + for (i=0; i < getNumCapabilities(); i++) { debugTrace(DEBUG_sched, "grabbing all the capabilities (%d/%d)", - i, n_capabilities); + i, getNumCapabilities()); tmpcap = capabilities[i]; if (tmpcap != cap) { // we better hope this task doesn't get migrated to @@ -1676,7 +1676,7 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS, // We need an array of size n_capabilities, but since this may // change each time around the loop we must allocate it afresh. - idle_cap = (bool *)stgMallocBytes(n_capabilities * + idle_cap = (bool *)stgMallocBytes(getNumCapabilities() * sizeof(bool), "scheduleDoGC"); sync.idle = idle_cap; @@ -1685,7 +1685,7 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS, // GC. The best bet is to choose some inactive ones, so we look for // those first: uint32_t n_idle = need_idle; - for (i=0; i < n_capabilities; i++) { + for (i=0; i < getNumCapabilities(); i++) { if (capabilities[i]->disabled) { idle_cap[i] = true; } else if (n_idle > 0 && @@ -1699,7 +1699,7 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS, } // If we didn't find enough inactive capabilities, just pick some // more to be idle. 
- for (i=0; n_idle > 0 && i < n_capabilities; i++) { + for (i=0; n_idle > 0 && i < getNumCapabilities(); i++) { if (!idle_cap[i] && i != cap->no) { idle_cap[i] = true; n_idle--; @@ -1733,7 +1733,7 @@ scheduleDoGC (Capability **pcap, Task *task USED_IF_THREADS, stat_startGCSync(gc_threads[cap->no]); - unsigned int old_n_capabilities = n_capabilities; + unsigned int old_n_capabilities = getNumCapabilities(); interruptAllCapabilities(); @@ -2335,7 +2335,7 @@ setNumCapabilities (uint32_t new_n_capabilities USED_IF_THREADS) // update n_capabilities before things start running if (new_n_capabilities > n_capabilities) { - n_capabilities = enabled_capabilities = new_n_capabilities; + RELAXED_STORE(&n_capabilities, enabled_capabilities = new_n_capabilities); } // We're done: release the original Capabilities diff --git a/rts/Stats.c b/rts/Stats.c index 517e6c32f2..cf6af35f30 100644 --- a/rts/Stats.c +++ b/rts/Stats.c @@ -894,7 +894,7 @@ static void report_summary(const RTSSummaryStats* sum) "(%d bound, %d peak workers (%d total), using -N%d)\n\n", taskCount, sum->bound_task_count, peakWorkerCount, workerCount, - n_capabilities); + getNumCapabilities()); statsPrintf(" SPARKS: %" FMT_Word64 " (%" FMT_Word " converted, %" FMT_Word " overflowed, %" @@ -1135,7 +1135,7 @@ static void report_machine_readable (const RTSSummaryStats * sum) MR_STAT("work_balance", "f", sum->work_balance); // next, globals (other than internal counters) - MR_STAT("n_capabilities", FMT_Word32, n_capabilities); + MR_STAT("n_capabilities", FMT_Word32, getNumCapabilities()); MR_STAT("task_count", FMT_Word32, taskCount); MR_STAT("peak_worker_count", FMT_Word32, peakWorkerCount); MR_STAT("worker_count", FMT_Word32, workerCount); @@ -1350,7 +1350,7 @@ stat_exitReport (void) #if defined(THREADED_RTS) sum.bound_task_count = taskCount - workerCount; - for (uint32_t i = 0; i < n_capabilities; i++) { + for (uint32_t i = 0; i < getNumCapabilities(); i++) { sum.sparks.created += capabilities[i]->spark_stats.created; sum.sparks.dud += capabilities[i]->spark_stats.dud; sum.sparks.overflowed+= @@ -1649,7 +1649,7 @@ statDescribeGens(void) gen_blocks = genLiveBlocks(gen); mut = 0; - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { mut += countOccupied(capabilities[i]->mut_lists[g]); // Add the pinned object block. diff --git a/rts/Task.c b/rts/Task.c index 6dbc597769..de24253db5 100644 --- a/rts/Task.c +++ b/rts/Task.c @@ -529,7 +529,7 @@ void rts_setInCallCapability ( #if defined(THREADED_RTS) if (affinity) { if (RtsFlags.ParFlags.setAffinity) { - setThreadAffinity(preferred_capability, n_capabilities); + setThreadAffinity(preferred_capability, getNumCapabilities()); } } #endif diff --git a/rts/Threads.c b/rts/Threads.c index 07d0d0a180..8d59956b50 100644 --- a/rts/Threads.c +++ b/rts/Threads.c @@ -987,7 +987,7 @@ printAllThreads(void) debugBelch("all threads:\n"); - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { cap = capabilities[i]; debugBelch("threads on capability %d:\n", cap->no); for (t = cap->run_queue_hd; t != END_TSO_QUEUE; t = t->_link) { diff --git a/rts/TraverseHeap.c b/rts/TraverseHeap.c index e40e380765..60b6b6fbcd 100644 --- a/rts/TraverseHeap.c +++ b/rts/TraverseHeap.c @@ -1097,7 +1097,7 @@ resetMutableObjects(traverseState* ts) // Traversing through mut_list is necessary // because we can find MUT_VAR objects which have not been // visited during heap traversal. 
- for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { for (bd = capabilities[n]->mut_lists[g]; bd != NULL; bd = bd->link) { for (ml = bd->start; ml < bd->free; ml++) { traverseMaybeInitClosureData(ts, (StgClosure *)*ml); diff --git a/rts/eventlog/EventLog.c b/rts/eventlog/EventLog.c index 30ffe9cf04..7b4150c72d 100644 --- a/rts/eventlog/EventLog.c +++ b/rts/eventlog/EventLog.c @@ -351,7 +351,8 @@ get_n_capabilities(void) { #if defined(THREADED_RTS) // XXX n_capabilities may not have been initialized yet - return (n_capabilities != 0) ? n_capabilities : RtsFlags.ParFlags.nCapabilities; + unsigned int n = getNumCapabilities(); + return (n != 0) ? n : RtsFlags.ParFlags.nCapabilities; #else return 1; #endif @@ -452,7 +453,7 @@ finishCapEventLogging(void) if (eventlog_enabled) { // Flush all events remaining in the capabilities' buffers and free them. // N.B. at this point we hold all capabilities. - for (uint32_t c = 0; c < n_capabilities; ++c) { + for (uint32_t c = 0; c < getNumCapabilities(); ++c) { if (capEventBuf[c].begin != NULL) { printAndClearEventBuf(&capEventBuf[c]); stgFree(capEventBuf[c].begin); @@ -1570,7 +1571,7 @@ void flushAllCapsEventsBufs() printAndClearEventBuf(&eventBuf); RELEASE_LOCK(&eventBufMutex); - for (unsigned int i=0; i < n_capabilities; i++) { + for (unsigned int i=0; i < getNumCapabilities(); i++) { flushLocalEventsBuf(capabilities[i]); } flushEventLogWriter(); @@ -1590,7 +1591,7 @@ void flushEventLog(Capability **cap USED_IF_THREADS) Task *task = getMyTask(); stopAllCapabilitiesWith(cap, task, SYNC_FLUSH_EVENT_LOG); flushAllCapsEventsBufs(); - releaseAllCapabilities(n_capabilities, cap ? *cap : NULL, task); + releaseAllCapabilities(getNumCapabilities(), cap ? *cap : NULL, task); #else flushLocalEventsBuf(capabilities[0]); #endif diff --git a/rts/hooks/LongGCSync.c b/rts/hooks/LongGCSync.c index af56d0cb62..ccc5aa44b7 100644 --- a/rts/hooks/LongGCSync.c +++ b/rts/hooks/LongGCSync.c @@ -20,7 +20,7 @@ void LongGCSync (uint32_t me USED_IF_THREADS, Time t STG_UNUSED) #if defined(THREADED_RTS) { uint32_t i; - for (i=0; i < n_capabilities; i++) { + for (i=0; i < getNumCapabilities(); i++) { if (i != me && SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_STANDING_BY) { debugBelch("Warning: slow GC sync: still waiting for cap %d\n", i); diff --git a/rts/include/rts/Threads.h b/rts/include/rts/Threads.h index 9303a9e80d..035e0d0315 100644 --- a/rts/include/rts/Threads.h +++ b/rts/include/rts/Threads.h @@ -67,10 +67,13 @@ pid_t forkProcess (HsStablePtr *entry) HsBool rtsSupportsBoundThreads (void); // The number of Capabilities. 
-// ToDo: I would like this to be private to the RTS and instead expose a -// function getNumCapabilities(), but it is used in compiler/cbits/genSym.c +// TODO: Ideally we would only provide getNumCapabilities +// but this is used in compiler/cbits/genSym.c extern unsigned int n_capabilities; +INLINE_HEADER unsigned int getNumCapabilities(void) +{ return RELAXED_LOAD(&n_capabilities); } + // The number of Capabilities that are not disabled extern uint32_t enabled_capabilities; diff --git a/rts/posix/Signals.c b/rts/posix/Signals.c index 7d58f3d7a2..8c29564ace 100644 --- a/rts/posix/Signals.c +++ b/rts/posix/Signals.c @@ -202,7 +202,7 @@ ioManagerDie (void) { // Shut down IO managers - for (i=0; i < n_capabilities; i++) { + for (i=0; i < getNumCapabilities(); i++) { const int fd = RELAXED_LOAD(&capabilities[i]->iomgr->control_fd); if (0 <= fd) { r = write(fd, &byte, 1); diff --git a/rts/sm/Compact.c b/rts/sm/Compact.c index 3f8a1fcfd6..aa901ee172 100644 --- a/rts/sm/Compact.c +++ b/rts/sm/Compact.c @@ -993,7 +993,7 @@ compact(StgClosure *static_objects, // mutable lists for (W_ g = 1; g < RtsFlags.GcFlags.generations; g++) { - for (W_ n = 0; n < n_capabilities; n++) { + for (W_ n = 0; n < getNumCapabilities(); n++) { for (bdescr *bd = capabilities[n]->mut_lists[g]; bd != NULL; bd = bd->link) { for (P_ p = bd->start; p < bd->free; p++) { @@ -1039,7 +1039,7 @@ compact(StgClosure *static_objects, debugTrace(DEBUG_gc, "update_fwd: %d", g); update_fwd(gen->blocks); - for (W_ n = 0; n < n_capabilities; n++) { + for (W_ n = 0; n < getNumCapabilities(); n++) { update_fwd(gc_threads[n]->gens[g].todo_bd); update_fwd(gc_threads[n]->gens[g].part_list); } diff --git a/rts/sm/GC.c b/rts/sm/GC.c index b315600c3c..8776f80b51 100644 --- a/rts/sm/GC.c +++ b/rts/sm/GC.c @@ -309,7 +309,7 @@ GarbageCollect (uint32_t collect_gen, #endif #if defined(PROFILING) - CostCentreStack *save_CCS[n_capabilities]; + CostCentreStack *save_CCS[getNumCapabilities()]; #endif ACQUIRE_SM_LOCK; @@ -340,7 +340,7 @@ GarbageCollect (uint32_t collect_gen, // attribute any costs to CCS_GC #if defined(PROFILING) - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { save_CCS[n] = capabilities[n]->r.rCCCS; capabilities[n]->r.rCCCS = CCS_GC; } @@ -396,9 +396,9 @@ GarbageCollect (uint32_t collect_gen, * here */ if (gc_type == SYNC_GC_PAR) { - n_gc_threads = n_capabilities; + n_gc_threads = getNumCapabilities(); n_gc_idle_threads = 0; - for (uint32_t i = 0; i < n_capabilities; ++i) { + for (uint32_t i = 0; i < getNumCapabilities(); ++i) { if (idle_cap[i]) { ASSERT(i != gct->thread_index); ++n_gc_idle_threads; @@ -406,7 +406,7 @@ GarbageCollect (uint32_t collect_gen, } } else { n_gc_threads = 1; - n_gc_idle_threads = n_capabilities - 1; + n_gc_idle_threads = getNumCapabilities() - 1; } work_stealing = RtsFlags.ParFlags.parGcLoadBalancingEnabled && N >= RtsFlags.ParFlags.parGcLoadBalancingGen && @@ -429,15 +429,15 @@ GarbageCollect (uint32_t collect_gen, SEQ_CST_STORE(&gc_running_threads, 0); ASSERT(n_gc_threads > 0); - ASSERT(n_gc_threads <= n_capabilities); - ASSERT(n_gc_idle_threads < n_capabilities); + ASSERT(n_gc_threads <= getNumCapabilities()); + ASSERT(n_gc_idle_threads < getNumCapabilities()); // If we are work stealing, there better be another(i.e. 
not us) non-idle gc // thread ASSERT(!work_stealing || n_gc_threads - 1 > n_gc_idle_threads); debugTrace(DEBUG_gc, "GC (gen %d, using %d thread(s), %s work stealing)", - N, (int)n_capabilities - (int)n_gc_idle_threads, + N, (int)getNumCapabilities() - (int)n_gc_idle_threads, work_stealing ? "with": "without"); #if defined(DEBUG) @@ -504,7 +504,7 @@ GarbageCollect (uint32_t collect_gen, // call back into the GC via mark_root() (due to the gct register // variable). if (!is_par_gc()) { - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { #if defined(THREADED_RTS) scavenge_capability_mut_Lists1(capabilities[n]); #else @@ -513,7 +513,7 @@ GarbageCollect (uint32_t collect_gen, } } else { scavenge_capability_mut_lists(gct->cap); - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { if (idle_cap[n]) { markCapability(mark_root, gct, capabilities[n], true/*don't mark sparks*/); @@ -529,7 +529,7 @@ GarbageCollect (uint32_t collect_gen, // follow all the roots that the application knows about. gct->evac_gen_no = 0; if (!is_par_gc()) { - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { markCapability(mark_root, gct, capabilities[n], true/*don't mark sparks*/); } @@ -572,11 +572,11 @@ GarbageCollect (uint32_t collect_gen, #if defined(THREADED_RTS) if (!is_par_gc()) { - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { pruneSparkQueue(false, capabilities[n]); } } else { - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { if (n == cap->no || idle_cap[n]) { pruneSparkQueue(false, capabilities[n]); } @@ -682,7 +682,7 @@ GarbageCollect (uint32_t collect_gen, // stats. Every mutable list is copied during every GC. 
if (g > 0) { W_ mut_list_size = 0; - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { mut_list_size += countOccupied(capabilities[n]->mut_lists[g]); } copied += mut_list_size; @@ -835,7 +835,7 @@ GarbageCollect (uint32_t collect_gen, // add in the partial blocks in the gen_workspaces { uint32_t i; - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { live_words += gcThreadLiveWords(i, gen->no); live_blocks += gcThreadLiveBlocks(i, gen->no); } @@ -846,7 +846,7 @@ GarbageCollect (uint32_t collect_gen, // flushing] in NonMovingMark.c if (RtsFlags.GcFlags.useNonmoving) { RELEASE_SM_LOCK; - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { nonmovingAddUpdRemSetBlocks(&capabilities[n]->upd_rem_set.queue); } ACQUIRE_SM_LOCK; @@ -1078,7 +1078,7 @@ GarbageCollect (uint32_t collect_gen, // restore enclosing cost centre #if defined(PROFILING) - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { capabilities[n]->r.rCCCS = save_CCS[n]; } #endif @@ -1227,7 +1227,7 @@ freeGcThreads (void) if (gc_threads != NULL) { #if defined(THREADED_RTS) uint32_t i; - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { for (g = 0; g < RtsFlags.GcFlags.generations; g++) { freeWSDeque(gc_threads[i]->gens[g].todo_q); @@ -1457,26 +1457,26 @@ void waitForGcThreads (Capability *cap, bool idle_cap[]) { // n_gc_threads is not valid here, we're too early - uint32_t n_threads = n_capabilities; + uint32_t n_threads = getNumCapabilities(); const uint32_t me = cap->no; uint32_t i, cur_n_gc_entered; Time t0, t1, t2; t0 = t1 = t2 = getProcessElapsedTime(); - for(i = 0; i < n_capabilities; ++i) { + for(i = 0; i < getNumCapabilities(); ++i) { if (i == me || idle_cap[i]) { --n_threads; } } - ASSERT(n_threads < n_capabilities); // must be less because we don't count ourself + ASSERT(n_threads < getNumCapabilities()); // must be less because we don't count ourself if(n_threads == 0) { return; } ACQUIRE_LOCK(&gc_entry_mutex); while((cur_n_gc_entered = SEQ_CST_LOAD(&n_gc_entered)) != n_threads) { ASSERT(cur_n_gc_entered < n_threads); - for(i = 0; i < n_capabilities; ++i) { + for(i = 0; i < getNumCapabilities(); ++i) { if (i == me || idle_cap[i]) { continue; } if (SEQ_CST_LOAD(&gc_threads[i]->wakeup) != GC_THREAD_STANDING_BY) { prodCapability(capabilities[i], cap->running_task); @@ -1566,7 +1566,7 @@ shutdown_gc_threads (uint32_t me USED_IF_THREADS USED_IF_DEBUG, } #if defined(DEBUG) uint32_t i; - for (i=0; i < n_capabilities; i++) { + for (i=0; i < getNumCapabilities(); i++) { if (i == me || idle_cap[i]) continue; ASSERT(SEQ_CST_LOAD(&gc_threads[i]->wakeup) == GC_THREAD_WAITING_TO_CONTINUE); } @@ -1579,7 +1579,7 @@ shutdown_gc_threads (uint32_t me USED_IF_THREADS USED_IF_DEBUG, void releaseGCThreads (Capability *cap USED_IF_THREADS, bool idle_cap[]) { - const uint32_t n_threads = n_capabilities; + const uint32_t n_threads = getNumCapabilities(); const uint32_t me = cap->no; uint32_t i; uint32_t num_idle = 0; @@ -1631,14 +1631,14 @@ prepare_collected_gen (generation *gen) if (RtsFlags.GcFlags.useNonmoving && g == oldest_gen->no) { // Nonmoving heap's mutable list is always a root. - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { stash_mut_list(capabilities[i], g); } } else if (g != 0) { // Otherwise throw away the current mutable list. 
Invariant: the // mutable list always has at least one block; this means we can avoid // a check for NULL in recordMutable(). - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { bdescr *old = RELAXED_LOAD(&capabilities[i]->mut_lists[g]); freeChain(old); @@ -1675,7 +1675,7 @@ prepare_collected_gen (generation *gen) // grab all the partial blocks stashed in the gc_thread workspaces and // move them to the old_blocks list of this gen. - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { ws = &gc_threads[n]->gens[gen->no]; for (bd = ws->part_list; bd != NULL; bd = next) { @@ -1774,7 +1774,7 @@ prepare_uncollected_gen (generation *gen) // save the current mutable lists for this generation, and // allocate a fresh block for each one. We'll traverse these // mutable lists as roots early on in the GC. - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { stash_mut_list(capabilities[i], gen->no); } @@ -1848,7 +1848,7 @@ collect_pinned_object_blocks (void) const bool use_nonmoving = RtsFlags.GcFlags.useNonmoving; generation *const gen = (use_nonmoving && major_gc) ? oldest_gen : g0; - for (uint32_t n = 0; n < n_capabilities; n++) { + for (uint32_t n = 0; n < getNumCapabilities(); n++) { bdescr *last = NULL; if (use_nonmoving && gen == oldest_gen) { // Mark objects as belonging to the nonmoving heap @@ -1964,7 +1964,7 @@ resizeGenerations (void) // minimum size for generation zero min_alloc = stg_max((RtsFlags.GcFlags.pcFreeHeap * max) / 200, RtsFlags.GcFlags.minAllocAreaSize - * (W_)n_capabilities); + * (W_)getNumCapabilities()); // Auto-enable compaction when the residency reaches a // certain percentage of the maximum heap size (default: 30%). @@ -2036,7 +2036,7 @@ static void resize_nursery (void) { const StgWord min_nursery = - RtsFlags.GcFlags.minAllocAreaSize * (StgWord)n_capabilities; + RtsFlags.GcFlags.minAllocAreaSize * (StgWord)getNumCapabilities(); if (RtsFlags.GcFlags.generations == 1) { // Two-space collector: diff --git a/rts/sm/MarkWeak.c b/rts/sm/MarkWeak.c index 2d4482501a..9883def2a8 100644 --- a/rts/sm/MarkWeak.c +++ b/rts/sm/MarkWeak.c @@ -385,7 +385,7 @@ void collectFreshWeakPtrs() { uint32_t i; // move recently allocated weak_ptr_list to the old list as well - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { Capability *cap = capabilities[i]; if (cap->weak_ptr_list_tl != NULL) { IF_DEBUG(sanity, checkWeakPtrSanity(cap->weak_ptr_list_hd, cap->weak_ptr_list_tl)); diff --git a/rts/sm/NonMoving.c b/rts/sm/NonMoving.c index 4a17386324..f1e2e73fff 100644 --- a/rts/sm/NonMoving.c +++ b/rts/sm/NonMoving.c @@ -732,7 +732,7 @@ void nonmovingInit(void) initMutex(&concurrent_coll_finished_lock); #endif for (unsigned int i = 0; i < NONMOVING_ALLOCA_CNT; i++) { - nonmovingHeap.allocators[i] = alloc_nonmoving_allocator(n_capabilities); + nonmovingHeap.allocators[i] = alloc_nonmoving_allocator(getNumCapabilities()); } nonmovingMarkInitUpdRemSet(); } @@ -825,7 +825,7 @@ static void nonmovingPrepareMark(void) struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx]; // Update current segments' snapshot pointers - for (uint32_t cap_n = 0; cap_n < n_capabilities; ++cap_n) { + for (uint32_t cap_n = 0; cap_n < getNumCapabilities(); ++cap_n) { struct NonmovingSegment *seg = alloca->current[cap_n]; nonmovingSegmentInfo(seg)->next_free_snap = seg->next_free; } @@ -944,7 +944,7 @@ void nonmovingCollect(StgWeak **dead_weaks, StgTSO 
**resurrected_threads) // Mark roots trace(TRACE_nonmoving_gc, "Marking roots for nonmoving GC"); markCAFs((evac_fn)markQueueAddRoot, mark_queue); - for (unsigned int n = 0; n < n_capabilities; ++n) { + for (unsigned int n = 0; n < getNumCapabilities(); ++n) { markCapability((evac_fn)markQueueAddRoot, mark_queue, capabilities[n], true/*don't mark sparks*/); } @@ -1188,7 +1188,7 @@ static void nonmovingMark_(MarkQueue *mark_queue, StgWeak **dead_weaks, StgTSO * // Prune spark lists // See Note [Spark management under the nonmoving collector]. #if defined(THREADED_RTS) - for (uint32_t n = 0; n < n_capabilities; n++) { + for (uint32_t n = 0; n < getNumCapabilities(); n++) { pruneSparkQueue(true, capabilities[n]); } #endif @@ -1264,7 +1264,7 @@ void assert_in_nonmoving_heap(StgPtr p) bdescr *bd = Bdescr(p); if (bd->flags & BF_LARGE) { // It should be in a capability (if it's not filled yet) or in non-moving heap - for (uint32_t cap = 0; cap < n_capabilities; ++cap) { + for (uint32_t cap = 0; cap < getNumCapabilities(); ++cap) { if (bd == capabilities[cap]->pinned_object_block) { return; } @@ -1283,7 +1283,7 @@ void assert_in_nonmoving_heap(StgPtr p) for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) { struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx]; // Search current segments - for (uint32_t cap_idx = 0; cap_idx < n_capabilities; ++cap_idx) { + for (uint32_t cap_idx = 0; cap_idx < getNumCapabilities(); ++cap_idx) { struct NonmovingSegment *seg = alloca->current[cap_idx]; if (p >= (P_)seg && p < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) { return; @@ -1355,7 +1355,7 @@ void nonmovingPrintAllocator(struct NonmovingAllocator *alloc) debugBelch("%p ", (void*)seg); } debugBelch("\nCurrent segments:\n"); - for (uint32_t i = 0; i < n_capabilities; ++i) { + for (uint32_t i = 0; i < getNumCapabilities(); ++i) { debugBelch("%p ", alloc->current[i]); } debugBelch("\n"); @@ -1366,7 +1366,7 @@ void locate_object(P_ obj) // Search allocators for (int alloca_idx = 0; alloca_idx < NONMOVING_ALLOCA_CNT; ++alloca_idx) { struct NonmovingAllocator *alloca = nonmovingHeap.allocators[alloca_idx]; - for (uint32_t cap = 0; cap < n_capabilities; ++cap) { + for (uint32_t cap = 0; cap < getNumCapabilities(); ++cap) { struct NonmovingSegment *seg = alloca->current[cap]; if (obj >= (P_)seg && obj < (((P_)seg) + NONMOVING_SEGMENT_SIZE_W)) { debugBelch("%p is in current segment of capability %d of allocator %d at %p\n", obj, cap, alloca_idx, (void*)seg); @@ -1497,7 +1497,7 @@ void nonmovingPrintSweepList() void check_in_mut_list(StgClosure *p) { - for (uint32_t cap_n = 0; cap_n < n_capabilities; ++cap_n) { + for (uint32_t cap_n = 0; cap_n < getNumCapabilities(); ++cap_n) { for (bdescr *bd = capabilities[cap_n]->mut_lists[oldest_gen->no]; bd; bd = bd->link) { for (StgPtr q = bd->start; q < bd->free; ++q) { if (*((StgPtr**)q) == (StgPtr*)p) { diff --git a/rts/sm/NonMovingCensus.c b/rts/sm/NonMovingCensus.c index 2dcec4b745..41b6f7433b 100644 --- a/rts/sm/NonMovingCensus.c +++ b/rts/sm/NonMovingCensus.c @@ -56,7 +56,7 @@ nonmovingAllocatorCensus_(struct NonmovingAllocator *alloc, bool collect_live_wo } } - for (unsigned int cap=0; cap < n_capabilities; cap++) + for (unsigned int cap=0; cap < getNumCapabilities(); cap++) { struct NonmovingSegment *seg = alloc->current[cap]; unsigned int n = nonmovingSegmentBlockCount(seg); diff --git a/rts/sm/NonMovingMark.c b/rts/sm/NonMovingMark.c index 91708c84f9..35780fcd49 100644 --- a/rts/sm/NonMovingMark.c +++ b/rts/sm/NonMovingMark.c @@ 
-323,7 +323,7 @@ void nonmovingBeginFlush(Task *task) // task suspended due to a foreign call) in which case our requestSync // logic won't have been hit. Make sure that everyone so far has flushed. // Ideally we want to mark asynchronously with syncing. - for (uint32_t i = 0; i < n_capabilities; i++) { + for (uint32_t i = 0; i < getNumCapabilities(); i++) { nonmovingFlushCapUpdRemSetBlocks(capabilities[i]); } } @@ -335,7 +335,7 @@ bool nonmovingWaitForFlush() { ACQUIRE_LOCK(&upd_rem_set_lock); debugTrace(DEBUG_nonmoving_gc, "Flush count %d", upd_rem_set_flush_count); - bool finished = upd_rem_set_flush_count == n_capabilities; + bool finished = upd_rem_set_flush_count == getNumCapabilities(); if (!finished) { waitCondition(&upd_rem_set_flushed_cond, &upd_rem_set_lock); } @@ -398,7 +398,7 @@ bool nonmovingWaitForFlush() void nonmovingFinishFlush(Task *task) { // See Note [Unintentional marking in resurrectThreads] - for (uint32_t i = 0; i < n_capabilities; i++) { + for (uint32_t i = 0; i < getNumCapabilities(); i++) { reset_upd_rem_set(&capabilities[i]->upd_rem_set); } // Also reset upd_rem_set_block_list in case some of the UpdRemSets were @@ -409,7 +409,7 @@ void nonmovingFinishFlush(Task *task) debugTrace(DEBUG_nonmoving_gc, "Finished update remembered set flush..."); traceConcSyncEnd(); stat_endNonmovingGcSync(); - releaseAllCapabilities(n_capabilities, NULL, task); + releaseAllCapabilities(getNumCapabilities(), NULL, task); } #endif @@ -1361,7 +1361,7 @@ mark_closure (MarkQueue *queue, const StgClosure *p0, StgClosure **origin) else if (bd->flags & BF_PINNED) { #if defined(DEBUG) bool found_it = false; - for (uint32_t i = 0; i < n_capabilities; ++i) { + for (uint32_t i = 0; i < getNumCapabilities(); ++i) { if (capabilities[i]->pinned_object_block == bd) { found_it = true; break; diff --git a/rts/sm/NonMovingSweep.c b/rts/sm/NonMovingSweep.c index 5c4752d4a3..977af49b04 100644 --- a/rts/sm/NonMovingSweep.c +++ b/rts/sm/NonMovingSweep.c @@ -279,7 +279,7 @@ dirty_BLOCKING_QUEUE: /* N.B. This happens during the pause so we own all capabilities. 
*/ void nonmovingSweepMutLists() { - for (uint32_t n = 0; n < n_capabilities; n++) { + for (uint32_t n = 0; n < getNumCapabilities(); n++) { Capability *cap = capabilities[n]; bdescr *old_mut_list = cap->mut_lists[oldest_gen->no]; cap->mut_lists[oldest_gen->no] = allocBlockOnNode_lock(cap->node); diff --git a/rts/sm/Sanity.c b/rts/sm/Sanity.c index 06a1ddcc91..461edd2171 100644 --- a/rts/sm/Sanity.c +++ b/rts/sm/Sanity.c @@ -635,7 +635,7 @@ void checkNonmovingHeap (const struct NonmovingHeap *heap) const struct NonmovingAllocator *alloc = heap->allocators[i]; checkNonmovingSegments(alloc->filled); checkNonmovingSegments(alloc->active); - for (unsigned int cap=0; cap < n_capabilities; cap++) { + for (unsigned int cap=0; cap < getNumCapabilities(); cap++) { checkNonmovingSegments(alloc->current[cap]); } } @@ -843,7 +843,7 @@ static void checkMutableLists (void) { uint32_t i; - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { checkLocalMutableLists(i); } } @@ -962,7 +962,7 @@ static void checkGeneration (generation *gen, checkHeapChain(gen->blocks); - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { ws = &gc_threads[n]->gens[gen->no]; checkHeapChain(ws->todo_bd); checkHeapChain(ws->part_list); @@ -987,7 +987,7 @@ static void checkFullHeap (bool after_major_gc) for (g = 0; g < RtsFlags.GcFlags.generations; g++) { checkGeneration(&generations[g], after_major_gc); } - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { checkNurserySanity(&nurseries[n]); } } @@ -1035,7 +1035,7 @@ findMemoryLeak (void) { uint32_t g, i, j; for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { markBlocks(capabilities[i]->mut_lists[g]); markBlocks(gc_threads[i]->gens[g].part_list); markBlocks(gc_threads[i]->gens[g].scavd_list); @@ -1050,7 +1050,7 @@ findMemoryLeak (void) markBlocks(nurseries[i].blocks); } - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { markBlocks(gc_threads[i]->free_blocks); markBlocks(capabilities[i]->pinned_object_block); markBlocks(capabilities[i]->upd_rem_set.queue.blocks); @@ -1066,7 +1066,7 @@ findMemoryLeak (void) struct NonmovingAllocator *alloc = nonmovingHeap.allocators[i]; markNonMovingSegments(alloc->filled); markNonMovingSegments(alloc->active); - for (j = 0; j < n_capabilities; j++) { + for (j = 0; j < getNumCapabilities(); j++) { markNonMovingSegments(alloc->current[j]); } } @@ -1177,7 +1177,7 @@ countNonMovingAllocator(struct NonmovingAllocator *alloc) { W_ ret = countNonMovingSegments(alloc->filled) + countNonMovingSegments(alloc->active); - for (uint32_t i = 0; i < n_capabilities; ++i) { + for (uint32_t i = 0; i < getNumCapabilities(); ++i) { ret += countNonMovingSegments(alloc->current[i]); } return ret; @@ -1219,7 +1219,7 @@ memInventory (bool show) for (g = 0; g < RtsFlags.GcFlags.generations; g++) { gen_blocks[g] = 0; - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { gen_blocks[g] += countBlocks(capabilities[i]->mut_lists[g]); gen_blocks[g] += countBlocks(gc_threads[i]->gens[g].part_list); gen_blocks[g] += countBlocks(gc_threads[i]->gens[g].scavd_list); @@ -1232,7 +1232,7 @@ memInventory (bool show) ASSERT(countBlocks(nurseries[i].blocks) == nurseries[i].n_blocks); nursery_blocks += nurseries[i].n_blocks; } - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { W_ n = 
countBlocks(gc_threads[i]->free_blocks); gc_free_blocks += n; if (capabilities[i]->pinned_object_block != NULL) { @@ -1258,7 +1258,7 @@ memInventory (bool show) free_blocks = countFreeList(); // count UpdRemSet blocks - for (i = 0; i < n_capabilities; ++i) { + for (i = 0; i < getNumCapabilities(); ++i) { upd_rem_set_blocks += countBlocks(capabilities[i]->upd_rem_set.queue.blocks); } upd_rem_set_blocks += countBlocks(upd_rem_set_block_list); diff --git a/rts/sm/Storage.c b/rts/sm/Storage.c index dc27284500..99a8c18033 100644 --- a/rts/sm/Storage.c +++ b/rts/sm/Storage.c @@ -225,7 +225,7 @@ initStorage (void) #endif if (RtsFlags.GcFlags.useNonmoving) - nonmovingAddCapabilities(n_capabilities); + nonmovingAddCapabilities(getNumCapabilities()); /* The oldest generation has one step. */ if (RtsFlags.GcFlags.compact || RtsFlags.GcFlags.sweep) { @@ -257,7 +257,7 @@ initStorage (void) for (n = 0; n < n_numa_nodes; n++) { next_nursery[n] = n; } - storageAddCapabilities(0, n_capabilities); + storageAddCapabilities(0, getNumCapabilities()); IF_DEBUG(gc, statDescribeGens()); @@ -375,7 +375,7 @@ void listAllBlocks (ListBlocksCb cb, void *user) { uint32_t g, i; for (g = 0; g < RtsFlags.GcFlags.generations; g++) { - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { cb(user, capabilities[i]->mut_lists[g]); cb(user, gc_threads[i]->gens[g].part_list); cb(user, gc_threads[i]->gens[g].scavd_list); @@ -387,7 +387,7 @@ void listAllBlocks (ListBlocksCb cb, void *user) for (i = 0; i < n_nurseries; i++) { cb(user, nurseries[i].blocks); } - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { if (capabilities[i]->pinned_object_block != NULL) { cb(user, capabilities[i]->pinned_object_block); } @@ -816,7 +816,7 @@ resetNurseries (void) for (n = 0; n < n_numa_nodes; n++) { next_nursery[n] = n; } - assignNurseriesToCapabilities(0, n_capabilities); + assignNurseriesToCapabilities(0, getNumCapabilities()); #if defined(DEBUG) bdescr *bd; @@ -1568,7 +1568,7 @@ calcTotalAllocated (void) uint64_t tot_alloc = 0; W_ n; - for (n = 0; n < n_capabilities; n++) { + for (n = 0; n < getNumCapabilities(); n++) { tot_alloc += capabilities[n]->total_allocated; traceEventHeapAllocated(capabilities[n], @@ -1589,7 +1589,7 @@ updateNurseriesStats (void) uint32_t i; bdescr *bd; - for (i = 0; i < n_capabilities; i++) { + for (i = 0; i < getNumCapabilities(); i++) { // The current nursery block and the current allocate block have not // yet been accounted for in cap->total_allocated, so we add them here. bd = capabilities[i]->r.rCurrentNursery; |
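
Most of the call-site changes above follow the same mechanical shape: loops, assertions, and size computations that previously read n_capabilities now call the accessor instead. A simplified, self-contained illustration of that loop pattern follows; the Capability struct, capabilities array, and spark_count field are stand-ins for this sketch, not the real RTS definitions.

#include <stdio.h>

/* Hypothetical, simplified stand-ins for the RTS types and globals. */
typedef struct { unsigned int no; unsigned int spark_count; } Capability;

static unsigned int n_capabilities = 4;
static Capability caps[4] = {{0, 3}, {1, 0}, {2, 7}, {3, 1}};
static Capability *capabilities[4] = {&caps[0], &caps[1], &caps[2], &caps[3]};

/* Same accessor shape as in rts/include/rts/Threads.h: a relaxed load. */
static inline unsigned int getNumCapabilities(void)
{
    return __atomic_load_n(&n_capabilities, __ATOMIC_RELAXED);
}

int main(void)
{
    /* Old:  for (i = 0; i < n_capabilities; i++) ...
     * New:  for (i = 0; i < getNumCapabilities(); i++) ...
     * so the read of the capability count is an explicit atomic load. */
    unsigned int total = 0;
    for (unsigned int i = 0; i < getNumCapabilities(); i++) {
        total += capabilities[i]->spark_count;
    }
    printf("sparks across %u capabilities: %u\n", getNumCapabilities(), total);
    return 0;
}
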