5 files changed, 305 insertions, 12 deletions
diff --git a/docs/users_guide/8.2.1-notes.rst b/docs/users_guide/8.2.1-notes.rst
index 3e13f57c57..033f8da483 100644
--- a/docs/users_guide/8.2.1-notes.rst
+++ b/docs/users_guide/8.2.1-notes.rst
@@ -71,6 +71,10 @@ Runtime system
   event log, allowing heap profiles to be correlated with other tracing events
   (see :ghc-ticket:`11094`).
 
+- Added processor group support for Windows. This allows the runtime to allocate
+  threads to all cores in systems which have multiple processor groups
+  (e.g. more than 64 cores; see :ghc-ticket:`11054`).
+
 Build system
 ~~~~~~~~~~~~
 
diff --git a/includes/rts/OSThreads.h b/includes/rts/OSThreads.h
index bc84b714db..2eb58971ca 100644
--- a/includes/rts/OSThreads.h
+++ b/includes/rts/OSThreads.h
@@ -221,6 +221,11 @@ void releaseThreadNode (void);
 int forkOS_createThread ( HsStablePtr entry );
 
 //
+// Free any global resources created in OSThreads.
+//
+void freeThreadingResources(void);
+
+//
 // Returns the number of processor cores in the machine
 //
 uint32_t getNumberOfProcessors (void);
diff --git a/rts/RtsStartup.c b/rts/RtsStartup.c
index a2630b2f8f..7c2be0d7ef 100644
--- a/rts/RtsStartup.c
+++ b/rts/RtsStartup.c
@@ -434,6 +434,11 @@ hs_exit_(rtsBool wait_foreign)
 
     // Free the various argvs
     freeRtsArgs();
+
+#ifndef CMINUSMINUS
+    // Free threading resources
+    freeThreadingResources();
+#endif
 }
 
 // Flush stdout and stderr.  We do this during shutdown so that it
diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c
index 112a311f79..8c7c8f0e24 100644
--- a/rts/posix/OSThreads.c
+++ b/rts/posix/OSThreads.c
@@ -231,6 +231,8 @@ forkOS_createThread ( HsStablePtr entry )
     return result;
 }
 
+void freeThreadingResources (void) { /* no-op: nothing is freed here */ }
+
 uint32_t
 getNumberOfProcessors (void)
 {
@@ -334,6 +336,7 @@ void releaseThreadNode (void)
         stg_exit(1);
     }
 }
+
 #else
 void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ }
 void releaseThreadNode (void) { /* nothing */ }
@@ -353,6 +356,8 @@ forkOS_createThread ( HsStablePtr entry STG_UNUSED )
     return -1;
 }
 
+void freeThreadingResources (void) { /* no-op: nothing is freed here */ }
+
 uint32_t getNumberOfProcessors (void)
 {
     return 1;
diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c
index 78fe29784b..c9b594a208 100644
--- a/rts/win32/OSThreads.c
+++ b/rts/win32/OSThreads.c
@@ -18,6 +18,14 @@
 #undef __STRICT_ANSI__
 #include <process.h>
 
+
+/* Processor group info cache.  */
+static uint8_t* cpuGroupCache = NULL;
+/* Processor group cumulative summary cache.  */
+static uint32_t* cpuGroupCumulativeCache = NULL;
+/* Processor group distribution cache (active processors per group).  */
+static uint8_t* cpuGroupDistCache = NULL;
+
 /* Win32 threads and synchronisation objects */
 
 /* A Condition is represented by a Win32 Event object;
@@ -242,11 +250,222 @@ forkOS_createThread ( HsStablePtr entry )
                            (unsigned*)&pId) == 0);
 }
 
+#if x86_64_HOST_ARCH
+/* We still support Windows Vista, so we can't depend on these APIs being
+   present and must resolve them manually at run time. */
+typedef DWORD(WINAPI *GetItemCountProc)(WORD);
+typedef DWORD(WINAPI *GetGroupCountProc)(void);
+typedef BOOL(WINAPI *SetThreadGroupAffinityProc)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);
+#ifndef ALL_PROCESSOR_GROUPS
+#define ALL_PROCESSOR_GROUPS 0xffff
+#endif
+#endif
+
+/* Release the processor group lookup tables cached in this file:
+     - cpuGroupCache:           processor index -> group map
+     - cpuGroupCumulativeCache: processors preceding each group
+     - cpuGroupDistCache:       active processors per group
+
+   Every pointer is reset to NULL afterwards, so a second call is
+   harmless and a later lookup simply rebuilds the table it needs.  */
+void freeThreadingResources (void)
+{
+    /* free(NULL) is a no-op, so the individual calls
+       need no guards.  */
+    free(cpuGroupCache);
+    cpuGroupCache = NULL;
+
+    free(cpuGroupCumulativeCache);
+    cpuGroupCumulativeCache = NULL;
+
+    free(cpuGroupDistCache);
+    cpuGroupDistCache = NULL;
+}
+
+/* Processor groups are not guaranteed to be uniformly distributed
+   nor guaranteed to be filled before a next group is needed.
+   The OS will assign processors to groups based on physical proximity
+   and will never partially assign cores from one physical cpu to more
+   than one group. If one has two 48 core CPUs then you'd end up with
+   two groups of 48 logical cpus. Now add a 3rd CPU with 10 cores and
+   the group it is assigned to depends on where the socket is on the board.
+
+   So we need to make a map of where the CPUs reside and how the groups are filled.
+   Since groups are created at boot time by the kernel, we can cache this information.
+
+   NOTE: This code does not support hot-swapping cores as it's caching the information.
+   If you activate a new core you have to restart the program. This builds a
+   simple lookup array for cpu -> group indexes. This gives O(1) lookup against
+   O(n) space. But n is < 256 so we'll only use 256 bytes of extra memory. */
+
+/* Returns the number of active processor groups, cached after the first
+   call.  Reports a single group when the count cannot be queried.  */
+static uint8_t
+getNumberOfProcessorsGroups (void)
+{
+    /* Group count cache; 0 means not determined yet.  */
+    static uint8_t n_groups = 0;
+
+#if x86_64_HOST_ARCH
+    if (!n_groups) {
+        /* We still support Windows Vista, which lacks this API, so resolve
+           it manually and only call it when it was actually found.  */
+        HMODULE kernel = GetModuleHandleW(L"kernel32");
+
+        GetGroupCountProc GetActiveProcessorGroupCount = (GetGroupCountProc)GetProcAddress(kernel, "GetActiveProcessorGroupCount");
+        if (GetActiveProcessorGroupCount) {
+            n_groups = GetActiveProcessorGroupCount();
+            IF_DEBUG(scheduler, debugBelch("[*] Number of processor groups detected: %u\n", n_groups));
+        }
+    }
+#endif
+
+    if (!n_groups) {
+        n_groups = 1;
+    }
+
+    return n_groups;
+}
+
+/* Returns the cached array of active processor counts, one entry per
+   processor group, building it on first use.  */
+static uint8_t*
+getProcessorsDistribution (void)
+{
+    if (cpuGroupDistCache) {
+        return cpuGroupDistCache;
+    }
+
+    uint8_t n_groups = getNumberOfProcessorsGroups();
+    cpuGroupDistCache = malloc(n_groups * sizeof(uint8_t));
+    if (!cpuGroupDistCache) {
+        sysErrorBelch("getProcessorsDistribution: malloc");
+        stg_exit(EXIT_FAILURE);
+    }
+    /* Default that stays in place when the per-group count is not queried.  */
+    memset(cpuGroupDistCache, MAXIMUM_PROCESSORS, n_groups * sizeof(uint8_t));
+
+#if x86_64_HOST_ARCH
+    /* We still support Windows Vista. Which means we can't rely
+       on the API being available. So we'll have to resolve manually.  */
+    HMODULE kernel = GetModuleHandleW(L"kernel32");
+    GetItemCountProc GetActiveProcessorCount = (GetItemCountProc)GetProcAddress(kernel, "GetActiveProcessorCount");
+
+    if (GetActiveProcessorCount) {
+        for (int i = 0; i < n_groups; i++) {
+            cpuGroupDistCache[i] = GetActiveProcessorCount(i);
+            IF_DEBUG(scheduler, debugBelch("[*] Number of active processors in group %u detected: %u\n", i, cpuGroupDistCache[i]));
+        }
+    }
+#endif
+
+    return cpuGroupDistCache;
+}
+
+/* Returns the cached table whose entry i counts the processors in groups 0..i-1.  */
+static uint32_t*
+getProcessorsCumulativeSum(void)
+{
+    if (cpuGroupCumulativeCache) {
+        return cpuGroupCumulativeCache;
+    }
+
+    uint8_t n_groups = getNumberOfProcessorsGroups();
+    cpuGroupCumulativeCache = malloc(n_groups * sizeof(uint32_t));
+    if (!cpuGroupCumulativeCache) {
+        sysErrorBelch("getProcessorsCumulativeSum: malloc");
+        stg_exit(EXIT_FAILURE);
+    }
+    memset(cpuGroupCumulativeCache, 0, n_groups * sizeof(uint32_t));
+
+    uint8_t* proc_dist = getProcessorsDistribution();
+    uint32_t cum_num_proc = 0;
+#if x86_64_HOST_ARCH
+    for (int i = 0; i < n_groups; i++) {
+        cpuGroupCumulativeCache[i] = cum_num_proc;
+        cum_num_proc += proc_dist[i];
+        IF_DEBUG(scheduler, debugBelch("[*] Cumulative active processors for group %u: %u\n", i, cpuGroupCumulativeCache[i]));
+    }
+#endif
+
+    return cpuGroupCumulativeCache;
+}
+
+/*
+ Because processors can be distributed rather unpredictably inside
+ processor groups, we need to keep track of which processors are in
+ which group to be able to determine which mask to set and which bit
+ in the mask to set.
+
+ This involves the typical trade-off between speed and memory
+ usage. In this case speed is prioritized.
+
+ This function will generate an array where each index is a processor
+ and the value of the array the group it belongs to. This allows us to
+ in constant time determine where a processor is.
+ */
+static uint8_t*
+createProcessorGroupMap (void)
+{
+    if (cpuGroupCache) {
+        return cpuGroupCache;
+    }
+
+    uint32_t numProcs = getNumberOfProcessors();
+    cpuGroupCache = malloc(numProcs * sizeof(uint8_t));
+    if (!cpuGroupCache) {
+        sysErrorBelch("createProcessorGroupMap: malloc");
+        stg_exit(EXIT_FAILURE);
+    }
+    /* For 32bit Windows and 64bit older than Windows 7, create a default mapping. */
+    memset(cpuGroupCache, 0, numProcs * sizeof(uint8_t));
+
+#if x86_64_HOST_ARCH
+    uint8_t* proc_dist  = getProcessorsDistribution();
+    uint8_t  nGroups    = getNumberOfProcessorsGroups();
+    uint32_t totalProcs = 0;
+    for (int group = 0; group < nGroups && totalProcs < numProcs; group++) {
+        /* Clamp: the group sizes (or their defaults) may exceed numProcs.  */
+        uint32_t room  = numProcs - totalProcs;
+        uint32_t nProc = proc_dist[group] < room ? proc_dist[group] : room;
+        memset(cpuGroupCache + totalProcs, group, nProc * sizeof(uint8_t));
+        totalProcs += nProc;
+    }
+    IF_DEBUG(scheduler, debugBelch("[*] Processor group map created\n"));
+#endif
+
+    return cpuGroupCache;
+}
+
 uint32_t
 getNumberOfProcessors (void)
 {
     static uint32_t nproc = 0;
 
+#if x86_64_HOST_ARCH
+    /* We still support Windows Vista. Which means we can't rely
+       on the API being available. So we'll have to resolve manually.  */
+    HMODULE kernel = GetModuleHandleW(L"kernel32");
+
+    GetItemCountProc GetActiveProcessorCount = (GetItemCountProc)GetProcAddress(kernel, "GetActiveProcessorCount");
+    if (GetActiveProcessorCount && !nproc)
+    {
+        nproc = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+
+        if (nproc)
+        {
+            IF_DEBUG(scheduler, debugBelch("[*] Total number of active processors detected: %u\n", nproc));
+            return nproc;
+        }
+
+        IF_DEBUG(scheduler, debugBelch("Could not determine Max number of logical processors.\n"
+                                       "Falling back to old code which limits to 64 logical processors.\n"));
+    }
+#endif
+
+    /* This will return the maximum number of processors
+       within one processor group. It's also slower
+       so use it only when needed.  */
     if (nproc == 0) {
         SYSTEM_INFO si;
         GetSystemInfo(&si);
@@ -259,24 +478,77 @@ getNumberOfProcessors (void)
 void
 setThreadAffinity (uint32_t n, uint32_t m) // cap N of M
 {
+    /* Request that the calling thread only run on processors n, n+m, n+2m, ...  */
+    ASSERT(n <= m);
     HANDLE hThread;
-    DWORD_PTR mask, r;  // 64-bit win is required to handle more than 32 procs
-    uint32_t nproc, i;
+    DWORD_PTR *mask, r;  // 64-bit win is required to handle more than 32 procs
+                         // and Windows 7+ required for more than 64 procs
+    uint32_t n_proc, i;
+    uint8_t* proc_map      = createProcessorGroupMap();
+    uint32_t n_groups      = getNumberOfProcessorsGroups();
+    uint32_t* proc_cum     = getProcessorsCumulativeSum();
+    n_proc                 = getNumberOfProcessors();
+    hThread                = GetCurrentThread();
+
+    ASSERT(proc_map         );
+    ASSERT(proc_cum         );
+    ASSERT(hThread          );
+    ASSERT(n_groups      > 0);
+    ASSERT(n_proc        > 0);
+
+    mask = malloc(n_groups * sizeof(DWORD_PTR));
+    if (mask == NULL) {
+        sysErrorBelch("setThreadAffinity: malloc");
+        stg_exit(EXIT_FAILURE);
+    }
+    memset(mask, 0, n_groups * sizeof(DWORD_PTR));
+    /* Every group has its own zero-based mask.  The 1 is widened to DWORD_PTR
+       before shifting so that bits 31 and up can be set as well.  */
+    for (i = n; i < n_proc; i += m) {
+        uint8_t group = proc_map[i];
+        mask[group] |= (DWORD_PTR)1 << (i - proc_cum[group]);
+    }
 
-    hThread = GetCurrentThread();
+#if x86_64_HOST_ARCH
+    /* We still support Windows Vista. Which means we can't rely
+       on the API being available. So we'll have to resolve manually.  */
+    HMODULE kernel = GetModuleHandleW(L"kernel32");
 
-    nproc = getNumberOfProcessors();
+    SetThreadGroupAffinityProc SetThreadGroupAffinity = (SetThreadGroupAffinityProc)GetProcAddress(kernel, "SetThreadGroupAffinity");
+#endif
 
-    mask = 0;
-    for (i = n; i < nproc; i+=m) {
-        mask |= 1 << i;
+    for (i = 0; i < n_groups; i++)
+    {
+#if x86_64_HOST_ARCH
+        // If we support the new API, use it.
+        if (mask[i] > 0 && SetThreadGroupAffinity)
+        {
+            GROUP_AFFINITY hGroup;
+            ZeroMemory(&hGroup, sizeof(hGroup));
+            hGroup.Mask = mask[i];
+            hGroup.Group = i;
+            // NOTE(review): a thread has one group affinity, so the last group set wins - confirm.
+            if (!SetThreadGroupAffinity(hThread, &hGroup, NULL))
+            {
+                sysErrorBelch("SetThreadGroupAffinity");
+            }
+
+            continue;
+        }
+#endif
+        // Fall-back methods. Only do it if there's a mask to set
+        if (mask[i] > 0)
+        {
+            r = SetThreadAffinityMask(hThread, mask[i]);
+            if (r == 0) {
+                sysErrorBelch("SetThreadAffinity");  // report before free() can disturb the error state
+                free(mask);
+                stg_exit(EXIT_FAILURE);
+            }
+        }
     }
 
-    r = SetThreadAffinityMask(hThread, mask);
-    if (r == 0) {
-        sysErrorBelch("SetThreadAffinity");
-        stg_exit(EXIT_FAILURE);
-    }
+    free(mask);
 }
 
 typedef BOOL (WINAPI *PCSIO)(HANDLE);
@@ -311,6 +583,8 @@ forkOS_createThread ( HsStablePtr entry STG_UNUSED )
     return -1;
 }
 
+void freeThreadingResources (void) { /* no-op: nothing is freed here */ }
+
 uint32_t getNumberOfProcessors (void)
 {
     return 1;