summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/users_guide/8.2.1-notes.rst4
-rw-r--r--includes/rts/OSThreads.h5
-rw-r--r--rts/RtsStartup.c5
-rw-r--r--rts/posix/OSThreads.c5
-rw-r--r--rts/win32/OSThreads.c298
5 files changed, 305 insertions, 12 deletions
diff --git a/docs/users_guide/8.2.1-notes.rst b/docs/users_guide/8.2.1-notes.rst
index 3e13f57c57..033f8da483 100644
--- a/docs/users_guide/8.2.1-notes.rst
+++ b/docs/users_guide/8.2.1-notes.rst
@@ -71,6 +71,10 @@ Runtime system
event log, allowing heap profiles to be correlated with other tracing events
(see :ghc-ticket:`11094`).
+- Added processor group support for Windows. This allows the runtime to allocate
+ threads to all cores in systems which have multiple processor groups.
+ (e.g. > 64 cores, see :ghc-ticket:`11054`)
+
Build system
~~~~~~~~~~~~
diff --git a/includes/rts/OSThreads.h b/includes/rts/OSThreads.h
index bc84b714db..2eb58971ca 100644
--- a/includes/rts/OSThreads.h
+++ b/includes/rts/OSThreads.h
@@ -221,6 +221,11 @@ void releaseThreadNode (void);
int forkOS_createThread ( HsStablePtr entry );
//
+// Free any global resources created in OSThreads.
+//
+void freeThreadingResources(void);
+
+//
// Returns the number of processor cores in the machine
//
uint32_t getNumberOfProcessors (void);
diff --git a/rts/RtsStartup.c b/rts/RtsStartup.c
index a2630b2f8f..7c2be0d7ef 100644
--- a/rts/RtsStartup.c
+++ b/rts/RtsStartup.c
@@ -434,6 +434,11 @@ hs_exit_(rtsBool wait_foreign)
// Free the various argvs
freeRtsArgs();
+
+#ifndef CMINUSMINUS
+ // Free threading resources
+ freeThreadingResources();
+#endif
}
// Flush stdout and stderr. We do this during shutdown so that it
diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c
index 112a311f79..8c7c8f0e24 100644
--- a/rts/posix/OSThreads.c
+++ b/rts/posix/OSThreads.c
@@ -231,6 +231,8 @@ forkOS_createThread ( HsStablePtr entry )
return result;
}
+void freeThreadingResources (void) { /* nothing */ }
+
uint32_t
getNumberOfProcessors (void)
{
@@ -334,6 +336,7 @@ void releaseThreadNode (void)
stg_exit(1);
}
}
+
#else
void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ }
void releaseThreadNode (void) { /* nothing */ }
@@ -353,6 +356,8 @@ forkOS_createThread ( HsStablePtr entry STG_UNUSED )
return -1;
}
+void freeThreadingResources (void) { /* nothing */ }
+
uint32_t getNumberOfProcessors (void)
{
return 1;
diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c
index 78fe29784b..c9b594a208 100644
--- a/rts/win32/OSThreads.c
+++ b/rts/win32/OSThreads.c
@@ -18,6 +18,14 @@
#undef __STRICT_ANSI__
#include <process.h>
+
+/* Processor group info cache. */
+static uint8_t* cpuGroupCache = NULL;
+/* Processor group cumulative summary cache. */
+static uint32_t* cpuGroupCumulativeCache = NULL;
+/* Processor group dist cache. */
+static uint8_t* cpuGroupDistCache = NULL;
+
/* Win32 threads and synchronisation objects */
/* A Condition is represented by a Win32 Event object;
@@ -242,11 +250,222 @@ forkOS_createThread ( HsStablePtr entry )
(unsigned*)&pId) == 0);
}
+#if x86_64_HOST_ARCH
+/* We still support Windows Vista, so we can't depend on these
+   newer APIs being available and must resolve them manually. */
+typedef DWORD(WINAPI *GetItemCountProc)(WORD);
+typedef DWORD(WINAPI *GetGroupCountProc)(void);
+typedef BOOL(WINAPI *SetThreadGroupAffinityProc)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY);
+#ifndef ALL_PROCESSOR_GROUPS
+#define ALL_PROCESSOR_GROUPS 0xffff
+#endif
+#endif
+
+void freeThreadingResources (void)
+{
+ if (cpuGroupCache)
+ {
+ free(cpuGroupCache);
+ cpuGroupCache = NULL;
+ }
+
+ if (cpuGroupCumulativeCache)
+ {
+ free(cpuGroupCumulativeCache);
+ cpuGroupCumulativeCache = NULL;
+ }
+
+ if (cpuGroupDistCache)
+ {
+ free(cpuGroupDistCache);
+ cpuGroupDistCache = NULL;
+ }
+}
+
+/* Processor groups are not guaranteed to be uniformly distributed
+ nor guaranteed to be filled before a next group is needed.
+ The OS will assign processors to groups based on physical proximity
+ and will never partially assign cores from one physical cpu to more
+ than one group. If one has two 48 core CPUs then you'd end up with
+ two groups of 48 logical cpus. Now add a 3rd CPU with 10 cores and
+ the group it is assigned to depends where the socket is on the board.
+
+ So we need to make a map of where the CPUs reside and how the groups are filled.
+ Since groups are created at boot time by the kernel, we can cache this information.
+
+ NOTE: This code does not support hot-swapping cores as it's caching the information.
+ If you activate a new core you have to restart the program. This builds a
+ simple lookup array for cpu -> group indexes. This gives O(1) lookup against
+ O(n) space. But n is < 256 so we'll only use 256 bytes of extra memory. */
+
+static uint8_t
+getNumberOfProcessorsGroups (void)
+{
+ /* Group count cache. */
+ static uint8_t n_groups = 0;
+
+
+#if x86_64_HOST_ARCH
+ if (!n_groups)
+ {
+ /* We still support Windows Vista. Which means we can't rely
+ on the API being available. So we'll have to resolve manually. */
+ HMODULE kernel = GetModuleHandleW(L"kernel32");
+
+ GetGroupCountProc GetActiveProcessorGroupCount = (GetGroupCountProc)GetProcAddress(kernel, "GetActiveProcessorGroupCount");
+ n_groups = GetActiveProcessorGroupCount();
+
+ IF_DEBUG(scheduler, debugBelch("[*] Number of processor groups detected: %u\n", n_groups));
+ }
+#endif
+
+ if (!n_groups)
+ {
+ n_groups = 1;
+ }
+
+ return n_groups;
+}
+
+static uint8_t*
+getProcessorsDistribution (void)
+{
+ if (cpuGroupDistCache)
+ {
+ return cpuGroupDistCache;
+ }
+
+ if (!cpuGroupDistCache)
+ {
+ uint8_t n_groups = getNumberOfProcessorsGroups();
+ cpuGroupDistCache = malloc(n_groups * sizeof(uint8_t));
+ memset(cpuGroupDistCache, MAXIMUM_PROCESSORS, n_groups * sizeof(uint8_t));
+
+#if x86_64_HOST_ARCH
+ /* We still support Windows Vista. Which means we can't rely
+ on the API being available. So we'll have to resolve manually. */
+ HMODULE kernel = GetModuleHandleW(L"kernel32");
+
+ GetItemCountProc GetActiveProcessorCount = (GetItemCountProc)GetProcAddress(kernel, "GetActiveProcessorCount");
+
+ if (GetActiveProcessorCount)
+ {
+ for (int i = 0; i < n_groups; i++)
+ {
+ cpuGroupDistCache[i] = GetActiveProcessorCount(i);
+ IF_DEBUG(scheduler, debugBelch("[*] Number of active processors in group %u detected: %u\n", i, cpuGroupDistCache[i]));
+ }
+ }
+#endif
+ }
+
+ return cpuGroupDistCache;
+}
+
+static uint32_t*
+getProcessorsCumulativeSum(void)
+{
+ if (cpuGroupCumulativeCache)
+ {
+ return cpuGroupCumulativeCache;
+ }
+
+ if (!cpuGroupCumulativeCache)
+ {
+ uint8_t n_groups = getNumberOfProcessorsGroups();
+ cpuGroupCumulativeCache = malloc(n_groups * sizeof(uint32_t));
+ memset(cpuGroupCumulativeCache, 0, n_groups * sizeof(uint32_t));
+ uint8_t* proc_dist = getProcessorsDistribution();
+ uint32_t cum_num_proc = 0;
+
+#if x86_64_HOST_ARCH
+ for (int i = 0; i < n_groups; i++)
+ {
+ cpuGroupCumulativeCache[i] = cum_num_proc;
+ cum_num_proc += proc_dist[i];
+ IF_DEBUG(scheduler, debugBelch("[*] Cumulative active processors for group %u: %u\n", i, cpuGroupCumulativeCache[i]));
+ }
+#endif
+ }
+
+ return cpuGroupCumulativeCache;
+}
+
+/*
+ Because processors can be distributed rather unpredictably inside
+ processor groups, we need to keep track of which processors are in
+ which group to be able to determine which mask to set and which bit
+ in the mask to set.
+
+  This involves the typical trade-off between speed and
+  memory usage. In this case we prioritize speed.
+
+  This function will generate an array where each index is a processor
+  and the value of the array is the group it belongs to. This allows us
+  to determine in constant time which group a processor is in.
+ */
+static uint8_t*
+createProcessorGroupMap (void)
+{
+ if (cpuGroupCache)
+ {
+ return cpuGroupCache;
+ }
+
+ uint32_t numProcs = getNumberOfProcessors();
+
+ cpuGroupCache = malloc(numProcs * sizeof(uint8_t));
+ /* For 32bit Windows and 64bit older than Windows 7, create a default mapping. */
+ memset(cpuGroupCache, 0, numProcs * sizeof(uint8_t));
+
+#if x86_64_HOST_ARCH
+ uint8_t* proc_dist = getProcessorsDistribution();
+
+ int totalProcs = 0;
+ uint8_t nGroups = getNumberOfProcessorsGroups();
+ int group;
+ for (group = 0; group < nGroups; group++)
+ {
+ uint8_t nProc = proc_dist[group];
+ memset(cpuGroupCache + totalProcs, group, nProc * sizeof(uint8_t));
+ totalProcs += nProc;
+ }
+
+ IF_DEBUG(scheduler, debugBelch("[*] Processor group map created\n"));
+#endif
+
+ return cpuGroupCache;
+}
+
uint32_t
getNumberOfProcessors (void)
{
static uint32_t nproc = 0;
+#if x86_64_HOST_ARCH
+ /* We still support Windows Vista. Which means we can't rely
+ on the API being available. So we'll have to resolve manually. */
+ HMODULE kernel = GetModuleHandleW(L"kernel32");
+
+ GetItemCountProc GetActiveProcessorCount = (GetItemCountProc)GetProcAddress(kernel, "GetActiveProcessorCount");
+ if (GetActiveProcessorCount && !nproc)
+ {
+ nproc = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS);
+
+ if (nproc)
+ {
+ IF_DEBUG(scheduler, debugBelch("[*] Total number of active processors detected: %u\n", nproc));
+ return nproc;
+ }
+
+ IF_DEBUG(scheduler, debugBelch("Could not determine Max number of logical processors.\n"
+ "Falling back to old code which limits to 64 logical processors.\n"));
+ }
+#endif
+
+    /* This will return the maximum number of processors
+       within one processor group. It's also slower
+       so use it only when needed. */
if (nproc == 0) {
SYSTEM_INFO si;
GetSystemInfo(&si);
@@ -259,24 +478,77 @@ getNumberOfProcessors (void)
void
setThreadAffinity (uint32_t n, uint32_t m) // cap N of M
{
+ ASSERT(n <= m);
+
HANDLE hThread;
- DWORD_PTR mask, r; // 64-bit win is required to handle more than 32 procs
- uint32_t nproc, i;
+ DWORD_PTR *mask, r; // 64-bit win is required to handle more than 32 procs
+ // and Windows 7+ required for more than 64 procs
+ uint32_t n_proc, i, ix;
+ uint8_t* proc_map = createProcessorGroupMap();
+ uint32_t n_groups = getNumberOfProcessorsGroups();
+ uint32_t* proc_cum = getProcessorsCumulativeSum();
+ n_proc = getNumberOfProcessors();
+ hThread = GetCurrentThread();
+
+ ASSERT(proc_map );
+ ASSERT(proc_cum );
+ ASSERT(hThread );
+ ASSERT(n_groups > 0);
+ ASSERT(n_proc > 0);
+
+ mask = malloc(n_groups * sizeof(DWORD_PTR));
+ memset(mask, 0, n_groups * sizeof(DWORD_PTR));
+
+    /* The masks for the individual groups are all 0 based,
+       so we need a different mask for every group. */
+ int group;
+ for (i = n; i < n_proc; i += m)
+ {
+ group = proc_map[i];
+ ix = i - proc_cum[group];
+ mask[group] |= 1 << ix;
+ }
- hThread = GetCurrentThread();
+#if x86_64_HOST_ARCH
+ /* We still support Windows Vista. Which means we can't rely
+ on the API being available. So we'll have to resolve manually. */
+ HMODULE kernel = GetModuleHandleW(L"kernel32");
- nproc = getNumberOfProcessors();
+ SetThreadGroupAffinityProc SetThreadGroupAffinity = (SetThreadGroupAffinityProc)GetProcAddress(kernel, "SetThreadGroupAffinity");
+#endif
- mask = 0;
- for (i = n; i < nproc; i+=m) {
- mask |= 1 << i;
+ for (i = 0; i < n_groups; i++)
+ {
+#if x86_64_HOST_ARCH
+ // If we support the new API, use it.
+ if (mask[i] > 0 && SetThreadGroupAffinity)
+ {
+ GROUP_AFFINITY hGroup;
+ ZeroMemory(&hGroup, sizeof(hGroup));
+ hGroup.Mask = mask[i];
+ hGroup.Group = i;
+
+ if (!SetThreadGroupAffinity(hThread, &hGroup, NULL))
+ {
+ sysErrorBelch("SetThreadGroupAffinity");
+ }
+
+ continue;
+ }
+#endif
+ // Fall-back methods. Only do it if there's a mask to set
+ if (mask[i] > 0)
+ {
+ r = SetThreadAffinityMask(hThread, mask[i]);
+ if (r == 0) {
+ free(mask);
+ sysErrorBelch("SetThreadAffinity");
+ stg_exit(EXIT_FAILURE);
+ }
+ }
}
- r = SetThreadAffinityMask(hThread, mask);
- if (r == 0) {
- sysErrorBelch("SetThreadAffinity");
- stg_exit(EXIT_FAILURE);
- }
+ free(mask);
}
typedef BOOL (WINAPI *PCSIO)(HANDLE);
@@ -311,6 +583,8 @@ forkOS_createThread ( HsStablePtr entry STG_UNUSED )
return -1;
}
+void freeThreadingResources (void) { /* nothing */ }
+
uint32_t getNumberOfProcessors (void)
{
return 1;