diff options
-rw-r--r-- | docs/users_guide/8.2.1-notes.rst | 4 | ||||
-rw-r--r-- | includes/rts/OSThreads.h | 5 | ||||
-rw-r--r-- | rts/RtsStartup.c | 5 | ||||
-rw-r--r-- | rts/posix/OSThreads.c | 5 | ||||
-rw-r--r-- | rts/win32/OSThreads.c | 298 |
5 files changed, 305 insertions, 12 deletions
diff --git a/docs/users_guide/8.2.1-notes.rst b/docs/users_guide/8.2.1-notes.rst index 3e13f57c57..033f8da483 100644 --- a/docs/users_guide/8.2.1-notes.rst +++ b/docs/users_guide/8.2.1-notes.rst @@ -71,6 +71,10 @@ Runtime system event log, allowing heap profiles to be correlated with other tracing events (see :ghc-ticket:`11094`). +- Added processor group support for Windows. This allows the runtime to allocate + threads to all cores in systems which have multiple processor groups. + (e.g. > 64 cores, see :ghc-ticket:`11054`) + Build system ~~~~~~~~~~~~ diff --git a/includes/rts/OSThreads.h b/includes/rts/OSThreads.h index bc84b714db..2eb58971ca 100644 --- a/includes/rts/OSThreads.h +++ b/includes/rts/OSThreads.h @@ -221,6 +221,11 @@ void releaseThreadNode (void); int forkOS_createThread ( HsStablePtr entry ); // +// Free any global resources created in OSThreads. +// +void freeThreadingResources(void); + +// // Returns the number of processor cores in the machine // uint32_t getNumberOfProcessors (void); diff --git a/rts/RtsStartup.c b/rts/RtsStartup.c index a2630b2f8f..7c2be0d7ef 100644 --- a/rts/RtsStartup.c +++ b/rts/RtsStartup.c @@ -434,6 +434,11 @@ hs_exit_(rtsBool wait_foreign) // Free the various argvs freeRtsArgs(); + +#ifndef CMINUSMINUS + // Free threading resources + freeThreadingResources(); +#endif } // Flush stdout and stderr. We do this during shutdown so that it diff --git a/rts/posix/OSThreads.c b/rts/posix/OSThreads.c index 112a311f79..8c7c8f0e24 100644 --- a/rts/posix/OSThreads.c +++ b/rts/posix/OSThreads.c @@ -231,6 +231,8 @@ forkOS_createThread ( HsStablePtr entry ) return result; } +void freeThreadingResources (void) { /* nothing */ } + uint32_t getNumberOfProcessors (void) { @@ -334,6 +336,7 @@ void releaseThreadNode (void) stg_exit(1); } } + #else void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ } void releaseThreadNode (void) { /* nothing */ } @@ -353,6 +356,8 @@ forkOS_createThread ( HsStablePtr entry STG_UNUSED ) return -1; } +void freeThreadingResources (void) { /* nothing */ } + uint32_t getNumberOfProcessors (void) { return 1; diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c index 78fe29784b..c9b594a208 100644 --- a/rts/win32/OSThreads.c +++ b/rts/win32/OSThreads.c @@ -18,6 +18,14 @@ #undef __STRICT_ANSI__ #include <process.h> + +/* Processor group info cache. */ +static uint8_t* cpuGroupCache = NULL; +/* Processor group cumulative summary cache. */ +static uint32_t* cpuGroupCumulativeCache = NULL; +/* Processor group dist cache. */ +static uint8_t* cpuGroupDistCache = NULL; + /* Win32 threads and synchronisation objects */ /* A Condition is represented by a Win32 Event object; @@ -242,11 +250,222 @@ forkOS_createThread ( HsStablePtr entry ) (unsigned*)&pId) == 0); } +#if x86_64_HOST_ARCH +/* We still support Windows Vista, so we can't depend on it + and must manually resolve these. */ +typedef DWORD(WINAPI *GetItemCountProc)(WORD); +typedef DWORD(WINAPI *GetGroupCountProc)(void); +typedef BOOL(WINAPI *SetThreadGroupAffinityProc)(HANDLE, const GROUP_AFFINITY*, PGROUP_AFFINITY); +#ifndef ALL_PROCESSOR_GROUPS +#define ALL_PROCESSOR_GROUPS 0xffff +#endif +#endif + +void freeThreadingResources (void) +{ + if (cpuGroupCache) + { + free(cpuGroupCache); + cpuGroupCache = NULL; + } + + if (cpuGroupCumulativeCache) + { + free(cpuGroupCumulativeCache); + cpuGroupCumulativeCache = NULL; + } + + if (cpuGroupDistCache) + { + free(cpuGroupDistCache); + cpuGroupDistCache = NULL; + } +} + +/* Processor groups are not guaranteed to be uniformly distributed + nor guaranteed to be filled before a next group is needed. + The OS will assign processors to groups based on physical proximity + and will never partially assign cores from one physical cpu to more + than one group. If one has two 48 core CPUs then you'd end up with + two groups of 48 logical cpus. Now add a 3rd CPU with 10 cores and + the group it is assigned to depends where the socket is on the board. + + So we need to make a map of where the CPUs reside and how the groups are filled. + Since groups are created at boot time by the kernel, we can cache this information. + + NOTE: This code does not support hot-swapping cores as it's caching the information. + If you activate a new core you have to restart the program. This builds a + simple lookup array for cpu -> group indexes. This gives O(1) lookup against + O(n) space. But n is < 256 so we'll only use 256 bytes of extra memory. */ + +static uint8_t +getNumberOfProcessorsGroups (void) +{ + /* Group count cache. */ + static uint8_t n_groups = 0; + + +#if x86_64_HOST_ARCH + if (!n_groups) + { + /* We still support Windows Vista. Which means we can't rely + on the API being available. So we'll have to resolve manually. */ + HMODULE kernel = GetModuleHandleW(L"kernel32"); + + GetGroupCountProc GetActiveProcessorGroupCount = (GetGroupCountProc)GetProcAddress(kernel, "GetActiveProcessorGroupCount"); + n_groups = GetActiveProcessorGroupCount(); + + IF_DEBUG(scheduler, debugBelch("[*] Number of processor groups detected: %u\n", n_groups)); + } +#endif + + if (!n_groups) + { + n_groups = 1; + } + + return n_groups; +} + +static uint8_t* +getProcessorsDistribution (void) +{ + if (cpuGroupDistCache) + { + return cpuGroupDistCache; + } + + if (!cpuGroupDistCache) + { + uint8_t n_groups = getNumberOfProcessorsGroups(); + cpuGroupDistCache = malloc(n_groups * sizeof(uint8_t)); + memset(cpuGroupDistCache, MAXIMUM_PROCESSORS, n_groups * sizeof(uint8_t)); + +#if x86_64_HOST_ARCH + /* We still support Windows Vista. Which means we can't rely + on the API being available. So we'll have to resolve manually. */ + HMODULE kernel = GetModuleHandleW(L"kernel32"); + + GetItemCountProc GetActiveProcessorCount = (GetItemCountProc)GetProcAddress(kernel, "GetActiveProcessorCount"); + + if (GetActiveProcessorCount) + { + for (int i = 0; i < n_groups; i++) + { + cpuGroupDistCache[i] = GetActiveProcessorCount(i); + IF_DEBUG(scheduler, debugBelch("[*] Number of active processors in group %u detected: %u\n", i, cpuGroupDistCache[i])); + } + } +#endif + } + + return cpuGroupDistCache; +} + +static uint32_t* +getProcessorsCumulativeSum(void) +{ + if (cpuGroupCumulativeCache) + { + return cpuGroupCumulativeCache; + } + + if (!cpuGroupCumulativeCache) + { + uint8_t n_groups = getNumberOfProcessorsGroups(); + cpuGroupCumulativeCache = malloc(n_groups * sizeof(uint32_t)); + memset(cpuGroupCumulativeCache, 0, n_groups * sizeof(uint32_t)); + uint8_t* proc_dist = getProcessorsDistribution(); + uint32_t cum_num_proc = 0; + +#if x86_64_HOST_ARCH + for (int i = 0; i < n_groups; i++) + { + cpuGroupCumulativeCache[i] = cum_num_proc; + cum_num_proc += proc_dist[i]; + IF_DEBUG(scheduler, debugBelch("[*] Cumulative active processors for group %u: %u\n", i, cpuGroupCumulativeCache[i])); + } +#endif + } + + return cpuGroupCumulativeCache; +} + +/* + Because processors can be distributed rather unpredictably inside + processor groups, we need to keep track of which processors are in + which group to be able to determine which mask to set and which bit + in the mask to set. + + This can either be done by the typical trade-off: speed or + memory usage. In this case I prioritize speed. + + This function will generate an array where each index is a processor + and the value of the array the group it belongs to. This allows us to + in constant time determine where a processor is. + */ +static uint8_t* +createProcessorGroupMap (void) +{ + if (cpuGroupCache) + { + return cpuGroupCache; + } + + uint32_t numProcs = getNumberOfProcessors(); + + cpuGroupCache = malloc(numProcs * sizeof(uint8_t)); + /* For 32bit Windows and 64bit older than Windows 7, create a default mapping. */ + memset(cpuGroupCache, 0, numProcs * sizeof(uint8_t)); + +#if x86_64_HOST_ARCH + uint8_t* proc_dist = getProcessorsDistribution(); + + int totalProcs = 0; + uint8_t nGroups = getNumberOfProcessorsGroups(); + int group; + for (group = 0; group < nGroups; group++) + { + uint8_t nProc = proc_dist[group]; + memset(cpuGroupCache + totalProcs, group, nProc * sizeof(uint8_t)); + totalProcs += nProc; + } + + IF_DEBUG(scheduler, debugBelch("[*] Processor group map created\n")); +#endif + + return cpuGroupCache; +} + uint32_t getNumberOfProcessors (void) { static uint32_t nproc = 0; +#if x86_64_HOST_ARCH + /* We still support Windows Vista. Which means we can't rely + on the API being available. So we'll have to resolve manually. */ + HMODULE kernel = GetModuleHandleW(L"kernel32"); + + GetItemCountProc GetActiveProcessorCount = (GetItemCountProc)GetProcAddress(kernel, "GetActiveProcessorCount"); + if (GetActiveProcessorCount && !nproc) + { + nproc = GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); + + if (nproc) + { + IF_DEBUG(scheduler, debugBelch("[*] Total number of active processors detected: %u\n", nproc)); + return nproc; + } + + IF_DEBUG(scheduler, debugBelch("Could not determine Max number of logical processors.\n" + "Falling back to old code which limits to 64 logical processors.\n")); + } +#endif + + /* This will return the maximum number of processes + within one processor group. It's also slower + so use it only when needed. */ if (nproc == 0) { SYSTEM_INFO si; GetSystemInfo(&si); @@ -259,24 +478,77 @@ getNumberOfProcessors (void) void setThreadAffinity (uint32_t n, uint32_t m) // cap N of M { + ASSERT(n <= m); + HANDLE hThread; - DWORD_PTR mask, r; // 64-bit win is required to handle more than 32 procs - uint32_t nproc, i; + DWORD_PTR *mask, r; // 64-bit win is required to handle more than 32 procs + // and Windows 7+ required for more than 64 procs + uint32_t n_proc, i, ix; + uint8_t* proc_map = createProcessorGroupMap(); + uint32_t n_groups = getNumberOfProcessorsGroups(); + uint32_t* proc_cum = getProcessorsCumulativeSum(); + n_proc = getNumberOfProcessors(); + hThread = GetCurrentThread(); + + ASSERT(proc_map ); + ASSERT(proc_cum ); + ASSERT(hThread ); + ASSERT(n_groups > 0); + ASSERT(n_proc > 0); + + mask = malloc(n_groups * sizeof(DWORD_PTR)); + memset(mask, 0, n_groups * sizeof(DWORD_PTR)); + + /* The mask for the individual groups are all 0 based + so we need different masks for every group. */ + int group; + for (i = n; i < n_proc; i += m) + { + group = proc_map[i]; + ix = i - proc_cum[group]; + mask[group] |= 1 << ix; + } - hThread = GetCurrentThread(); +#if x86_64_HOST_ARCH + /* We still support Windows Vista. Which means we can't rely + on the API being available. So we'll have to resolve manually. */ + HMODULE kernel = GetModuleHandleW(L"kernel32"); - nproc = getNumberOfProcessors(); + SetThreadGroupAffinityProc SetThreadGroupAffinity = (SetThreadGroupAffinityProc)GetProcAddress(kernel, "SetThreadGroupAffinity"); +#endif - mask = 0; - for (i = n; i < nproc; i+=m) { - mask |= 1 << i; + for (i = 0; i < n_groups; i++) + { +#if x86_64_HOST_ARCH + // If we support the new API, use it. + if (mask[i] > 0 && SetThreadGroupAffinity) + { + GROUP_AFFINITY hGroup; + ZeroMemory(&hGroup, sizeof(hGroup)); + hGroup.Mask = mask[i]; + hGroup.Group = i; + + if (!SetThreadGroupAffinity(hThread, &hGroup, NULL)) + { + sysErrorBelch("SetThreadGroupAffinity"); + } + + continue; + } +#endif + // Fall-back methods. Only do it if there's a mask to set + if (mask[i] > 0) + { + r = SetThreadAffinityMask(hThread, mask[i]); + if (r == 0) { + free(mask); + sysErrorBelch("SetThreadAffinity"); + stg_exit(EXIT_FAILURE); + } + } } - r = SetThreadAffinityMask(hThread, mask); - if (r == 0) { - sysErrorBelch("SetThreadAffinity"); - stg_exit(EXIT_FAILURE); - } + free(mask); } typedef BOOL (WINAPI *PCSIO)(HANDLE); @@ -311,6 +583,8 @@ forkOS_createThread ( HsStablePtr entry STG_UNUSED ) return -1; } +void freeThreadingResources (void) { /* nothing */ } + uint32_t getNumberOfProcessors (void) { return 1; |