diff options
author | Tamar Christina <tamar@zhox.com> | 2016-09-25 20:00:31 +0100 |
---|---|---|
committer | Tamar Christina <tamar@zhox.com> | 2016-10-01 23:20:04 +0100 |
commit | c93813d96b1da53a2ebd9c9ac5af6cc3e3443c43 (patch) | |
tree | a28cbbec64a5f1c70515a28b6ae3f2f4eaac87c9 /rts/win32 | |
parent | 1e795a008da8ab2ae88cca04aca01c50967b4397 (diff) | |
download | haskell-c93813d96b1da53a2ebd9c9ac5af6cc3e3443c43.tar.gz |
Add NUMA support for Windows
Summary:
NOTE: I have been able to do simple testing on emulated NUMA nodes.
Real hardware would be needed for a proper test.
D2199 added NUMA support for Linux; I have just filled in the missing pieces for Windows,
following the description of the Linux APIs.
Test Plan:
Use `bcdedit.exe /set groupsize 2` to modify the kernel again (similar to D2533).
This generates some NUMA nodes:
```
Logical Processor to NUMA Node Map:
NUMA Node 0:
**
--
NUMA Node 1:
--
**
Approximate Cross-NUMA Node Access Cost (relative to fastest):
00 01
00: 1.1 1.1
01: 1.0 1.0
```
Run `../test-numa.exe +RTS --numa -RTS`
and check PerfMon for NUMA allocations.
Reviewers: simonmar, erikd, bgamari, austin
Reviewed By: simonmar
Subscribers: thomie, #ghc_windows_task_force
Differential Revision: https://phabricator.haskell.org/D2534
GHC Trac Issues: #12602
Diffstat (limited to 'rts/win32')
-rw-r--r-- | rts/win32/OSMem.c | 81 | ||||
-rw-r--r-- | rts/win32/OSThreads.c | 45 |
2 files changed, 116 insertions, 10 deletions
diff --git a/rts/win32/OSMem.c b/rts/win32/OSMem.c index 3d9a304fda..b43636c198 100644 --- a/rts/win32/OSMem.c +++ b/rts/win32/OSMem.c @@ -11,9 +11,7 @@ #include "sm/HeapAlloc.h" #include "RtsUtils.h" -#if HAVE_WINDOWS_H #include <windows.h> -#endif typedef struct alloc_rec_ { char* base; // non-aligned base address, directly from VirtualAlloc @@ -39,11 +37,28 @@ static alloc_rec* allocs = NULL; /* free_blocks are kept in ascending order, and adjacent blocks are merged */ static block_rec* free_blocks = NULL; +/* Mingw-w64 does not currently have this in their header. So we have to import it.*/ +typedef LPVOID(WINAPI *VirtualAllocExNumaProc)(HANDLE, LPVOID, SIZE_T, DWORD, DWORD, DWORD); + +/* Cache NUMA API call. */ +VirtualAllocExNumaProc VirtualAllocExNuma; + void osMemInit(void) { allocs = NULL; free_blocks = NULL; + + /* Resolve and cache VirtualAllocExNuma. */ + if (osNumaAvailable() && RtsFlags.GcFlags.numa) + { + VirtualAllocExNuma = (VirtualAllocExNumaProc)GetProcAddress(GetModuleHandleW(L"kernel32"), "VirtualAllocExNuma"); + if (!VirtualAllocExNuma) + { + sysErrorBelch( + "osBindMBlocksToNode: VirtualAllocExNuma does not exist. How did you get this far?"); + } + } } static @@ -486,22 +501,72 @@ void osReleaseHeapMemory (void) rtsBool osNumaAvailable(void) { - return rtsFalse; + return osNumaNodes() > 1; } uint32_t osNumaNodes(void) { - return 1; + /* Cache the amount of NUMA values. */ + static ULONG numNumaNodes = 0; + + /* Cache the amount of NUMA nodes. 
*/ + if (!numNumaNodes && !GetNumaHighestNodeNumber(&numNumaNodes)) + { + numNumaNodes = 1; + } + + return numNumaNodes; } StgWord osNumaMask(void) { - return 1; + StgWord numaMask; + if (!GetNumaNodeProcessorMask(0, &numaMask)) + { + return 1; + } + return numaMask; } void osBindMBlocksToNode( - void *addr STG_UNUSED, - StgWord size STG_UNUSED, - uint32_t node STG_UNUSED) + void *addr, + StgWord size, + uint32_t node) { + if (osNumaAvailable()) + { + void* temp; + if (RtsFlags.GcFlags.numa) { + /* Note [base memory] + I would like to use addr here to specify the base + memory of allocation. The problem is that the address + we are requesting is too high. I can't figure out if it's + because of my NUMA-emulation or a bug in the code. + + On windows also -xb is broken, it does nothing so that can't + be used to tweak it (see #12577). So for now, just let the OS decide. + */ + temp = VirtualAllocExNuma( + GetCurrentProcess(), + NULL, // addr? See base memory + size, + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE, + node + ); + + if (!temp) { + if (GetLastError() == ERROR_NOT_ENOUGH_MEMORY) { + errorBelch("out of memory"); + } + else { + sysErrorBelch( + "osBindMBlocksToNode: VirtualAllocExNuma MEM_RESERVE %llu bytes " + "at address %p bytes failed", + size, addr); + } + stg_exit(EXIT_FAILURE); + } + } + } } diff --git a/rts/win32/OSThreads.c b/rts/win32/OSThreads.c index c9b594a208..b36c3e53da 100644 --- a/rts/win32/OSThreads.c +++ b/rts/win32/OSThreads.c @@ -9,6 +9,7 @@ #include "Rts.h" #include <windows.h> +#include "sm/OSMem.h" #if defined(THREADED_RTS) #include "RtsUtils.h" @@ -572,8 +573,48 @@ interruptOSThread (OSThreadId id) CloseHandle(hdl); } -void setThreadNode (uint32_t node STG_UNUSED) { /* nothing */ } -void releaseThreadNode (void) { /* nothing */ } +void setThreadNode (uint32_t node) +{ + if (osNumaAvailable()) + { + StgWord mask = 0; + mask |= 1 << node; + if (!SetThreadAffinityMask(GetCurrentThread(), mask)) + { + sysErrorBelch( + "setThreadNode: Error 
setting affinity of thread to NUMA node `%u': %lu.", + node, GetLastError()); + stg_exit(EXIT_FAILURE); + } + } +} + +void releaseThreadNode (void) +{ + if (osNumaAvailable()) + { + StgWord processMask; + StgWord systemMask; + if (!GetProcessAffinityMask(GetCurrentProcess(), + &processMask, + &systemMask)) + { + sysErrorBelch( + "releaseThreadNode: Error resetting affinity of thread: %lu", + GetLastError()); + stg_exit(EXIT_FAILURE); + } + + if (!SetThreadAffinityMask(GetCurrentThread(), processMask)) + { + sysErrorBelch( + "releaseThreadNode: Error reseting NUMA affinity mask of thread: %lu.", + GetLastError()); + stg_exit(EXIT_FAILURE); + } + + } +} #else /* !defined(THREADED_RTS) */ |