diff options
author | gregrodgers <Gregory.Rodgers@amd.com> | 2023-04-19 16:14:40 -0500 |
---|---|---|
committer | JP Lehr <JanPatrick.Lehr@amd.com> | 2023-05-04 06:01:14 -0400 |
commit | f238a98e844752b955dcf3d7b95b9c76c75a0017 (patch) | |
tree | 1be2db77947855a21aa4bd05c387175cadf6335d /openmp | |
parent | f3dcd3ad992c82be4f652fd2aac6b0ef414566a2 (diff) | |
download | llvm-f238a98e844752b955dcf3d7b95b9c76c75a0017.tar.gz |
[OpenMP][libomptarget][AMDGPU] Enable active HSA wait state
Adds HSA timeout hint of 2 seconds to the AMDGPU nextgen-plugin to improve
performance of small kernels.
The HSA runtime may stay in HSA_WAIT_STATE_ACTIVE for up to the timeout
value before switching to HSA_WAIT_STATE_BLOCKED. This can reduce
latency, which small kernels benefit from.
The value was determined via experimentation with different benchmarks.
The timeout value can be overridden using the environment variable
LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT with a value in microseconds.
Original author: Greg Rodgers <Gregory.Rodgers@amd.com>
Contributions from: JP Lehr <JanPatrick.Lehr@amd.com>
Differential Revision: https://reviews.llvm.org/D148808
Diffstat (limited to 'openmp')
-rw-r--r-- | openmp/docs/design/Runtimes.rst | 9 | ||||
-rw-r--r-- | openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp | 33 |
2 files changed, 36 insertions, 6 deletions
diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index 98f47bc1c632..1402192581d3 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1160,6 +1160,7 @@ There are several environment variables to change the behavior of the plugins: * ``LIBOMPTARGET_AMDGPU_TEAMS_PER_CU`` * ``LIBOMPTARGET_AMDGPU_MAX_ASYNC_COPY_BYTES`` * ``LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS`` +* ``LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT`` The environment variables ``LIBOMPTARGET_SHARED_MEMORY_SIZE``, ``LIBOMPTARGET_STACK_SIZE`` and ``LIBOMPTARGET_HEAP_SIZE`` are described in @@ -1238,6 +1239,14 @@ managing several pre-created signals. These signals are mainly used by AMDGPU streams. More HSA signals will be created dynamically throughout the execution if needed. The default value is ``64``. +LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT +""""""""""""""""""""""""""""""""""" + +This environment variable controls the timeout hint in microseconds for the +HSA wait state within the AMDGPU plugin. For the duration of this value +the HSA runtime may busy wait. This can reduce overall latency. +The default value is ``2000000``. + .. _remote_offloading_plugin: Remote Offloading Plugin: diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp index f9b0371f903a..0d2d8fae149e 100644 --- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp @@ -511,8 +511,14 @@ struct AMDGPUSignalTy { } /// Wait until the signal gets a zero value. - Error wait() const { - // TODO: Is it better to use busy waiting or blocking the thread? 
+ Error wait(const uint64_t ActiveTimeout = 0) const { + if (ActiveTimeout) { + hsa_signal_value_t Got = 1; + Got = hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, + ActiveTimeout, HSA_WAIT_STATE_ACTIVE); + if (Got == 0) + return Plugin::success(); + } while (hsa_signal_wait_scacquire(Signal, HSA_SIGNAL_CONDITION_EQ, 0, UINT64_MAX, HSA_WAIT_STATE_BLOCKED) != 0) ; @@ -884,6 +890,9 @@ private: /// Mutex to protect stream's management. mutable std::mutex Mutex; + /// Timeout hint for HSA actively waiting for signal value to change + const uint64_t StreamBusyWaitMicroseconds; + /// Return the current number of asychronous operations on the stream. uint32_t size() const { return NextSlot; } @@ -1247,7 +1256,7 @@ public: return Plugin::success(); // Wait until all previous operations on the stream have completed. - if (auto Err = Slots[last()].Signal->wait()) + if (auto Err = Slots[last()].Signal->wait(StreamBusyWaitMicroseconds)) return Err; // Reset the stream and perform all pending post actions. 
@@ -1555,6 +1564,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { 1 * 1024 * 1024), // 1MB OMPX_InitialNumSignals("LIBOMPTARGET_AMDGPU_NUM_INITIAL_HSA_SIGNALS", 64), + OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000), AMDGPUStreamManager(*this), AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice), Queues() {} @@ -1679,6 +1689,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { return Plugin::success(); } + const uint64_t getStreamBusyWaitMicroseconds() const { + return OMPX_StreamBusyWait; + } + Expected<std::unique_ptr<MemoryBuffer>> doJITPostProcessing(std::unique_ptr<MemoryBuffer> MB) const override { @@ -1941,7 +1955,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) return Err; - if (auto Err = Signal.wait()) + if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) return Err; if (auto Err = Signal.deinit()) @@ -1998,7 +2012,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { Plugin::check(Status, "Error in hsa_amd_memory_async_copy: %s")) return Err; - if (auto Err = Signal.wait()) + if (auto Err = Signal.wait(getStreamBusyWaitMicroseconds())) return Err; if (auto Err = Signal.deinit()) @@ -2173,6 +2187,12 @@ private: /// will be created. UInt32Envar OMPX_InitialNumSignals; + /// Environment variables to set the time to wait in active state before + /// switching to blocked state. The default 2000000 busywaits for 2 seconds + /// before going into a blocking HSA wait state. The unit for these variables + /// are microseconds. + UInt32Envar OMPX_StreamBusyWait; + /// Stream manager for AMDGPU streams. 
AMDGPUStreamManagerTy AMDGPUStreamManager; @@ -2267,7 +2287,8 @@ AMDGPUStreamTy::AMDGPUStreamTy(AMDGPUDeviceTy &Device) : Agent(Device.getAgent()), Queue(Device.getNextQueue()), SignalManager(Device.getSignalManager()), // Initialize the std::deque with some empty positions. - Slots(32), NextSlot(0), SyncCycle(0) {} + Slots(32), NextSlot(0), SyncCycle(0), + StreamBusyWaitMicroseconds(Device.getStreamBusyWaitMicroseconds()) {} /// Class implementing the AMDGPU-specific functionalities of the global /// handler. |