[libc] Add more utility functions for the GPU

This patch adds extra intrinsics for the GPU. Some of these are unused for now but will be used later. We use these currently to update the `RPC` handling. Currently, every thread can update the RPC client, which isn't correct. This patch adds code necessary to allow a single thread to perform the write while the others wait. Feedback is welcome for the naming of these functions. I'm copying the OpenMP nomenclature where we call an AMD `wavefront` or NVIDIA `warp` a `lane`. Reviewed By: tra Differential Revision: https://reviews.llvm.org/D148810
author: Joseph Huber <jhuber6@vols.utk.edu> 2023-04-20 11:16:01 -0500
committer: Joseph Huber <jhuber6@vols.utk.edu> 2023-04-24 15:47:53 -0500
commit: 50445dff43037014a23eb38b1f50bb698e64ffcf (patch)
tree: b44aeeb6b16e717993efdb2a6170ede2030fd906 /libc/startup
parent: 5084ba395e487adee67ba38cc5c68ff7e052e37c (diff)
download: llvm-50445dff43037014a23eb38b1f50bb698e64ffcf.tar.gz
4 files changed, 53 insertions, 7 deletions
diff --git a/libc/startup/gpu/amdgpu/CMakeLists.txt b/libc/startup/gpu/amdgpu/CMakeLists.txt
index 891d20993b08..a9f33af6d79e 100644
--- a/libc/startup/gpu/amdgpu/CMakeLists.txt
+++ b/libc/startup/gpu/amdgpu/CMakeLists.txt
@@ -4,6 +4,7 @@ add_startup_object(
     start.cpp
   DEPENDS
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.GPU.utils
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 66f06b086a23..e8b5029f2a76 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -6,16 +6,38 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 
-static __llvm_libc::cpp::Atomic<uint32_t> lock;
-
 extern "C" int main(int argc, char **argv, char **envp);
 
+namespace __llvm_libc {
+
+static cpp::Atomic<uint32_t> lock = 0;
+
+static cpp::Atomic<uint32_t> init = 0;
+
+void init_rpc(void *in, void *out, void *buffer) {
+  // Only a single thread should update the RPC data.
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    rpc::client.reset(&lock, in, out, buffer);
+    init.store(1, cpp::MemoryOrder::RELEASE);
+  }
+
+  // Wait until the initializing thread publishes the RPC data (acquire load).
+  while (!init.load(cpp::MemoryOrder::ACQUIRE))
+    rpc::sleep_briefly();
+
+  // Wait for the threads in the block to converge and fence the write.
+  gpu::sync_threads();
+}
+
+} // namespace __llvm_libc
+
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::rpc::client.reset(&lock, in, out, buffer);
+  __llvm_libc::init_rpc(in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
index 49661691ecb5..b8a9f49d5be5 100644
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -5,6 +5,7 @@ add_startup_object(
     start.cpp
   DEPENDS
     libc.src.__support.RPC.rpc_client
+    libc.src.__support.GPU.utils
   COMPILE_OPTIONS
     -ffreestanding # To avoid compiler warnings about calling the main function.
     -fno-builtin
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 9939c6e21330..7b88e30f7f37 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -6,16 +6,38 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "src/__support/GPU/utils.h"
 #include "src/__support/RPC/rpc_client.h"
 
-static __llvm_libc::cpp::Atomic<uint32_t> lock;
-
 extern "C" int main(int argc, char **argv, char **envp);
 
-extern "C" [[gnu::visibility("protected")]] __attribute__((nvptx_kernel)) void
+namespace __llvm_libc {
+
+static cpp::Atomic<uint32_t> lock = 0;
+
+static cpp::Atomic<uint32_t> init = 0;
+
+void init_rpc(void *in, void *out, void *buffer) {
+  // Only a single thread should update the RPC data.
+  if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+    rpc::client.reset(&lock, in, out, buffer);
+    init.store(1, cpp::MemoryOrder::RELEASE);
+  }
+
+  // Wait until the initializing thread publishes the RPC data (acquire load).
+  while (!init.load(cpp::MemoryOrder::ACQUIRE))
+    rpc::sleep_briefly();
+
+  // Wait for the threads in the block to converge and fence the write.
+  gpu::sync_threads();
+}
+
+} // namespace __llvm_libc
+
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
 _start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
        void *buffer) {
-  __llvm_libc::rpc::client.reset(&lock, in, out, buffer);
+  __llvm_libc::init_rpc(in, out, buffer);
 
   __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
 }
author	Joseph Huber <jhuber6@vols.utk.edu>	2023-04-20 11:16:01 -0500
committer	Joseph Huber <jhuber6@vols.utk.edu>	2023-04-24 15:47:53 -0500
commit	50445dff43037014a23eb38b1f50bb698e64ffcf (patch)
tree	b44aeeb6b16e717993efdb2a6170ede2030fd906 /libc/startup
parent	5084ba395e487adee67ba38cc5c68ff7e052e37c (diff)
download	llvm-50445dff43037014a23eb38b1f50bb698e64ffcf.tar.gz