[libc] Enable multiple threads to use RPC on the GPU

The execution model of the GPU expects that groups of threads will execute in lock-step in SIMD fashion. It's important for both performance and correctness that we treat this as the smallest possible granularity for an RPC operation. Thus, we map multiple threads to a single larger buffer and ship that across the wire. This patch makes the necessary changes to support executing the RPC on the GPU with multiple threads. This requires some workarounds to mimic the model when handling the protocol from the CPU. I'm not completely happy with some of the workarounds required, but I think it should work. Uses some of the implementation details from D148191. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D148943
author: Joseph Huber <jhuber6@vols.utk.edu> 2023-05-04 14:53:28 -0500
committer: Joseph Huber <jhuber6@vols.utk.edu> 2023-05-04 19:31:41 -0500
commit: 507edb52f9a9a5c1ab2a92ec2e291a7b63c3fbff (patch)
tree: dcd9f8ef610af4a60ead26e721c5d3aead79777b /libc/utils
parent: fe9f557578a565ed01faf75cd07ea4d9b75feeb1 (diff)
download: llvm-507edb52f9a9a5c1ab2a92ec2e291a7b63c3fbff.tar.gz
4 files changed, 33 insertions, 14 deletions
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index 9c6413ee45d8..feaa8e0079bb 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -29,6 +29,11 @@ struct LaunchParameters {
 int load(int argc, char **argv, char **evnp, void *image, size_t size,
          const LaunchParameters &params);
 
+/// Return \p val rounded up to the nearest multiple of \p align.
+template <typename V, typename A> inline V align_up(V val, A align) {
+  return ((val + V(align) - 1) / V(align)) * V(align);
+}
+
 /// Copy the system's argument vector to GPU memory allocated using \p alloc.
 template <typename Allocator>
 void *copy_argument_vector(int argc, char **argv, Allocator alloc) {
diff --git a/libc/utils/gpu/loader/Server.h b/libc/utils/gpu/loader/Server.h
index cd043359b1ea..6ffb32955c89 100644
--- a/libc/utils/gpu/loader/Server.h
+++ b/libc/utils/gpu/loader/Server.h
@@ -30,15 +30,19 @@ void handle_server() {
 
   switch (port->get_opcode()) {
   case __llvm_libc::rpc::Opcode::PRINT_TO_STDERR: {
-    uint64_t str_size;
-    char *str = nullptr;
-    port->recv_n([&](uint64_t size) {
-      str_size = size;
-      str = new char[size];
-      return str;
+    uint64_t str_size[__llvm_libc::rpc::MAX_LANE_SIZE] = {0};
+    char *strs[__llvm_libc::rpc::MAX_LANE_SIZE] = {nullptr};
+    port->recv_n([&](uint64_t size, uint32_t id) {
+      str_size[id] = size;
+      strs[id] = new char[size];
+      return strs[id];
     });
-    fwrite(str, str_size, 1, stderr);
-    delete[] str;
+    for (uint64_t i = 0; i < __llvm_libc::rpc::MAX_LANE_SIZE; ++i) {
+      if (strs[i]) {
+        fwrite(strs[i], str_size[i], 1, stderr);
+        delete[] strs[i];
+      }
+    }
     break;
   }
   case __llvm_libc::rpc::Opcode::EXIT: {
@@ -54,8 +58,7 @@ void handle_server() {
     break;
   }
   default:
-    port->recv([](__llvm_libc::rpc::Buffer *) { /* no-op */ });
-    return;
+    port->recv([](__llvm_libc::rpc::Buffer *buffer) {});
   }
   port->close();
 }
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index af5f00878e65..f9a7b75ff11b 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -287,6 +287,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
   hsa_amd_memory_fill(dev_ret, 0, sizeof(int));
 
   // Allocate finegrained memory for the RPC server and client to share.
+  uint32_t wavefront_size = 0;
+  if (hsa_status_t err = hsa_agent_get_info(
+          dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
+    handle_error(err);
   void *server_inbox;
   void *server_outbox;
   void *buffer;
@@ -299,7 +303,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
           /*flags=*/0, &server_outbox))
     handle_error(err);
   if (hsa_status_t err = hsa_amd_memory_pool_allocate(
-          finegrained_pool, sizeof(__llvm_libc::rpc::Buffer),
+          finegrained_pool,
+          align_up(sizeof(__llvm_libc::rpc::Header) +
+                       (wavefront_size * sizeof(__llvm_libc::rpc::Buffer)),
+                   alignof(__llvm_libc::rpc::Packet)),
           /*flags=*/0, &buffer))
     handle_error(err);
   hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_inbox);
@@ -351,7 +358,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
 
   // Initialize the RPC server's buffer for host-device communication.
-  server.reset(&lock, server_inbox, server_outbox, buffer);
+  server.reset(wavefront_size, &lock, server_inbox, server_outbox, buffer);
 
   // Initialize the packet header and set the doorbell signal to begin execution
   // by the HSA runtime.
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index baf8baaff7cd..77e6967dd022 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -232,9 +232,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
   if (CUresult err = cuMemsetD32(dev_ret, 0, 1))
     handle_error(err);
 
+  uint32_t warp_size = 32;
   void *server_inbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
   void *server_outbox = allocator(sizeof(__llvm_libc::cpp::Atomic<int>));
-  void *buffer = allocator(sizeof(__llvm_libc::rpc::Buffer));
+  void *buffer =
+      allocator(align_up(sizeof(__llvm_libc::rpc::Header) +
+                             (warp_size * sizeof(__llvm_libc::rpc::Buffer)),
+                         alignof(__llvm_libc::rpc::Packet)));
   if (!server_inbox || !server_outbox || !buffer)
     handle_error("Failed to allocate memory the RPC client / server.");
 
@@ -254,7 +258,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
                          CU_LAUNCH_PARAM_END};
 
   // Initialize the RPC server's buffer for host-device communication.
-  server.reset(&lock, server_inbox, server_outbox, buffer);
+  server.reset(warp_size, &lock, server_inbox, server_outbox, buffer);
 
   // Call the kernel with the given arguments.
   if (CUresult err = cuLaunchKernel(
author	Joseph Huber <jhuber6@vols.utk.edu>	2023-05-04 14:53:28 -0500
committer	Joseph Huber <jhuber6@vols.utk.edu>	2023-05-04 19:31:41 -0500
commit	507edb52f9a9a5c1ab2a92ec2e291a7b63c3fbff (patch)
tree	dcd9f8ef610af4a60ead26e721c5d3aead79777b /libc/utils
parent	fe9f557578a565ed01faf75cd07ea4d9b75feeb1 (diff)
download	llvm-507edb52f9a9a5c1ab2a92ec2e291a7b63c3fbff.tar.gz