From bbeae142bfe2f2961816d51b45fb385821052b34 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield@gmail.com>
Date: Thu, 11 May 2023 03:04:55 +0100
Subject: [libc][rpc] Allocate a single block of shared memory instead of three

This allows moving the inbox/outbox pointer swap between server and client
into reset(). A single allocation also simplifies whatever allocates the
client/server state, currently the libc loaders.

Reviewed By: jhuber6

Differential Revision: https://reviews.llvm.org/D150337
---
 libc/src/__support/RPC/rpc.h            | 55 ++++++++++++++++++++++++++++++---
 libc/startup/gpu/amdgpu/start.cpp       |  6 ++--
 libc/startup/gpu/nvptx/start.cpp        |  6 ++--
 libc/utils/gpu/loader/Loader.h          |  4 +--
 libc/utils/gpu/loader/amdgpu/Loader.cpp | 42 +++++++------------------
 libc/utils/gpu/loader/nvptx/Loader.cpp  | 26 ++++++----------
 6 files changed, 78 insertions(+), 61 deletions(-)

(limited to 'libc')
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 1285f2b7cd50..2304b4d9b242 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -114,13 +114,30 @@ template <bool InvertInbox> struct Process {
   cpp::Atomic<uint32_t> lock[default_port_count] = {0};
 
   /// Initialize the communication channels.
-  LIBC_INLINE void reset(uint64_t port_count, uint32_t lane_size, void *inbox,
-                         void *outbox, void *packet) {
+  LIBC_INLINE void reset(uint64_t port_count, uint32_t lane_size, void *state) {
+    uint64_t p = memory_offset_primary_mailbox(port_count);
+    uint64_t s = memory_offset_secondary_mailbox(port_count);
     this->port_count = port_count;
     this->lane_size = lane_size;
-    this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(inbox);
-    this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(outbox);
-    this->packet = reinterpret_cast<Packet *>(packet);
+    this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
+        static_cast<char *>(state) + (InvertInbox ? s : p));
+    this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
+        static_cast<char *>(state) + (InvertInbox ? p : s));
+    this->packet = reinterpret_cast<Packet *>(static_cast<char *>(state) +
+                                              memory_offset_buffer(port_count));
+  }
+
+  /// Size in bytes of the single block of memory shared by client and server.
+  /// template<size_t N>, N is generally a runtime value
+  /// struct equivalent {
+  ///   atomic<uint32_t> primary[N];
+  ///   atomic<uint32_t> secondary[N];
+  ///   Packet buffer[N];
+  /// };
+  LIBC_INLINE static uint64_t allocation_size(uint64_t port_count,
+                                              uint32_t lane_size) {
+    return memory_offset_buffer(port_count) +
+           memory_allocated_buffer(port_count, lane_size);
   }
 
   /// The length of the packet is flexible because the server needs to look up
@@ -245,6 +262,34 @@ template <bool InvertInbox> struct Process {
           fn(&packet.payload.slot[i], i);
     }
   }
+
+  /// Number of bytes allocated for mailbox or buffer
+  LIBC_INLINE static uint64_t memory_allocated_mailbox(uint64_t port_count) {
+    return port_count * sizeof(cpp::Atomic<uint32_t>);
+  }
+
+  LIBC_INLINE static uint64_t memory_allocated_buffer(uint64_t port_count,
+                                                      uint32_t lane_size) {
+#if defined(LIBC_TARGET_ARCH_IS_GPU)
+    (void)lane_size;
+    return port_count * sizeof(Packet);
+#else
+    return port_count * (sizeof(Packet) + sizeof(Buffer) * lane_size);
+#endif
+  }
+
+  /// Offset of mailbox/buffer in single allocation
+  LIBC_INLINE static uint64_t
+  memory_offset_primary_mailbox(uint64_t /*port_count*/) {
+    return 0;
+  }
+  LIBC_INLINE static uint64_t
+  memory_offset_secondary_mailbox(uint64_t port_count) {
+    return memory_allocated_mailbox(port_count);
+  }
+  LIBC_INLINE static uint64_t memory_offset_buffer(uint64_t port_count) {
+    return align_up(2 * memory_allocated_mailbox(port_count), alignof(Packet));
+  }
 };
 
 /// The port provides the interface to communicate between the multiple
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 9761c64cb318..b9f1df204b41 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -38,12 +38,12 @@ static void call_fini_array_callbacks() {
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
-_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
   // We need to set up the RPC client first in case any of the constructors
   // require it.
   __llvm_libc::rpc::client.reset(__llvm_libc::rpc::default_port_count,
-                                 __llvm_libc::gpu::get_lane_size(), in, out,
-                                 buffer);
+                                 __llvm_libc::gpu::get_lane_size(),
+                                 rpc_shared_buffer);
 
   // We want the fini array callbacks to be run after other atexit
   // callbacks are run. So, we register them before running the init
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 78cdc64ed967..709a5936d82e 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -42,12 +42,12 @@ static void call_fini_array_callbacks() {
 } // namespace __llvm_libc
 
 extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
   // We need to set up the RPC client first in case any of the constructors
   // require it.
   __llvm_libc::rpc::client.reset(__llvm_libc::rpc::default_port_count,
-                                 __llvm_libc::gpu::get_lane_size(), in, out,
-                                 buffer);
+                                 __llvm_libc::gpu::get_lane_size(),
+                                 rpc_shared_buffer);
 
   // We want the fini array callbacks to be run after other atexit
   // callbacks are run. So, we register them before running the init
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index 2f55b3ac8fc4..fcff0ec1516e 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -28,9 +28,7 @@ struct begin_args_t {
   int argc;
   void *argv;
   void *envp;
-  void *inbox;
-  void *outbox;
-  void *buffer;
+  void *rpc_shared_buffer;
 };
 
 /// The arguments to the '_start' kernel.
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index ad5e02116918..fac81a2057b5 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -335,31 +335,18 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
   if (hsa_status_t err = hsa_agent_get_info(
           dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
     handle_error(err);
-  void *server_inbox;
-  void *server_outbox;
-  void *buffer;
-  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
-          finegrained_pool, port_size * sizeof(__llvm_libc::cpp::Atomic<int>),
-          /*flags=*/0, &server_inbox))
-    handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
-          finegrained_pool, port_size * sizeof(__llvm_libc::cpp::Atomic<int>),
-          /*flags=*/0, &server_outbox))
-    handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_allocate(
-          finegrained_pool,
-          port_size *
-              align_up(sizeof(__llvm_libc::rpc::Header) +
-                           (wavefront_size * sizeof(__llvm_libc::rpc::Buffer)),
-                       alignof(__llvm_libc::rpc::Packet)),
-          /*flags=*/0, &buffer))
-    handle_error(err);
-  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_inbox);
-  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox);
-  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer);
+
+  uint64_t rpc_shared_buffer_size =
+      __llvm_libc::rpc::Server::allocation_size(port_size, wavefront_size);
+  void *rpc_shared_buffer;
+  if (hsa_status_t err =
+          hsa_amd_memory_pool_allocate(finegrained_pool, rpc_shared_buffer_size,
+                                       /*flags=*/0, &rpc_shared_buffer))
+    handle_error(err);
+  hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_shared_buffer);
 
   // Initialize the RPC server's buffer for host-device communication.
-  server.reset(port_size, wavefront_size, server_inbox, server_outbox, buffer);
+  server.reset(port_size, wavefront_size, rpc_shared_buffer);
 
   // Obtain a queue with the minimum (power of two) size, used to send commands
   // to the HSA runtime and launch execution on the device.
@@ -374,8 +361,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
-  begin_args_t init_args = {argc,          dev_argv,     dev_envp,
-                            server_outbox, server_inbox, buffer};
+  begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};
   if (hsa_status_t err =
           launch_kernel(dev_agent, executable, kernargs_pool, queue,
                         single_threaded_params, "_begin.kd", init_args))
@@ -422,11 +408,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
   if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
     handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_free(server_inbox))
-    handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_free(server_outbox))
-    handle_error(err);
-  if (hsa_status_t err = hsa_amd_memory_pool_free(buffer))
+  if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_shared_buffer))
     handle_error(err);
   if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
     handle_error(err);
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 2230f55ea24e..c5c23604a1aa 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -248,24 +248,20 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
 
   uint64_t port_size = __llvm_libc::rpc::default_port_count;
   uint32_t warp_size = 32;
-  void *server_inbox =
-      allocator(port_size * sizeof(__llvm_libc::cpp::Atomic<int>));
-  void *server_outbox =
-      allocator(port_size * sizeof(__llvm_libc::cpp::Atomic<int>));
-  void *buffer = allocator(
-      port_size * align_up(sizeof(__llvm_libc::rpc::Header) +
-                               (warp_size * sizeof(__llvm_libc::rpc::Buffer)),
-                           alignof(__llvm_libc::rpc::Packet)));
-  if (!server_inbox || !server_outbox || !buffer)
+
+  uint64_t rpc_shared_buffer_size =
+      __llvm_libc::rpc::Server::allocation_size(port_size, warp_size);
+  void *rpc_shared_buffer = allocator(rpc_shared_buffer_size);
+
+  if (!rpc_shared_buffer)
     handle_error("Failed to allocate memory the RPC client / server.");
 
   // Initialize the RPC server's buffer for host-device communication.
-  server.reset(port_size, warp_size, server_inbox, server_outbox, buffer);
+  server.reset(port_size, warp_size, rpc_shared_buffer);
 
   LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
   // Call the kernel to
-  begin_args_t init_args = {argc,          dev_argv,     dev_envp,
-                            server_outbox, server_inbox, buffer};
+  begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};
   if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
                                    "_begin", init_args))
     handle_error(err);
@@ -295,11 +291,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
     handle_error(err);
   if (CUresult err = cuMemFreeHost(dev_argv))
     handle_error(err);
-  if (CUresult err = cuMemFreeHost(server_inbox))
-    handle_error(err);
-  if (CUresult err = cuMemFreeHost(server_outbox))
-    handle_error(err);
-  if (CUresult err = cuMemFreeHost(buffer))
+  if (CUresult err = cuMemFreeHost(rpc_shared_buffer))
     handle_error(err);
 
   // Destroy the context and the loaded binary.
-- 
cgit v1.2.1