From bbeae142bfe2f2961816d51b45fb385821052b34 Mon Sep 17 00:00:00 2001 From: Jon Chesterfield Date: Thu, 11 May 2023 03:04:55 +0100 Subject: [libc][rpc] Allocate a single block of shared memory instead of three Allows moving the pointer swap between server and client into reset. Single allocation simplifies whatever allocates the client/server, currently the libc loaders. Reviewed By: jhuber6 Differential Revision: https://reviews.llvm.org/D150337 --- libc/src/__support/RPC/rpc.h | 55 ++++++++++++++++++++++++++++++--- libc/startup/gpu/amdgpu/start.cpp | 6 ++-- libc/startup/gpu/nvptx/start.cpp | 6 ++-- libc/utils/gpu/loader/Loader.h | 4 +-- libc/utils/gpu/loader/amdgpu/Loader.cpp | 42 +++++++------------------ libc/utils/gpu/loader/nvptx/Loader.cpp | 26 ++++++---------- 6 files changed, 78 insertions(+), 61 deletions(-) (limited to 'libc') diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h index 1285f2b7cd50..2304b4d9b242 100644 --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -114,13 +114,30 @@ template struct Process { cpp::Atomic lock[default_port_count] = {0}; /// Initialize the communication channels. - LIBC_INLINE void reset(uint64_t port_count, uint32_t lane_size, void *inbox, - void *outbox, void *packet) { + LIBC_INLINE void reset(uint64_t port_count, uint32_t lane_size, void *state) { + uint64_t p = memory_offset_primary_mailbox(port_count); + uint64_t s = memory_offset_secondary_mailbox(port_count); this->port_count = port_count; this->lane_size = lane_size; - this->inbox = reinterpret_cast *>(inbox); - this->outbox = reinterpret_cast *>(outbox); - this->packet = reinterpret_cast(packet); + this->inbox = reinterpret_cast *>( + static_cast(state) + (InvertInbox ? s : p)); + this->outbox = reinterpret_cast *>( + static_cast(state) + (InvertInbox ? p : s)); + this->packet = reinterpret_cast(static_cast(state) + + memory_offset_buffer(port_count)); + } + + /// Allocate a single block of memory for use by client and server + /// template, N is generally a runtime value + /// struct equivalent { + /// atomic primary[N]; + /// atomic secondary[N]; + /// Packet buffer[N]; + /// }; + LIBC_INLINE static uint64_t allocation_size(uint64_t port_count, + uint32_t lane_size) { + return memory_offset_buffer(port_count) + + memory_allocated_buffer(port_count, lane_size); } /// The length of the packet is flexible because the server needs to look up @@ -245,6 +262,34 @@ template struct Process { fn(&packet.payload.slot[i], i); } } + + /// Number of bytes allocated for mailbox or buffer + LIBC_INLINE static uint64_t memory_allocated_mailbox(uint64_t port_count) { + return port_count * sizeof(cpp::Atomic); + } + + LIBC_INLINE static uint64_t memory_allocated_buffer(uint64_t port_count, + uint32_t lane_size) { +#if defined(LIBC_TARGET_ARCH_IS_GPU) + (void)lane_size; + return port_count * sizeof(Packet); +#else + return port_count * (sizeof(Packet) + sizeof(Buffer) * lane_size); +#endif + } + + /// Offset of mailbox/buffer in single allocation + LIBC_INLINE static uint64_t + memory_offset_primary_mailbox(uint64_t /*port_count*/) { + return 0; + } + LIBC_INLINE static uint64_t + memory_offset_secondary_mailbox(uint64_t port_count) { + return memory_allocated_mailbox(port_count); + } + LIBC_INLINE static uint64_t memory_offset_buffer(uint64_t port_count) { + return align_up(2 * memory_allocated_mailbox(port_count), alignof(Packet)); + } }; /// The port provides the interface to communicate between the multiple diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp index 9761c64cb318..b9f1df204b41 100644 --- a/libc/startup/gpu/amdgpu/start.cpp +++ b/libc/startup/gpu/amdgpu/start.cpp @@ -38,12 +38,12 @@ static void call_fini_array_callbacks() { } // namespace __llvm_libc extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void -_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) { +_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) { // We need to set up the RPC client first in case any of the constructors // require it. __llvm_libc::rpc::client.reset(__llvm_libc::rpc::default_port_count, - __llvm_libc::gpu::get_lane_size(), in, out, - buffer); + __llvm_libc::gpu::get_lane_size(), + rpc_shared_buffer); // We want the fini array callbacks to be run after other atexit // callbacks are run. So, we register them before running the init diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index 78cdc64ed967..709a5936d82e 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -42,12 +42,12 @@ static void call_fini_array_callbacks() { } // namespace __llvm_libc extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void -_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) { +_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) { // We need to set up the RPC client first in case any of the constructors // require it. __llvm_libc::rpc::client.reset(__llvm_libc::rpc::default_port_count, - __llvm_libc::gpu::get_lane_size(), in, out, - buffer); + __llvm_libc::gpu::get_lane_size(), + rpc_shared_buffer); // We want the fini array callbacks to be run after other atexit // callbacks are run. So, we register them before running the init diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h index 2f55b3ac8fc4..fcff0ec1516e 100644 --- a/libc/utils/gpu/loader/Loader.h +++ b/libc/utils/gpu/loader/Loader.h @@ -28,9 +28,7 @@ struct begin_args_t { int argc; void *argv; void *envp; - void *inbox; - void *outbox; - void *buffer; + void *rpc_shared_buffer; }; /// The arguments to the '_start' kernel. diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp index ad5e02116918..fac81a2057b5 100644 --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -335,31 +335,18 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, if (hsa_status_t err = hsa_agent_get_info( dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size)) handle_error(err); - void *server_inbox; - void *server_outbox; - void *buffer; - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, port_size * sizeof(__llvm_libc::cpp::Atomic), - /*flags=*/0, &server_inbox)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, port_size * sizeof(__llvm_libc::cpp::Atomic), - /*flags=*/0, &server_outbox)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_allocate( - finegrained_pool, - port_size * - align_up(sizeof(__llvm_libc::rpc::Header) + - (wavefront_size * sizeof(__llvm_libc::rpc::Buffer)), - alignof(__llvm_libc::rpc::Packet)), - /*flags=*/0, &buffer)) - handle_error(err); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_inbox); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox); - hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer); + + uint64_t rpc_shared_buffer_size = + __llvm_libc::rpc::Server::allocation_size(port_size, wavefront_size); + void *rpc_shared_buffer; + if (hsa_status_t err = + hsa_amd_memory_pool_allocate(finegrained_pool, rpc_shared_buffer_size, + /*flags=*/0, &rpc_shared_buffer)) + handle_error(err); + hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_shared_buffer); // Initialize the RPC server's buffer for host-device communication. - server.reset(port_size, wavefront_size, server_inbox, server_outbox, buffer); + server.reset(port_size, wavefront_size, rpc_shared_buffer); // Obtain a queue with the minimum (power of two) size, used to send commands // to the HSA runtime and launch execution on the device. @@ -374,8 +361,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; - begin_args_t init_args = {argc, dev_argv, dev_envp, - server_outbox, server_inbox, buffer}; + begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer}; if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool, queue, single_threaded_params, "_begin.kd", init_args)) @@ -422,11 +408,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret)) handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_free(server_inbox)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_free(server_outbox)) - handle_error(err); - if (hsa_status_t err = hsa_amd_memory_pool_free(buffer)) + if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_shared_buffer)) handle_error(err); if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret)) handle_error(err); diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp index 2230f55ea24e..c5c23604a1aa 100644 --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -248,24 +248,20 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, uint64_t port_size = __llvm_libc::rpc::default_port_count; uint32_t warp_size = 32; - void *server_inbox = - allocator(port_size * sizeof(__llvm_libc::cpp::Atomic)); - void *server_outbox = - allocator(port_size * sizeof(__llvm_libc::cpp::Atomic)); - void *buffer = allocator( - port_size * align_up(sizeof(__llvm_libc::rpc::Header) + - (warp_size * sizeof(__llvm_libc::rpc::Buffer)), - alignof(__llvm_libc::rpc::Packet))); - if (!server_inbox || !server_outbox || !buffer) + + uint64_t rpc_shared_buffer_size = + __llvm_libc::rpc::Server::allocation_size(port_size, warp_size); + void *rpc_shared_buffer = allocator(rpc_shared_buffer_size); + + if (!rpc_shared_buffer) handle_error("Failed to allocate memory the RPC client / server."); // Initialize the RPC server's buffer for host-device communication. - server.reset(port_size, warp_size, server_inbox, server_outbox, buffer); + server.reset(port_size, warp_size, rpc_shared_buffer); LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1}; // Call the kernel to - begin_args_t init_args = {argc, dev_argv, dev_envp, - server_outbox, server_inbox, buffer}; + begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer}; if (CUresult err = launch_kernel(binary, stream, single_threaded_params, "_begin", init_args)) handle_error(err); @@ -295,11 +291,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size, handle_error(err); if (CUresult err = cuMemFreeHost(dev_argv)) handle_error(err); - if (CUresult err = cuMemFreeHost(server_inbox)) - handle_error(err); - if (CUresult err = cuMemFreeHost(server_outbox)) - handle_error(err); - if (CUresult err = cuMemFreeHost(buffer)) + if (CUresult err = cuMemFreeHost(rpc_shared_buffer)) handle_error(err); // Destroy the context and the loaded binary. -- cgit v1.2.1