summary | refs | log | tree | commit | diff
path: root/libc
diff options
context:
space:
mode:
author: Jon Chesterfield <jonathanchesterfield@gmail.com> 2023-05-11 03:04:55 +0100
committer: Jon Chesterfield <jonathanchesterfield@gmail.com> 2023-05-11 03:04:56 +0100
commit: bbeae142bfe2f2961816d51b45fb385821052b34 (patch)
tree: da951f84c4eff88c37983eef00d332db638bf59e /libc
parent: 657dbb4c394daec626bccfcaddc5341b8b9fc14e (diff)
download: llvm-bbeae142bfe2f2961816d51b45fb385821052b34.tar.gz
[libc][rpc] Allocate a single block of shared memory instead of three
Allows moving the pointer swap between server and client into reset. Single allocation simplifies whatever allocates the client/server, currently the libc loaders.
Reviewed By: jhuber6
Differential Revision: https://reviews.llvm.org/D150337
Diffstat (limited to 'libc')
-rw-r--r-- libc/src/__support/RPC/rpc.h 55
-rw-r--r-- libc/startup/gpu/amdgpu/start.cpp 6
-rw-r--r-- libc/startup/gpu/nvptx/start.cpp 6
-rw-r--r-- libc/utils/gpu/loader/Loader.h 4
-rw-r--r-- libc/utils/gpu/loader/amdgpu/Loader.cpp 42
-rw-r--r-- libc/utils/gpu/loader/nvptx/Loader.cpp 26
6 files changed, 78 insertions, 61 deletions
diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h
index 1285f2b7cd50..2304b4d9b242 100644
--- a/libc/src/__support/RPC/rpc.h
+++ b/libc/src/__support/RPC/rpc.h
@@ -114,13 +114,30 @@ template <bool InvertInbox> struct Process {
cpp::Atomic<uint32_t> lock[default_port_count] = {0};
/// Initialize the communication channels.
- LIBC_INLINE void reset(uint64_t port_count, uint32_t lane_size, void *inbox,
- void *outbox, void *packet) {
+ LIBC_INLINE void reset(uint64_t port_count, uint32_t lane_size, void *state) {
+ uint64_t p = memory_offset_primary_mailbox(port_count);
+ uint64_t s = memory_offset_secondary_mailbox(port_count);
this->port_count = port_count;
this->lane_size = lane_size;
- this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(inbox);
- this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(outbox);
- this->packet = reinterpret_cast<Packet *>(packet);
+ this->inbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
+ static_cast<char *>(state) + (InvertInbox ? s : p));
+ this->outbox = reinterpret_cast<cpp::Atomic<uint32_t> *>(
+ static_cast<char *>(state) + (InvertInbox ? p : s));
+ this->packet = reinterpret_cast<Packet *>(static_cast<char *>(state) +
+ memory_offset_buffer(port_count));
+ }
+
+ /// Allocate a single block of memory for use by client and server
+ /// template<size_t N>, N is generally a runtime value
+ /// struct equivalent {
+ /// atomic<uint32_t> primary[N];
+ /// atomic<uint32_t> secondary[N];
+ /// Packet buffer[N];
+ /// };
+ LIBC_INLINE static uint64_t allocation_size(uint64_t port_count,
+ uint32_t lane_size) {
+ return memory_offset_buffer(port_count) +
+ memory_allocated_buffer(port_count, lane_size);
}
/// The length of the packet is flexible because the server needs to look up
@@ -245,6 +262,34 @@ template <bool InvertInbox> struct Process {
fn(&packet.payload.slot[i], i);
}
}
+
+ /// Number of bytes allocated for mailbox or buffer
+ LIBC_INLINE static uint64_t memory_allocated_mailbox(uint64_t port_count) {
+ return port_count * sizeof(cpp::Atomic<uint32_t>);
+ }
+
+ LIBC_INLINE static uint64_t memory_allocated_buffer(uint64_t port_count,
+ uint32_t lane_size) {
+#if defined(LIBC_TARGET_ARCH_IS_GPU)
+ (void)lane_size;
+ return port_count * sizeof(Packet);
+#else
+ return port_count * (sizeof(Packet) + sizeof(Buffer) * lane_size);
+#endif
+ }
+
+ /// Offset of mailbox/buffer in single allocation
+ LIBC_INLINE static uint64_t
+ memory_offset_primary_mailbox(uint64_t /*port_count*/) {
+ return 0;
+ }
+ LIBC_INLINE static uint64_t
+ memory_offset_secondary_mailbox(uint64_t port_count) {
+ return memory_allocated_mailbox(port_count);
+ }
+ LIBC_INLINE static uint64_t memory_offset_buffer(uint64_t port_count) {
+ return align_up(2 * memory_allocated_mailbox(port_count), alignof(Packet));
+ }
};
/// The port provides the interface to communicate between the multiple
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index 9761c64cb318..b9f1df204b41 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -38,12 +38,12 @@ static void call_fini_array_callbacks() {
} // namespace __llvm_libc
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
-_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::default_port_count,
- __llvm_libc::gpu::get_lane_size(), in, out,
- buffer);
+ __llvm_libc::gpu::get_lane_size(),
+ rpc_shared_buffer);
// We want the fini array callbacks to be run after other atexit
// callbacks are run. So, we register them before running the init
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 78cdc64ed967..709a5936d82e 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -42,12 +42,12 @@ static void call_fini_array_callbacks() {
} // namespace __llvm_libc
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+_begin(int argc, char **argv, char **env, void *rpc_shared_buffer) {
// We need to set up the RPC client first in case any of the constructors
// require it.
__llvm_libc::rpc::client.reset(__llvm_libc::rpc::default_port_count,
- __llvm_libc::gpu::get_lane_size(), in, out,
- buffer);
+ __llvm_libc::gpu::get_lane_size(),
+ rpc_shared_buffer);
// We want the fini array callbacks to be run after other atexit
// callbacks are run. So, we register them before running the init
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index 2f55b3ac8fc4..fcff0ec1516e 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -28,9 +28,7 @@ struct begin_args_t {
int argc;
void *argv;
void *envp;
- void *inbox;
- void *outbox;
- void *buffer;
+ void *rpc_shared_buffer;
};
/// The arguments to the '_start' kernel.
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index ad5e02116918..fac81a2057b5 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -335,31 +335,18 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (hsa_status_t err = hsa_agent_get_info(
dev_agent, HSA_AGENT_INFO_WAVEFRONT_SIZE, &wavefront_size))
handle_error(err);
- void *server_inbox;
- void *server_outbox;
- void *buffer;
- if (hsa_status_t err = hsa_amd_memory_pool_allocate(
- finegrained_pool, port_size * sizeof(__llvm_libc::cpp::Atomic<int>),
- /*flags=*/0, &server_inbox))
- handle_error(err);
- if (hsa_status_t err = hsa_amd_memory_pool_allocate(
- finegrained_pool, port_size * sizeof(__llvm_libc::cpp::Atomic<int>),
- /*flags=*/0, &server_outbox))
- handle_error(err);
- if (hsa_status_t err = hsa_amd_memory_pool_allocate(
- finegrained_pool,
- port_size *
- align_up(sizeof(__llvm_libc::rpc::Header) +
- (wavefront_size * sizeof(__llvm_libc::rpc::Buffer)),
- alignof(__llvm_libc::rpc::Packet)),
- /*flags=*/0, &buffer))
- handle_error(err);
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_inbox);
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox);
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer);
+
+ uint64_t rpc_shared_buffer_size =
+ __llvm_libc::rpc::Server::allocation_size(port_size, wavefront_size);
+ void *rpc_shared_buffer;
+ if (hsa_status_t err =
+ hsa_amd_memory_pool_allocate(finegrained_pool, rpc_shared_buffer_size,
+ /*flags=*/0, &rpc_shared_buffer))
+ handle_error(err);
+ hsa_amd_agents_allow_access(1, &dev_agent, nullptr, rpc_shared_buffer);
// Initialize the RPC server's buffer for host-device communication.
- server.reset(port_size, wavefront_size, server_inbox, server_outbox, buffer);
+ server.reset(port_size, wavefront_size, rpc_shared_buffer);
// Obtain a queue with the minimum (power of two) size, used to send commands
// to the HSA runtime and launch execution on the device.
@@ -374,8 +361,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
handle_error(err);
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
- begin_args_t init_args = {argc, dev_argv, dev_envp,
- server_outbox, server_inbox, buffer};
+ begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};
if (hsa_status_t err =
launch_kernel(dev_agent, executable, kernargs_pool, queue,
single_threaded_params, "_begin.kd", init_args))
@@ -422,11 +408,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
handle_error(err);
- if (hsa_status_t err = hsa_amd_memory_pool_free(server_inbox))
- handle_error(err);
- if (hsa_status_t err = hsa_amd_memory_pool_free(server_outbox))
- handle_error(err);
- if (hsa_status_t err = hsa_amd_memory_pool_free(buffer))
+ if (hsa_status_t err = hsa_amd_memory_pool_free(rpc_shared_buffer))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(host_ret))
handle_error(err);
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 2230f55ea24e..c5c23604a1aa 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -248,24 +248,20 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
uint64_t port_size = __llvm_libc::rpc::default_port_count;
uint32_t warp_size = 32;
- void *server_inbox =
- allocator(port_size * sizeof(__llvm_libc::cpp::Atomic<int>));
- void *server_outbox =
- allocator(port_size * sizeof(__llvm_libc::cpp::Atomic<int>));
- void *buffer = allocator(
- port_size * align_up(sizeof(__llvm_libc::rpc::Header) +
- (warp_size * sizeof(__llvm_libc::rpc::Buffer)),
- alignof(__llvm_libc::rpc::Packet)));
- if (!server_inbox || !server_outbox || !buffer)
+
+ uint64_t rpc_shared_buffer_size =
+ __llvm_libc::rpc::Server::allocation_size(port_size, warp_size);
+ void *rpc_shared_buffer = allocator(rpc_shared_buffer_size);
+
+ if (!rpc_shared_buffer)
handle_error("Failed to allocate memory the RPC client / server.");
// Initialize the RPC server's buffer for host-device communication.
- server.reset(port_size, warp_size, server_inbox, server_outbox, buffer);
+ server.reset(port_size, warp_size, rpc_shared_buffer);
LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
// Call the kernel to
- begin_args_t init_args = {argc, dev_argv, dev_envp,
- server_outbox, server_inbox, buffer};
+ begin_args_t init_args = {argc, dev_argv, dev_envp, rpc_shared_buffer};
if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
"_begin", init_args))
handle_error(err);
@@ -295,11 +291,7 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
handle_error(err);
if (CUresult err = cuMemFreeHost(dev_argv))
handle_error(err);
- if (CUresult err = cuMemFreeHost(server_inbox))
- handle_error(err);
- if (CUresult err = cuMemFreeHost(server_outbox))
- handle_error(err);
- if (CUresult err = cuMemFreeHost(buffer))
+ if (CUresult err = cuMemFreeHost(rpc_shared_buffer))
handle_error(err);
// Destroy the context and the loaded binary.