Diffstat (limited to 'libc')
-rw-r--r--  libc/startup/gpu/amdgpu/start.cpp        |  76
-rw-r--r--  libc/startup/gpu/nvptx/start.cpp         |  78
-rw-r--r--  libc/utils/gpu/loader/Loader.h           |  23
-rw-r--r--  libc/utils/gpu/loader/amdgpu/Loader.cpp  | 247
-rw-r--r--  libc/utils/gpu/loader/nvptx/Loader.cpp   |  85
5 files changed, 242 insertions(+), 267 deletions(-)
diff --git a/libc/startup/gpu/amdgpu/start.cpp b/libc/startup/gpu/amdgpu/start.cpp
index b28ad7960bf2..d1dfc7b8c11d 100644
--- a/libc/startup/gpu/amdgpu/start.cpp
+++ b/libc/startup/gpu/amdgpu/start.cpp
@@ -17,8 +17,6 @@ namespace __llvm_libc {
static cpp::Atomic<uint32_t> lock = 0;
-static cpp::Atomic<uint32_t> count = 0;
-
extern "C" uintptr_t __init_array_start[];
extern "C" uintptr_t __init_array_end[];
extern "C" uintptr_t __fini_array_start[];
@@ -27,10 +25,6 @@ extern "C" uintptr_t __fini_array_end[];
using InitCallback = void(int, char **, char **);
using FiniCallback = void(void);
-static uint64_t get_grid_size() {
- return gpu::get_num_threads() * gpu::get_num_blocks();
-}
-
static void call_init_array_callbacks(int argc, char **argv, char **env) {
size_t init_array_size = __init_array_end - __init_array_start;
for (size_t i = 0; i < init_array_size; ++i)
@@ -43,56 +37,34 @@ static void call_fini_array_callbacks() {
reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
}
-void initialize(int argc, char **argv, char **env, void *in, void *out,
- void *buffer) {
- // We need a single GPU thread to perform the initialization of the global
- // constructors and data. We simply mask off all but a single thread and
- // execute.
- count.fetch_add(1, cpp::MemoryOrder::RELAXED);
- if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
- // We need to set up the RPC client first in case any of the constructors
- // require it.
- rpc::client.reset(gpu::get_lane_size(), &lock, in, out, buffer);
-
- // We want the fini array callbacks to be run after other atexit
- // callbacks are run. So, we register them before running the init
- // array callbacks as they can potentially register their own atexit
- // callbacks.
- atexit(&call_fini_array_callbacks);
- call_init_array_callbacks(argc, argv, env);
- }
-
- // We wait until every single thread launched on the GPU has seen the
- // initialization code. This will get very, very slow for high thread counts,
- // but for testing purposes it is unlikely to matter.
- while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
- rpc::sleep_briefly();
- gpu::sync_threads();
-}
-
-void finalize(int retval) {
- // We wait until every single thread launched on the GPU has finished
- // executing and reached the finalize region.
- count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
- while (count.load(cpp::MemoryOrder::RELAXED) != 0)
- rpc::sleep_briefly();
- gpu::sync_threads();
- if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
- // Only a single thread should call `exit` here, the rest should gracefully
- // return from the kernel. This is so only one thread calls the destructors
- // registred with 'atexit' above.
- __llvm_libc::exit(retval);
- }
-}
-
} // namespace __llvm_libc
extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
-_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
- void *buffer) {
- __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
+_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+ // We need to set up the RPC client first in case any of the constructors
+ // require it.
+ __llvm_libc::rpc::client.reset(__llvm_libc::gpu::get_lane_size(),
+ &__llvm_libc::lock, in, out, buffer);
+
+ // We want the fini array callbacks to be run after other atexit
+ // callbacks are run. So, we register them before running the init
+ // array callbacks as they can potentially register their own atexit
+ // callbacks.
+ __llvm_libc::atexit(&__llvm_libc::call_fini_array_callbacks);
+ __llvm_libc::call_init_array_callbacks(argc, argv, env);
+}
+extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+_start(int argc, char **argv, char **envp, int *ret) {
+ // Invoke the 'main' function with every active thread that the user launched
+ // the _start kernel with.
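+ // Each thread's result is combined with an atomic OR, so any thread
+ // returning a nonzero value makes the final result nonzero.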
__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+}
- __llvm_libc::finalize(*ret);
+extern "C" [[gnu::visibility("protected"), clang::amdgpu_kernel]] void
+_end(int retval) {
+ // Only a single thread should call `exit` here, the rest should gracefully
+ // return from the kernel. This is so only one thread calls the destructors
+ // registered with 'atexit' above.
+ __llvm_libc::exit(retval);
}
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 9ed755987a5d..83453ae1e47a 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -17,8 +17,6 @@ namespace __llvm_libc {
static cpp::Atomic<uint32_t> lock = 0;
-static cpp::Atomic<uint32_t> count = 0;
-
extern "C" {
// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
// to manually create them and update the globals in the loader implementation.
@@ -31,10 +29,6 @@ uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
using InitCallback = void(int, char **, char **);
using FiniCallback = void(void);
-static uint64_t get_grid_size() {
- return gpu::get_num_threads() * gpu::get_num_blocks();
-}
-
static void call_init_array_callbacks(int argc, char **argv, char **env) {
size_t init_array_size = __init_array_end - __init_array_start;
for (size_t i = 0; i < init_array_size; ++i)
@@ -47,59 +41,33 @@ static void call_fini_array_callbacks() {
reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
}
-// TODO: Put this in a separate kernel and call it with one thread.
-void initialize(int argc, char **argv, char **env, void *in, void *out,
- void *buffer) {
- // We need a single GPU thread to perform the initialization of the global
- // constructors and data. We simply mask off all but a single thread and
- // execute.
- count.fetch_add(1, cpp::MemoryOrder::RELAXED);
- if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
- // We need to set up the RPC client first in case any of the constructors
- // require it.
- rpc::client.reset(gpu::get_lane_size(), &lock, in, out, buffer);
-
- // We want the fini array callbacks to be run after other atexit
- // callbacks are run. So, we register them before running the init
- // array callbacks as they can potentially register their own atexit
- // callbacks.
- // FIXME: The function pointer escaping this TU causes warnings.
- __llvm_libc::atexit(&call_fini_array_callbacks);
- call_init_array_callbacks(argc, argv, env);
- }
-
- // We wait until every single thread launched on the GPU has seen the
- // initialization code. This will get very, very slow for high thread counts,
- // but for testing purposes it is unlikely to matter.
- while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
- rpc::sleep_briefly();
- gpu::sync_threads();
-}
-
-// TODO: Put this in a separate kernel and call it with one thread.
-void finalize(int retval) {
- // We wait until every single thread launched on the GPU has finished
- // executing and reached the finalize region.
- count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
- while (count.load(cpp::MemoryOrder::RELAXED) != 0)
- rpc::sleep_briefly();
- gpu::sync_threads();
- if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
- // Only a single thread should call `exit` here, the rest should gracefully
- // return from the kernel. This is so only one thread calls the destructors
- // registred with 'atexit' above.
- __llvm_libc::exit(retval);
- }
-}
-
} // namespace __llvm_libc
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
- void *buffer) {
- __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
+_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+ // We need to set up the RPC client first in case any of the constructors
+ // require it.
+ __llvm_libc::rpc::client.reset(__llvm_libc::gpu::get_lane_size(),
+ &__llvm_libc::lock, in, out, buffer);
+
+ // We want the fini array callbacks to be run after other atexit
+ // callbacks are run. So, we register them before running the init
+ // array callbacks as they can potentially register their own atexit
+ // callbacks.
+ __llvm_libc::atexit(&__llvm_libc::call_fini_array_callbacks);
+ __llvm_libc::call_init_array_callbacks(argc, argv, env);
+}
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
+_start(int argc, char **argv, char **envp, int *ret) {
+ // Invoke the 'main' function with every active thread that the user launched
+ // the _start kernel with.
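+ // Each thread's result is combined with an atomic OR, so any thread
+ // returning a nonzero value makes the final result nonzero.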
__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+}
- __llvm_libc::finalize(*ret);
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
+_end(int retval) {
+ // To finish execution we invoke all the callbacks registered via 'atexit'
+ // and then exit with the appropriate return value.
+ __llvm_libc::exit(retval);
}
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index feaa8e0079bb..2f55b3ac8fc4 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -23,6 +23,29 @@ struct LaunchParameters {
uint32_t num_blocks_z;
};
+/// The arguments to the '_begin' kernel.
+struct begin_args_t {
+ int argc;
+ void *argv;
+ void *envp;
+ void *inbox;
+ void *outbox;
+ void *buffer;
+};
+
+/// The arguments to the '_start' kernel.
+struct start_args_t {
+ int argc;
+ void *argv;
+ void *envp;
+ void *ret;
+};
+
+/// The arguments to the '_end' kernel.
+struct end_args_t {
+ int retval;
+};
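+
+/// These structs are copied verbatim into the kernel argument buffer, so
+/// their layout must exactly match the parameter lists of the '_begin',
+/// '_start', and '_end' kernels defined in the startup code.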
+
/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
/// Returns the final value of the `main` function on the device.
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index f9a7b75ff11b..ee12d6d63ffb 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -24,20 +24,6 @@
#include <cstring>
#include <utility>
-/// The name of the kernel we will launch. All AMDHSA kernels end with '.kd'.
-constexpr const char *KERNEL_START = "_start.kd";
-
-/// The arguments to the '_start' kernel.
-struct kernel_args_t {
- int argc;
- void *argv;
- void *envp;
- void *ret;
- void *inbox;
- void *outbox;
- void *buffer;
-};
-
/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {
if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
@@ -145,6 +131,105 @@ hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
return iterate_agent_memory_pools(agent, cb);
}
+template <typename args_t>
+hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
+ hsa_amd_memory_pool_t kernargs_pool,
+ hsa_queue_t *queue, const LaunchParameters &params,
+ const char *kernel_name, args_t kernel_args) {
+ // Look up the requested kernel in the loaded executable.
+ hsa_executable_symbol_t symbol;
+ if (hsa_status_t err = hsa_executable_get_symbol_by_name(
+ executable, kernel_name, &dev_agent, &symbol))
+ return err;
+
+ // Retrieve different properties of the kernel symbol used for launch.
+ uint64_t kernel;
+ uint32_t args_size;
+ uint32_t group_size;
+ uint32_t private_size;
+
+ std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
+
+ for (auto &[info, value] : symbol_infos)
+ if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
+ return err;
+
+ // Allocate space for the kernel arguments on the host and allow the GPU agent
+ // to access it.
+ void *args;
+ if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
+ /*flags=*/0, &args))
+ handle_error(err);
+ hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);
+
+ // Initialize all the arguments (explicit and implicit) to zero, then set the
+ // explicit arguments to the values provided by the caller.
+ std::memset(args, 0, args_size);
+ std::memcpy(args, &kernel_args, sizeof(args_t));
+
+ // Obtain a packet from the queue.
+ uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
+ while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
+ ;
+
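+ // The queue size is a power of two, so masking the monotonically increasing
+ // packet id yields this packet's slot in the ring buffer.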
+ const uint32_t mask = queue->size - 1;
+ hsa_kernel_dispatch_packet_t *packet =
+ static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
+ (packet_id & mask);
+
+ // Set up the packet for execution on the device using the grid and block
+ // dimensions given by the launch parameters.
+ std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
+ packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
+ (params.num_blocks_z * params.num_threads_z != 1))
+ << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+ packet->workgroup_size_x = params.num_threads_x;
+ packet->workgroup_size_y = params.num_threads_y;
+ packet->workgroup_size_z = params.num_threads_z;
+ packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
+ packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
+ packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
+ packet->private_segment_size = private_size;
+ packet->group_segment_size = group_size;
+ packet->kernel_object = kernel;
+ packet->kernarg_address = args;
+
+ // Create a signal to indicate when this packet has been completed.
+ if (hsa_status_t err =
+ hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
+ handle_error(err);
+
+ // Initialize the packet header and set the doorbell signal to begin execution
+ // by the HSA runtime.
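+ // The setup field is packed into the upper half of the same 32-bit store so
+ // the packet becomes valid to the packet processor in one atomic write.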
+ uint16_t setup = packet->setup;
+ uint16_t header =
+ (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+ (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
+ (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+ __atomic_store_n(&packet->header, header | (setup << 16), __ATOMIC_RELEASE);
+ hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
+
+ // Wait until the kernel has completed execution on the device. Periodically
+ // check the RPC client for work to be performed on the server.
+ while (hsa_signal_wait_scacquire(
+ packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
+ /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
+ handle_server();
+
+ // Destroy the resources acquired to launch the kernel and return.
+ if (hsa_status_t err = hsa_amd_memory_pool_free(args))
+ handle_error(err);
+ if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
+ handle_error(err);
+
+ return HSA_STATUS_SUCCESS;
+}
+
int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters &params) {
// Initialize the HSA runtime used to communicate with the device.
@@ -169,18 +254,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
handle_error(err);
- // Obtain a queue with the minimum (power of two) size, used to send commands
- // to the HSA runtime and launch execution on the device.
- uint64_t queue_size;
- if (hsa_status_t err = hsa_agent_get_info(
- dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
- handle_error(err);
- hsa_queue_t *queue = nullptr;
- if (hsa_status_t err =
- hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_SINGLE,
- nullptr, nullptr, UINT32_MAX, UINT32_MAX, &queue))
- handle_error(err);
-
// Load the code object's ISA information and executable data segments.
hsa_code_object_t object;
if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
@@ -228,36 +301,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
dev_agent, &coarsegrained_pool))
handle_error(err);
- // Look up the '_start' kernel in the loaded executable.
- hsa_executable_symbol_t symbol;
- if (hsa_status_t err = hsa_executable_get_symbol_by_name(
- executable, KERNEL_START, &dev_agent, &symbol))
- handle_error(err);
-
- // Retrieve different properties of the kernel symbol used for launch.
- uint64_t kernel;
- uint32_t args_size;
- uint32_t group_size;
- uint32_t private_size;
-
- std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
- {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
- {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
- {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
- {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
-
- for (auto &[info, value] : symbol_infos)
- if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
- handle_error(err);
-
- // Allocate space for the kernel arguments on the host and allow the GPU agent
- // to access it.
- void *args;
- if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
- /*flags=*/0, &args))
- handle_error(err);
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);
-
// Allocate fine-grained memory on the host to hold the pointer array for the
// copied argv and allow the GPU agent to access it.
auto allocator = [&](uint64_t size) -> void * {
@@ -313,69 +356,33 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer);
- // Initialie all the arguments (explicit and implicit) to zero, then set the
- // explicit arguments to the values created above.
- std::memset(args, 0, args_size);
- kernel_args_t *kernel_args = reinterpret_cast<kernel_args_t *>(args);
- kernel_args->argc = argc;
- kernel_args->argv = dev_argv;
- kernel_args->envp = dev_envp;
- kernel_args->ret = dev_ret;
- kernel_args->inbox = server_outbox;
- kernel_args->outbox = server_inbox;
- kernel_args->buffer = buffer;
-
- // Obtain a packet from the queue.
- uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
- while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue_size)
- ;
-
- const uint32_t mask = queue_size - 1;
- hsa_kernel_dispatch_packet_t *packet =
- (hsa_kernel_dispatch_packet_t *)queue->base_address + (packet_id & mask);
-
- // Set up the packet for exeuction on the device. We currently only launch
- // with one thread on the device, forcing the rest of the wavefront to be
- // masked off.
- std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
- packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
- (params.num_blocks_z * params.num_threads_z != 1))
- << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
- packet->workgroup_size_x = params.num_threads_x;
- packet->workgroup_size_y = params.num_threads_y;
- packet->workgroup_size_z = params.num_threads_z;
- packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
- packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
- packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
- packet->private_segment_size = private_size;
- packet->group_segment_size = group_size;
- packet->kernel_object = kernel;
- packet->kernarg_address = args;
+ // Initialize the RPC server's buffer for host-device communication.
+ server.reset(wavefront_size, &lock, server_inbox, server_outbox, buffer);
- // Create a signal to indicate when this packet has been completed.
+ // Obtain a queue with the minimum (power of two) size, used to send commands
+ // to the HSA runtime and launch execution on the device.
+ uint64_t queue_size;
+ if (hsa_status_t err = hsa_agent_get_info(
+ dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
+ handle_error(err);
+ hsa_queue_t *queue = nullptr;
if (hsa_status_t err =
- hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
+ hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
+ nullptr, UINT32_MAX, UINT32_MAX, &queue))
handle_error(err);
- // Initialize the RPC server's buffer for host-device communication.
- server.reset(wavefront_size, &lock, server_inbox, server_outbox, buffer);
-
- // Initialize the packet header and set the doorbell signal to begin execution
- // by the HSA runtime.
- uint16_t header =
- (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
- (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
- (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
- __atomic_store_n(&packet->header, header | (packet->setup << 16),
- __ATOMIC_RELEASE);
- hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
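+ // Launch the '_begin' kernel with a single thread and block so the RPC
+ // client is set up and the global constructors run exactly once. AMDHSA
+ // kernel symbols are suffixed with '.kd'.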
+ LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
+ begin_args_t init_args = {argc, dev_argv, dev_envp,
+ server_outbox, server_inbox, buffer};
+ if (hsa_status_t err =
+ launch_kernel(dev_agent, executable, kernargs_pool, queue,
+ single_threaded_params, "_begin.kd", init_args))
+ handle_error(err);
- // Wait until the kernel has completed execution on the device. Periodically
- // check the RPC client for work to be performed on the server.
- while (hsa_signal_wait_scacquire(
- packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
- /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
- handle_server();
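+ // Launch the '_start' kernel with the user-specified launch parameters to
+ // run the 'main' function on the device.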
+ start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
+ if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
+ queue, params, "_start.kd", args))
+ handle_error(err);
// Create a memory signal and copy the return value back from the device into
// a new buffer.
@@ -402,9 +409,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
// Save the return value and perform basic clean-up.
int ret = *static_cast<int *>(host_ret);
- // Free the memory allocated for the device.
- if (hsa_status_t err = hsa_amd_memory_pool_free(args))
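+ // Launch the '_end' kernel with a single thread to run the functions
+ // registered with 'atexit' and exit with the return value of 'main'.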
+ end_args_t fini_args = {ret};
+ if (hsa_status_t err =
+ launch_kernel(dev_agent, executable, kernargs_pool, queue,
+ single_threaded_params, "_end.kd", fini_args))
handle_error(err);
+
+ // Free the memory allocated for the device.
if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
@@ -420,10 +431,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (hsa_status_t err = hsa_signal_destroy(memory_signal))
handle_error(err);
-
- if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
- handle_error(err);
-
if (hsa_status_t err = hsa_queue_destroy(queue))
handle_error(err);
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 77e6967dd022..ca18da939f4c 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -30,17 +30,6 @@
using namespace llvm;
using namespace object;
-/// The arguments to the '_start' kernel.
-struct kernel_args_t {
- int argc;
- void *argv;
- void *envp;
- void *ret;
- void *inbox;
- void *outbox;
- void *buffer;
-};
-
static void handle_error(CUresult err) {
if (err == CUDA_SUCCESS)
return;
@@ -170,6 +159,36 @@ Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
return dev_memory;
}
+template <typename args_t>
+CUresult launch_kernel(CUmodule binary, CUstream stream,
+ const LaunchParameters &params, const char *kernel_name,
+ args_t kernel_args) {
+ // Look up the requested kernel in the loaded module.
+ CUfunction function;
+ if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
+ handle_error(err);
+
+ // Set up the arguments to the kernel on the GPU.
+ uint64_t args_size = sizeof(args_t);
+ void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
+ CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
+ CU_LAUNCH_PARAM_END};
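+
+ // CU_LAUNCH_PARAM_BUFFER_POINTER passes this packed struct directly as the
+ // kernel's parameter buffer, so its layout must match the kernel's
+ // parameter list exactly.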
+
+ // Call the kernel with the given arguments.
+ if (CUresult err = cuLaunchKernel(
+ function, params.num_blocks_x, params.num_blocks_y,
+ params.num_blocks_z, params.num_threads_x, params.num_threads_y,
+ params.num_threads_z, 0, stream, nullptr, args_config))
+ handle_error(err);
+
+ // Wait until the kernel has completed execution on the device. Periodically
+ // check the RPC client for work to be performed on the server.
+ while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
+ handle_server();
+
+ return CUDA_SUCCESS;
+}
+
int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters &params) {
@@ -197,11 +216,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
handle_error(err);
- // look up the '_start' kernel in the loaded module.
- CUfunction function;
- if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
- handle_error(err);
-
// Allocate pinned memory on the host to hold the pointer array for the
// copied argv and allow the GPU device to access it.
auto allocator = [&](uint64_t size) -> void * {
@@ -242,35 +256,21 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (!server_inbox || !server_outbox || !buffer)
handle_error("Failed to allocate memory the RPC client / server.");
- // Set up the arguments to the '_start' kernel on the GPU.
- uint64_t args_size = sizeof(kernel_args_t);
- kernel_args_t args;
- std::memset(&args, 0, args_size);
- args.argc = argc;
- args.argv = dev_argv;
- args.envp = dev_envp;
- args.ret = reinterpret_cast<void *>(dev_ret);
- args.inbox = server_outbox;
- args.outbox = server_inbox;
- args.buffer = buffer;
- void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
- CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
- CU_LAUNCH_PARAM_END};
-
// Initialize the RPC server's buffer for host-device communication.
server.reset(warp_size, &lock, server_inbox, server_outbox, buffer);
- // Call the kernel with the given arguments.
- if (CUresult err = cuLaunchKernel(
- function, params.num_blocks_x, params.num_blocks_y,
- params.num_blocks_z, params.num_threads_x, params.num_threads_y,
- params.num_threads_z, 0, stream, nullptr, args_config))
+ LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
+ // Call the '_begin' kernel to set up the RPC client and run the global
+ // constructors before launching 'main'.
+ begin_args_t init_args = {argc, dev_argv, dev_envp,
+ server_outbox, server_inbox, buffer};
+ if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
+ "_begin", init_args))
handle_error(err);
- // Wait until the kernel has completed execution on the device. Periodically
- // check the RPC client for work to be performed on the server.
- while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
- handle_server();
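+ // Launch the '_start' kernel with the user-supplied launch parameters to
+ // run the 'main' function on the device.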
+ start_args_t args = {argc, dev_argv, dev_envp,
+ reinterpret_cast<void *>(dev_ret)};
+ if (CUresult err = launch_kernel(binary, stream, params, "_start", args))
+ handle_error(err);
// Copy the return value back from the kernel and wait.
int host_ret = 0;
@@ -280,6 +280,11 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);
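+ // Launch the '_end' kernel with a single thread to run the functions
+ // registered with 'atexit' and exit the program on the device.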
+ end_args_t fini_args = {host_ret};
+ if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
+ "_end", fini_args))
+ handle_error(err);
+
// Free the memory allocated for the device.
if (CUresult err = cuMemFreeHost(*memory_or_err))
handle_error(err);