author     Joseph Huber <jhuber6@vols.utk.edu>  2023-05-01 08:17:39 -0500
committer  Joseph Huber <jhuber6@vols.utk.edu>  2023-05-04 19:31:41 -0500
commit     901266dad313c114e12c181651249e30e5902e26 (patch)
tree       f58bb03eeae716e0df8ee5416d97e0946bb1b7ad /libc/utils
parent     507edb52f9a9a5c1ab2a92ec2e291a7b63c3fbff (diff)
download   llvm-901266dad313c114e12c181651249e30e5902e26.tar.gz
[libc] Change GPU startup and loader to use multiple kernels
The GPU has a different execution model from standard `_start` implementations. On the GPU, all threads are active at the start of a kernel. In order to correctly initialize and call the constructors we want single-threaded semantics. Previously, this was done using a makeshift global barrier with atomics. However, it should be easier to simply put the portions of the code that must be single threaded in separate kernels and then call those with only one thread. Generally, mixing global state between kernel launches makes optimizations more difficult, similar to calling a function outside of the TU, but for testing it is better to be correct.

Depends on D149527, D148943

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D149581
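
For context, here is a hypothetical sketch (not part of this patch) of the device-side entry points the loader now dispatches. The signatures are inferred from the begin_args_t, start_args_t, and end_args_t structs added to Loader.h below; the parameter names and the per-phase comments are assumptions.

extern "C" {
// Launched with a single thread: initialize the RPC client state and run the
// global constructors before any other threads exist.
void _begin(int argc, char **argv, char **envp, void *rpc_inbox,
            void *rpc_outbox, void *rpc_buffer);

// Launched with the user's full launch parameters: runs the application's
// main() and writes its return value through 'ret'.
void _start(int argc, char **argv, char **envp, int *ret);

// Launched with a single thread again: presumably runs destructors and
// atexit handlers using main's return value.
void _end(int retval);
}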
Diffstat (limited to 'libc/utils')
-rw-r--r--  libc/utils/gpu/loader/Loader.h          |  23
-rw-r--r--  libc/utils/gpu/loader/amdgpu/Loader.cpp | 247
-rw-r--r--  libc/utils/gpu/loader/nvptx/Loader.cpp  |  85
3 files changed, 195 insertions, 160 deletions
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index feaa8e0079bb..2f55b3ac8fc4 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -23,6 +23,29 @@ struct LaunchParameters {
uint32_t num_blocks_z;
};
+/// The arguments to the '_begin' kernel.
+struct begin_args_t {
+ int argc;
+ void *argv;
+ void *envp;
+ void *inbox;
+ void *outbox;
+ void *buffer;
+};
+
+/// The arguments to the '_start' kernel.
+struct start_args_t {
+ int argc;
+ void *argv;
+ void *envp;
+ void *ret;
+};
+
+/// The arguments to the '_end' kernel.
+struct end_args_t {
+ int argc;
+};
+
/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
/// Returns the final value of the `main` function on the device.
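
Putting the three argument structs together with LaunchParameters, the host-side control flow both backends now follow looks roughly like the sketch below. run_gpu_program and its 'launch' parameter are illustrative stand-ins for the backend-specific launch_kernel helpers added in the two loaders; the device-pointer arguments are assumed to already be device-accessible.

template <typename LaunchFn>
int run_gpu_program(LaunchFn &&launch, const LaunchParameters &params,
                    int argc, void *dev_argv, void *dev_envp, void *dev_ret,
                    void *inbox, void *outbox, void *buffer) {
  // Constructors and RPC setup need single-threaded semantics.
  LaunchParameters single_threaded = {1, 1, 1, 1, 1, 1};
  begin_args_t init = {argc, dev_argv, dev_envp, inbox, outbox, buffer};
  launch("_begin", single_threaded, init);

  // The user's main() runs with the requested launch geometry.
  start_args_t start = {argc, dev_argv, dev_envp, dev_ret};
  launch("_start", params, start);

  // The real loaders copy the return value back from 'dev_ret' at this point;
  // a placeholder stands in for that copy.
  int ret = 0;

  // Destructors run single-threaded, passed main's return value.
  end_args_t fini = {ret};
  launch("_end", single_threaded, fini);
  return ret;
}

Only '_start' uses the caller's launch parameters; '_begin' and '_end' always run with a 1x1x1 grid.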
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index f9a7b75ff11b..ee12d6d63ffb 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -24,20 +24,6 @@
#include <cstring>
#include <utility>
-/// The name of the kernel we will launch. All AMDHSA kernels end with '.kd'.
-constexpr const char *KERNEL_START = "_start.kd";
-
-/// The arguments to the '_start' kernel.
-struct kernel_args_t {
- int argc;
- void *argv;
- void *envp;
- void *ret;
- void *inbox;
- void *outbox;
- void *buffer;
-};
-
/// Print the error code and exit if \p code indicates an error.
static void handle_error(hsa_status_t code) {
if (code == HSA_STATUS_SUCCESS || code == HSA_STATUS_INFO_BREAK)
@@ -145,6 +131,105 @@ hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
return iterate_agent_memory_pools(agent, cb);
}
+template <typename args_t>
+hsa_status_t launch_kernel(hsa_agent_t dev_agent, hsa_executable_t executable,
+ hsa_amd_memory_pool_t kernargs_pool,
+ hsa_queue_t *queue, const LaunchParameters &params,
+ const char *kernel_name, args_t kernel_args) {
+ // Look up the named kernel in the loaded executable.
+ hsa_executable_symbol_t symbol;
+ if (hsa_status_t err = hsa_executable_get_symbol_by_name(
+ executable, kernel_name, &dev_agent, &symbol))
+ return err;
+
+ // Retrieve different properties of the kernel symbol used for launch.
+ uint64_t kernel;
+ uint32_t args_size;
+ uint32_t group_size;
+ uint32_t private_size;
+
+ std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
+ {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
+
+ for (auto &[info, value] : symbol_infos)
+ if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
+ return err;
+
+ // Allocate space for the kernel arguments on the host and allow the GPU agent
+ // to access it.
+ void *args;
+ if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
+ /*flags=*/0, &args))
+ handle_error(err);
+ hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);
+
+ // Initialize all the arguments (explicit and implicit) to zero, then set the
+ // explicit arguments to the values created above.
+ std::memset(args, 0, args_size);
+ std::memcpy(args, &kernel_args, sizeof(args_t));
+
+ // Obtain a packet from the queue.
+ uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
+ while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue->size)
+ ;
+
+ const uint32_t mask = queue->size - 1;
+ hsa_kernel_dispatch_packet_t *packet =
+ static_cast<hsa_kernel_dispatch_packet_t *>(queue->base_address) +
+ (packet_id & mask);
+
+ // Set up the packet for execution on the device. We currently only launch
+ // with one thread on the device, forcing the rest of the wavefront to be
+ // masked off.
+ std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
+ packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
+ (params.num_blocks_z * params.num_threads_z != 1))
+ << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+ packet->workgroup_size_x = params.num_threads_x;
+ packet->workgroup_size_y = params.num_threads_y;
+ packet->workgroup_size_z = params.num_threads_z;
+ packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
+ packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
+ packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
+ packet->private_segment_size = private_size;
+ packet->group_segment_size = group_size;
+ packet->kernel_object = kernel;
+ packet->kernarg_address = args;
+
+ // Create a signal to indicate when this packet has been completed.
+ if (hsa_status_t err =
+ hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
+ handle_error(err);
+
+ // Initialize the packet header and set the doorbell signal to begin execution
+ // by the HSA runtime.
+ uint16_t setup = packet->setup;
+ uint16_t header =
+ (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
+ (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
+ (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
+ __atomic_store_n(&packet->header, header | (setup << 16), __ATOMIC_RELEASE);
+ hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
+
+ // Wait until the kernel has completed execution on the device. Periodically
+ // check the RPC client for work to be performed on the server.
+ while (hsa_signal_wait_scacquire(
+ packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
+ /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
+ handle_server();
+
+ // Destroy the resources acquired to launch the kernel and return.
+ if (hsa_status_t err = hsa_amd_memory_pool_free(args))
+ handle_error(err);
+ if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
+ handle_error(err);
+
+ return HSA_STATUS_SUCCESS;
+}
+
int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters &params) {
// Initialize the HSA runtime used to communicate with the device.
@@ -169,18 +254,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (hsa_status_t err = get_agent<HSA_DEVICE_TYPE_CPU>(&host_agent))
handle_error(err);
- // Obtain a queue with the minimum (power of two) size, used to send commands
- // to the HSA runtime and launch execution on the device.
- uint64_t queue_size;
- if (hsa_status_t err = hsa_agent_get_info(
- dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
- handle_error(err);
- hsa_queue_t *queue = nullptr;
- if (hsa_status_t err =
- hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_SINGLE,
- nullptr, nullptr, UINT32_MAX, UINT32_MAX, &queue))
- handle_error(err);
-
// Load the code object's ISA information and executable data segments.
hsa_code_object_t object;
if (hsa_status_t err = hsa_code_object_deserialize(image, size, "", &object))
@@ -228,36 +301,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
dev_agent, &coarsegrained_pool))
handle_error(err);
- // Look up the '_start' kernel in the loaded executable.
- hsa_executable_symbol_t symbol;
- if (hsa_status_t err = hsa_executable_get_symbol_by_name(
- executable, KERNEL_START, &dev_agent, &symbol))
- handle_error(err);
-
- // Retrieve different properties of the kernel symbol used for launch.
- uint64_t kernel;
- uint32_t args_size;
- uint32_t group_size;
- uint32_t private_size;
-
- std::pair<hsa_executable_symbol_info_t, void *> symbol_infos[] = {
- {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel},
- {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, &args_size},
- {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, &group_size},
- {HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, &private_size}};
-
- for (auto &[info, value] : symbol_infos)
- if (hsa_status_t err = hsa_executable_symbol_get_info(symbol, info, value))
- handle_error(err);
-
- // Allocate space for the kernel arguments on the host and allow the GPU agent
- // to access it.
- void *args;
- if (hsa_status_t err = hsa_amd_memory_pool_allocate(kernargs_pool, args_size,
- /*flags=*/0, &args))
- handle_error(err);
- hsa_amd_agents_allow_access(1, &dev_agent, nullptr, args);
-
// Allocate fine-grained memory on the host to hold the pointer array for the
// copied argv and allow the GPU agent to access it.
auto allocator = [&](uint64_t size) -> void * {
@@ -313,69 +356,33 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, server_outbox);
hsa_amd_agents_allow_access(1, &dev_agent, nullptr, buffer);
- // Initialie all the arguments (explicit and implicit) to zero, then set the
- // explicit arguments to the values created above.
- std::memset(args, 0, args_size);
- kernel_args_t *kernel_args = reinterpret_cast<kernel_args_t *>(args);
- kernel_args->argc = argc;
- kernel_args->argv = dev_argv;
- kernel_args->envp = dev_envp;
- kernel_args->ret = dev_ret;
- kernel_args->inbox = server_outbox;
- kernel_args->outbox = server_inbox;
- kernel_args->buffer = buffer;
-
- // Obtain a packet from the queue.
- uint64_t packet_id = hsa_queue_add_write_index_relaxed(queue, 1);
- while (packet_id - hsa_queue_load_read_index_scacquire(queue) >= queue_size)
- ;
-
- const uint32_t mask = queue_size - 1;
- hsa_kernel_dispatch_packet_t *packet =
- (hsa_kernel_dispatch_packet_t *)queue->base_address + (packet_id & mask);
-
- // Set up the packet for exeuction on the device. We currently only launch
- // with one thread on the device, forcing the rest of the wavefront to be
- // masked off.
- std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
- packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
- (params.num_blocks_z * params.num_threads_z != 1))
- << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
- packet->workgroup_size_x = params.num_threads_x;
- packet->workgroup_size_y = params.num_threads_y;
- packet->workgroup_size_z = params.num_threads_z;
- packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
- packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
- packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
- packet->private_segment_size = private_size;
- packet->group_segment_size = group_size;
- packet->kernel_object = kernel;
- packet->kernarg_address = args;
+ // Initialize the RPC server's buffer for host-device communication.
+ server.reset(wavefront_size, &lock, server_inbox, server_outbox, buffer);
- // Create a signal to indicate when this packet has been completed.
+ // Obtain a queue with the minimum (power of two) size, used to send commands
+ // to the HSA runtime and launch execution on the device.
+ uint64_t queue_size;
+ if (hsa_status_t err = hsa_agent_get_info(
+ dev_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, &queue_size))
+ handle_error(err);
+ hsa_queue_t *queue = nullptr;
if (hsa_status_t err =
- hsa_signal_create(1, 0, nullptr, &packet->completion_signal))
+ hsa_queue_create(dev_agent, queue_size, HSA_QUEUE_TYPE_MULTI, nullptr,
+ nullptr, UINT32_MAX, UINT32_MAX, &queue))
handle_error(err);
- // Initialize the RPC server's buffer for host-device communication.
- server.reset(wavefront_size, &lock, server_inbox, server_outbox, buffer);
-
- // Initialize the packet header and set the doorbell signal to begin execution
- // by the HSA runtime.
- uint16_t header =
- (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
- (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_ACQUIRE_FENCE_SCOPE) |
- (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE);
- __atomic_store_n(&packet->header, header | (packet->setup << 16),
- __ATOMIC_RELEASE);
- hsa_signal_store_relaxed(queue->doorbell_signal, packet_id);
+ LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
+ begin_args_t init_args = {argc, dev_argv, dev_envp,
+ server_outbox, server_inbox, buffer};
+ if (hsa_status_t err =
+ launch_kernel(dev_agent, executable, kernargs_pool, queue,
+ single_threaded_params, "_begin.kd", init_args))
+ handle_error(err);
- // Wait until the kernel has completed execution on the device. Periodically
- // check the RPC client for work to be performed on the server.
- while (hsa_signal_wait_scacquire(
- packet->completion_signal, HSA_SIGNAL_CONDITION_EQ, 0,
- /*timeout_hint=*/1024, HSA_WAIT_STATE_ACTIVE) != 0)
- handle_server();
+ start_args_t args = {argc, dev_argv, dev_envp, dev_ret};
+ if (hsa_status_t err = launch_kernel(dev_agent, executable, kernargs_pool,
+ queue, params, "_start.kd", args))
+ handle_error(err);
// Create a memory signal and copy the return value back from the device into
// a new buffer.
@@ -402,9 +409,13 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
// Save the return value and perform basic clean-up.
int ret = *static_cast<int *>(host_ret);
- // Free the memory allocated for the device.
- if (hsa_status_t err = hsa_amd_memory_pool_free(args))
+ end_args_t fini_args = {ret};
+ if (hsa_status_t err =
+ launch_kernel(dev_agent, executable, kernargs_pool, queue,
+ single_threaded_params, "_end.kd", fini_args))
handle_error(err);
+
+ // Free the memory allocated for the device.
if (hsa_status_t err = hsa_amd_memory_pool_free(dev_argv))
handle_error(err);
if (hsa_status_t err = hsa_amd_memory_pool_free(dev_ret))
@@ -420,10 +431,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (hsa_status_t err = hsa_signal_destroy(memory_signal))
handle_error(err);
-
- if (hsa_status_t err = hsa_signal_destroy(packet->completion_signal))
- handle_error(err);
-
if (hsa_status_t err = hsa_queue_destroy(queue))
handle_error(err);
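
One detail worth calling out from the AMDGPU helper above: the kernarg segment size reported by HSA covers both the explicit arguments and any implicit arguments appended by the compiler, so the launcher zeroes the whole buffer and only copies the explicit struct over the front. A standalone sketch of that packing step (the helper name is ours, not part of this patch):

#include <cstdint>
#include <cstring>

// Zero the full kernarg segment (implicit arguments included), then copy the
// explicit argument struct to the front, mirroring launch_kernel above.
template <typename args_t>
void pack_kernargs(void *kernargs, uint32_t kernarg_segment_size,
                   const args_t &explicit_args) {
  std::memset(kernargs, 0, kernarg_segment_size);
  std::memcpy(kernargs, &explicit_args, sizeof(args_t));
}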
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index 77e6967dd022..ca18da939f4c 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -30,17 +30,6 @@
using namespace llvm;
using namespace object;
-/// The arguments to the '_start' kernel.
-struct kernel_args_t {
- int argc;
- void *argv;
- void *envp;
- void *ret;
- void *inbox;
- void *outbox;
- void *buffer;
-};
-
static void handle_error(CUresult err) {
if (err == CUDA_SUCCESS)
return;
@@ -170,6 +159,36 @@ Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
return dev_memory;
}
+template <typename args_t>
+CUresult launch_kernel(CUmodule binary, CUstream stream,
+ const LaunchParameters &params, const char *kernel_name,
+ args_t kernel_args) {
+ // Look up the named kernel in the loaded module.
+ CUfunction function;
+ if (CUresult err = cuModuleGetFunction(&function, binary, kernel_name))
+ handle_error(err);
+
+ // Set up the arguments to the kernel on the GPU.
+ uint64_t args_size = sizeof(args_t);
+ void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &kernel_args,
+ CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
+ CU_LAUNCH_PARAM_END};
+
+ // Call the kernel with the given arguments.
+ if (CUresult err = cuLaunchKernel(
+ function, params.num_blocks_x, params.num_blocks_y,
+ params.num_blocks_z, params.num_threads_x, params.num_threads_y,
+ params.num_threads_z, 0, stream, nullptr, args_config))
+ handle_error(err);
+
+ // Wait until the kernel has completed execution on the device. Periodically
+ // check the RPC client for work to be performed on the server.
+ while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
+ handle_server();
+
+ return CUDA_SUCCESS;
+}
+
int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters &params) {
@@ -197,11 +216,6 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuModuleLoadDataEx(&binary, image, 0, nullptr, nullptr))
handle_error(err);
- // look up the '_start' kernel in the loaded module.
- CUfunction function;
- if (CUresult err = cuModuleGetFunction(&function, binary, "_start"))
- handle_error(err);
-
// Allocate pinned memory on the host to hold the pointer array for the
// copied argv and allow the GPU device to access it.
auto allocator = [&](uint64_t size) -> void * {
@@ -242,35 +256,21 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (!server_inbox || !server_outbox || !buffer)
handle_error("Failed to allocate memory the RPC client / server.");
- // Set up the arguments to the '_start' kernel on the GPU.
- uint64_t args_size = sizeof(kernel_args_t);
- kernel_args_t args;
- std::memset(&args, 0, args_size);
- args.argc = argc;
- args.argv = dev_argv;
- args.envp = dev_envp;
- args.ret = reinterpret_cast<void *>(dev_ret);
- args.inbox = server_outbox;
- args.outbox = server_inbox;
- args.buffer = buffer;
- void *args_config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
- CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
- CU_LAUNCH_PARAM_END};
-
// Initialize the RPC server's buffer for host-device communication.
server.reset(warp_size, &lock, server_inbox, server_outbox, buffer);
- // Call the kernel with the given arguments.
- if (CUresult err = cuLaunchKernel(
- function, params.num_blocks_x, params.num_blocks_y,
- params.num_blocks_z, params.num_threads_x, params.num_threads_y,
- params.num_threads_z, 0, stream, nullptr, args_config))
+ LaunchParameters single_threaded_params = {1, 1, 1, 1, 1, 1};
+ // Call the '_begin' kernel to initialize global state and run the constructors.
+ begin_args_t init_args = {argc, dev_argv, dev_envp,
+ server_outbox, server_inbox, buffer};
+ if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
+ "_begin", init_args))
handle_error(err);
- // Wait until the kernel has completed execution on the device. Periodically
- // check the RPC client for work to be performed on the server.
- while (cuStreamQuery(stream) == CUDA_ERROR_NOT_READY)
- handle_server();
+ start_args_t args = {argc, dev_argv, dev_envp,
+ reinterpret_cast<void *>(dev_ret)};
+ if (CUresult err = launch_kernel(binary, stream, params, "_start", args))
+ handle_error(err);
// Copy the return value back from the kernel and wait.
int host_ret = 0;
@@ -280,6 +280,11 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
if (CUresult err = cuStreamSynchronize(stream))
handle_error(err);
+ end_args_t fini_args = {host_ret};
+ if (CUresult err = launch_kernel(binary, stream, single_threaded_params,
+ "_end", fini_args))
+ handle_error(err);
+
// Free the memory allocated for the device.
if (CUresult err = cuMemFreeHost(*memory_or_err))
handle_error(err);