summaryrefslogtreecommitdiff
path: root/libc/utils
diff options
context:
space:
mode:
authorJoseph Huber <jhuber6@vols.utk.edu>2023-04-17 15:08:59 -0500
committerJoseph Huber <jhuber6@vols.utk.edu>2023-04-19 08:01:58 -0500
commitbc11bb3e26e98b167737cee94ca23a6fb5a40881 (patch)
tree92c9f377ddadad8d942c987f86eedfd857f9032f /libc/utils
parent814dfb016aad7ceae2e3fda19659e0bb20f10464 (diff)
downloadllvm-bc11bb3e26e98b167737cee94ca23a6fb5a40881.tar.gz
[libc] Add the '--threads' and '--blocks' option to the GPU loaders
We will want to test the GPU `libc` with multiple threads in the future. This patch adds the `--threads` and `--blocks` option to set the `x` dimension of the kernel. Using CUDA terminology instead of OpenCL for familiarity. Depends on D148288 D148342 Reviewed By: jdoerfert, sivachandra, tra Differential Revision: https://reviews.llvm.org/D148485
Diffstat (limited to 'libc/utils')
-rw-r--r--libc/utils/gpu/loader/Loader.h13
-rw-r--r--libc/utils/gpu/loader/Main.cpp58
-rw-r--r--libc/utils/gpu/loader/amdgpu/Loader.cpp19
-rw-r--r--libc/utils/gpu/loader/nvptx/Loader.cpp11
4 files changed, 82 insertions, 19 deletions
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h
index 2c3059307584..9c6413ee45d8 100644
--- a/libc/utils/gpu/loader/Loader.h
+++ b/libc/utils/gpu/loader/Loader.h
@@ -13,10 +13,21 @@
#include <cstring>
#include <stddef.h>
+/// Generic launch parameters for configuration the number of blocks / threads.
+struct LaunchParameters {
+ uint32_t num_threads_x;
+ uint32_t num_threads_y;
+ uint32_t num_threads_z;
+ uint32_t num_blocks_x;
+ uint32_t num_blocks_y;
+ uint32_t num_blocks_z;
+};
+
/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
/// Returns the final value of the `main` function on the device.
-int load(int argc, char **argv, char **evnp, void *image, size_t size);
+int load(int argc, char **argv, char **evnp, void *image, size_t size,
+ const LaunchParameters &params);
/// Copy the system's argument vector to GPU memory allocated using \p alloc.
template <typename Allocator>
diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp
index 00354720dda9..b711ec91c9f3 100644
--- a/libc/utils/gpu/loader/Main.cpp
+++ b/libc/utils/gpu/loader/Main.cpp
@@ -15,21 +15,69 @@
#include <cstdio>
#include <cstdlib>
+#include <string>
+#include <vector>
int main(int argc, char **argv, char **envp) {
if (argc < 2) {
- printf("USAGE: ./loader <device_image> <args>, ...\n");
+ printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> "
+ "<args>, ...\n");
return EXIT_SUCCESS;
}
- // TODO: We should perform some validation on the file.
- FILE *file = fopen(argv[1], "r");
+ int offset = 0;
+ FILE *file = nullptr;
+ char *ptr;
+ LaunchParameters params = {1, 1, 1, 1, 1, 1};
+ while (!file && ++offset < argc) {
+ if (argv[offset] == std::string("--threads") ||
+ argv[offset] == std::string("--threads-x")) {
+ params.num_threads_x =
+ offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+ offset++;
+ continue;
+ } else if (argv[offset] == std::string("--threads-y")) {
+ params.num_threads_y =
+ offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+ offset++;
+ continue;
+ } else if (argv[offset] == std::string("--threads-z")) {
+ params.num_threads_z =
+ offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+ offset++;
+ continue;
+ } else if (argv[offset] == std::string("--blocks") ||
+ argv[offset] == std::string("--blocks-x")) {
+ params.num_blocks_x =
+ offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+ offset++;
+ continue;
+ } else if (argv[offset] == std::string("--blocks-y")) {
+ params.num_blocks_y =
+ offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+ offset++;
+ continue;
+ } else if (argv[offset] == std::string("--blocks-z")) {
+ params.num_blocks_z =
+ offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1;
+ offset++;
+ continue;
+ } else {
+ file = fopen(argv[offset], "r");
+ if (!file) {
+ fprintf(stderr, "Failed to open image file '%s'\n", argv[offset]);
+ return EXIT_FAILURE;
+ }
+ break;
+ }
+ }
if (!file) {
- fprintf(stderr, "Failed to open image file %s\n", argv[1]);
+ fprintf(stderr, "No image file provided\n");
return EXIT_FAILURE;
}
+ // TODO: We should perform some validation on the file.
fseek(file, 0, SEEK_END);
const auto size = ftell(file);
fseek(file, 0, SEEK_SET);
@@ -39,7 +87,7 @@ int main(int argc, char **argv, char **envp) {
fclose(file);
// Drop the loader from the program arguments.
- int ret = load(argc - 1, &argv[1], envp, image, size);
+ int ret = load(argc - offset, &argv[offset], envp, image, size, params);
free(image);
return ret;
diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp
index 87dd3ce48d82..54e6caf81e2d 100644
--- a/libc/utils/gpu/loader/amdgpu/Loader.cpp
+++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp
@@ -170,7 +170,8 @@ hsa_status_t get_agent_memory_pool(hsa_agent_t agent,
return iterate_agent_memory_pools(agent, cb);
}
-int load(int argc, char **argv, char **envp, void *image, size_t size) {
+int load(int argc, char **argv, char **envp, void *image, size_t size,
+ const LaunchParameters &params) {
// Initialize the HSA runtime used to communicate with the device.
if (hsa_status_t err = hsa_init())
handle_error(err);
@@ -355,13 +356,15 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
// with one thread on the device, forcing the rest of the wavefront to be
// masked off.
std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t));
- packet->setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
- packet->workgroup_size_x = 1;
- packet->workgroup_size_y = 1;
- packet->workgroup_size_z = 1;
- packet->grid_size_x = 1;
- packet->grid_size_y = 1;
- packet->grid_size_z = 1;
+ packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) +
+ (params.num_blocks_z * params.num_threads_z != 1))
+ << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
+ packet->workgroup_size_x = params.num_threads_x;
+ packet->workgroup_size_y = params.num_threads_y;
+ packet->workgroup_size_z = params.num_threads_z;
+ packet->grid_size_x = params.num_blocks_x * params.num_threads_x;
+ packet->grid_size_y = params.num_blocks_y * params.num_threads_y;
+ packet->grid_size_z = params.num_blocks_z * params.num_threads_z;
packet->private_segment_size = private_size;
packet->group_segment_size = group_size;
packet->kernel_object = kernel;
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index ed8b8d018c6a..15ff11a3bd80 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -76,7 +76,8 @@ static void handle_error(const char *msg) {
exit(EXIT_FAILURE);
}
-int load(int argc, char **argv, char **envp, void *image, size_t size) {
+int load(int argc, char **argv, char **envp, void *image, size_t size,
+ const LaunchParameters &params) {
if (CUresult err = cuInit(0))
handle_error(err);
@@ -157,10 +158,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) {
server.reset(server_inbox, server_outbox, buffer);
// Call the kernel with the given arguments.
- if (CUresult err =
- cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1,
- /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1,
- /*bloackDimZ=*/1, 0, stream, nullptr, args_config))
+ if (CUresult err = cuLaunchKernel(
+ function, params.num_blocks_x, params.num_blocks_y,
+ params.num_blocks_z, params.num_threads_x, params.num_threads_y,
+ params.num_threads_z, 0, stream, nullptr, args_config))
handle_error(err);
// Wait until the kernel has completed execution on the device. Periodically