diff options
author | Joseph Huber <jhuber6@vols.utk.edu> | 2023-04-17 15:08:59 -0500 |
---|---|---|
committer | Joseph Huber <jhuber6@vols.utk.edu> | 2023-04-19 08:01:58 -0500 |
commit | bc11bb3e26e98b167737cee94ca23a6fb5a40881 (patch) | |
tree | 92c9f377ddadad8d942c987f86eedfd857f9032f /libc/utils | |
parent | 814dfb016aad7ceae2e3fda19659e0bb20f10464 (diff) | |
download | llvm-bc11bb3e26e98b167737cee94ca23a6fb5a40881.tar.gz |
[libc] Add the '--threads' and '--blocks' option to the GPU loaders
We will want to test the GPU `libc` with multiple threads in the future.
This patch adds the `--threads` and `--blocks` option to set the `x`
dimension of the kernel. Using CUDA terminology instead of OpenCL for
familiarity.
Depends on D148288 D148342
Reviewed By: jdoerfert, sivachandra, tra
Differential Revision: https://reviews.llvm.org/D148485
Diffstat (limited to 'libc/utils')
-rw-r--r-- | libc/utils/gpu/loader/Loader.h | 13 | ||||
-rw-r--r-- | libc/utils/gpu/loader/Main.cpp | 58 | ||||
-rw-r--r-- | libc/utils/gpu/loader/amdgpu/Loader.cpp | 19 | ||||
-rw-r--r-- | libc/utils/gpu/loader/nvptx/Loader.cpp | 11 |
4 files changed, 82 insertions, 19 deletions
diff --git a/libc/utils/gpu/loader/Loader.h b/libc/utils/gpu/loader/Loader.h index 2c3059307584..9c6413ee45d8 100644 --- a/libc/utils/gpu/loader/Loader.h +++ b/libc/utils/gpu/loader/Loader.h @@ -13,10 +13,21 @@ #include <cstring> #include <stddef.h> +/// Generic launch parameters for configuration the number of blocks / threads. +struct LaunchParameters { + uint32_t num_threads_x; + uint32_t num_threads_y; + uint32_t num_threads_z; + uint32_t num_blocks_x; + uint32_t num_blocks_y; + uint32_t num_blocks_z; +}; + /// Generic interface to load the \p image and launch execution of the _start /// kernel on the target device. Copies \p argc and \p argv to the device. /// Returns the final value of the `main` function on the device. -int load(int argc, char **argv, char **evnp, void *image, size_t size); +int load(int argc, char **argv, char **evnp, void *image, size_t size, + const LaunchParameters ¶ms); /// Copy the system's argument vector to GPU memory allocated using \p alloc. template <typename Allocator> diff --git a/libc/utils/gpu/loader/Main.cpp b/libc/utils/gpu/loader/Main.cpp index 00354720dda9..b711ec91c9f3 100644 --- a/libc/utils/gpu/loader/Main.cpp +++ b/libc/utils/gpu/loader/Main.cpp @@ -15,21 +15,69 @@ #include <cstdio> #include <cstdlib> +#include <string> +#include <vector> int main(int argc, char **argv, char **envp) { if (argc < 2) { - printf("USAGE: ./loader <device_image> <args>, ...\n"); + printf("USAGE: ./loader [--threads <n>, --blocks <n>] <device_image> " + "<args>, ...\n"); return EXIT_SUCCESS; } - // TODO: We should perform some validation on the file. - FILE *file = fopen(argv[1], "r"); + int offset = 0; + FILE *file = nullptr; + char *ptr; + LaunchParameters params = {1, 1, 1, 1, 1, 1}; + while (!file && ++offset < argc) { + if (argv[offset] == std::string("--threads") || + argv[offset] == std::string("--threads-x")) { + params.num_threads_x = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else if (argv[offset] == std::string("--threads-y")) { + params.num_threads_y = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else if (argv[offset] == std::string("--threads-z")) { + params.num_threads_z = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else if (argv[offset] == std::string("--blocks") || + argv[offset] == std::string("--blocks-x")) { + params.num_blocks_x = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else if (argv[offset] == std::string("--blocks-y")) { + params.num_blocks_y = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else if (argv[offset] == std::string("--blocks-z")) { + params.num_blocks_z = + offset + 1 < argc ? strtoul(argv[offset + 1], &ptr, 10) : 1; + offset++; + continue; + } else { + file = fopen(argv[offset], "r"); + if (!file) { + fprintf(stderr, "Failed to open image file '%s'\n", argv[offset]); + return EXIT_FAILURE; + } + break; + } + } if (!file) { - fprintf(stderr, "Failed to open image file %s\n", argv[1]); + fprintf(stderr, "No image file provided\n"); return EXIT_FAILURE; } + // TODO: We should perform some validation on the file. fseek(file, 0, SEEK_END); const auto size = ftell(file); fseek(file, 0, SEEK_SET); @@ -39,7 +87,7 @@ int main(int argc, char **argv, char **envp) { fclose(file); // Drop the loader from the program arguments. - int ret = load(argc - 1, &argv[1], envp, image, size); + int ret = load(argc - offset, &argv[offset], envp, image, size, params); free(image); return ret; diff --git a/libc/utils/gpu/loader/amdgpu/Loader.cpp b/libc/utils/gpu/loader/amdgpu/Loader.cpp index 87dd3ce48d82..54e6caf81e2d 100644 --- a/libc/utils/gpu/loader/amdgpu/Loader.cpp +++ b/libc/utils/gpu/loader/amdgpu/Loader.cpp @@ -170,7 +170,8 @@ hsa_status_t get_agent_memory_pool(hsa_agent_t agent, return iterate_agent_memory_pools(agent, cb); } -int load(int argc, char **argv, char **envp, void *image, size_t size) { +int load(int argc, char **argv, char **envp, void *image, size_t size, + const LaunchParameters ¶ms) { // Initialize the HSA runtime used to communicate with the device. if (hsa_status_t err = hsa_init()) handle_error(err); @@ -355,13 +356,15 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) { // with one thread on the device, forcing the rest of the wavefront to be // masked off. std::memset(packet, 0, sizeof(hsa_kernel_dispatch_packet_t)); - packet->setup = 1 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; - packet->workgroup_size_x = 1; - packet->workgroup_size_y = 1; - packet->workgroup_size_z = 1; - packet->grid_size_x = 1; - packet->grid_size_y = 1; - packet->grid_size_z = 1; + packet->setup = (1 + (params.num_blocks_y * params.num_threads_y != 1) + + (params.num_blocks_z * params.num_threads_z != 1)) + << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; + packet->workgroup_size_x = params.num_threads_x; + packet->workgroup_size_y = params.num_threads_y; + packet->workgroup_size_z = params.num_threads_z; + packet->grid_size_x = params.num_blocks_x * params.num_threads_x; + packet->grid_size_y = params.num_blocks_y * params.num_threads_y; + packet->grid_size_z = params.num_blocks_z * params.num_threads_z; packet->private_segment_size = private_size; packet->group_segment_size = group_size; packet->kernel_object = kernel; diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp index ed8b8d018c6a..15ff11a3bd80 100644 --- a/libc/utils/gpu/loader/nvptx/Loader.cpp +++ b/libc/utils/gpu/loader/nvptx/Loader.cpp @@ -76,7 +76,8 @@ static void handle_error(const char *msg) { exit(EXIT_FAILURE); } -int load(int argc, char **argv, char **envp, void *image, size_t size) { +int load(int argc, char **argv, char **envp, void *image, size_t size, + const LaunchParameters ¶ms) { if (CUresult err = cuInit(0)) handle_error(err); @@ -157,10 +158,10 @@ int load(int argc, char **argv, char **envp, void *image, size_t size) { server.reset(server_inbox, server_outbox, buffer); // Call the kernel with the given arguments. - if (CUresult err = - cuLaunchKernel(function, /*gridDimX=*/1, /*gridDimY=*/1, - /*gridDimZ=*/1, /*blockDimX=*/1, /*blockDimY=*/1, - /*bloackDimZ=*/1, 0, stream, nullptr, args_config)) + if (CUresult err = cuLaunchKernel( + function, params.num_blocks_x, params.num_blocks_y, + params.num_blocks_z, params.num_threads_x, params.num_threads_y, + params.num_threads_z, 0, stream, nullptr, args_config)) handle_error(err); // Wait until the kernel has completed execution on the device. Periodically |