diff options
Diffstat (limited to 'libc/startup/gpu/nvptx/start.cpp')
-rw-r--r-- | libc/startup/gpu/nvptx/start.cpp | 78 |
1 files changed, 23 insertions, 55 deletions
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp index 9ed755987a5d..83453ae1e47a 100644 --- a/libc/startup/gpu/nvptx/start.cpp +++ b/libc/startup/gpu/nvptx/start.cpp @@ -17,8 +17,6 @@ namespace __llvm_libc { static cpp::Atomic<uint32_t> lock = 0; -static cpp::Atomic<uint32_t> count = 0; - extern "C" { // Nvidia's 'nvlink' linker does not provide these symbols. We instead need // to manually create them and update the globals in the loader implememtation. @@ -31,10 +29,6 @@ uintptr_t *__fini_array_end [[gnu::visibility("protected")]]; using InitCallback = void(int, char **, char **); using FiniCallback = void(void); -static uint64_t get_grid_size() { - return gpu::get_num_threads() * gpu::get_num_blocks(); -} - static void call_init_array_callbacks(int argc, char **argv, char **env) { size_t init_array_size = __init_array_end - __init_array_start; for (size_t i = 0; i < init_array_size; ++i) @@ -47,59 +41,33 @@ static void call_fini_array_callbacks() { reinterpret_cast<FiniCallback *>(__fini_array_start[i])(); } -// TODO: Put this in a separate kernel and call it with one thread. -void initialize(int argc, char **argv, char **env, void *in, void *out, - void *buffer) { - // We need a single GPU thread to perform the initialization of the global - // constructors and data. We simply mask off all but a single thread and - // execute. - count.fetch_add(1, cpp::MemoryOrder::RELAXED); - if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) { - // We need to set up the RPC client first in case any of the constructors - // require it. - rpc::client.reset(gpu::get_lane_size(), &lock, in, out, buffer); - - // We want the fini array callbacks to be run after other atexit - // callbacks are run. So, we register them before running the init - // array callbacks as they can potentially register their own atexit - // callbacks. - // FIXME: The function pointer escaping this TU causes warnings. 
- __llvm_libc::atexit(&call_fini_array_callbacks); - call_init_array_callbacks(argc, argv, env); - } - - // We wait until every single thread launched on the GPU has seen the - // initialization code. This will get very, very slow for high thread counts, - // but for testing purposes it is unlikely to matter. - while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size()) - rpc::sleep_briefly(); - gpu::sync_threads(); -} - -// TODO: Put this in a separate kernel and call it with one thread. -void finalize(int retval) { - // We wait until every single thread launched on the GPU has finished - // executing and reached the finalize region. - count.fetch_sub(1, cpp::MemoryOrder::RELAXED); - while (count.load(cpp::MemoryOrder::RELAXED) != 0) - rpc::sleep_briefly(); - gpu::sync_threads(); - if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) { - // Only a single thread should call `exit` here, the rest should gracefully - // return from the kernel. This is so only one thread calls the destructors - // registred with 'atexit' above. - __llvm_libc::exit(retval); - } -} - } // namespace __llvm_libc extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void -_start(int argc, char **argv, char **envp, int *ret, void *in, void *out, - void *buffer) { - __llvm_libc::initialize(argc, argv, envp, in, out, buffer); +_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) { + // We need to set up the RPC client first in case any of the constructors + // require it. + __llvm_libc::rpc::client.reset(__llvm_libc::gpu::get_lane_size(), + &__llvm_libc::lock, in, out, buffer); + + // We want the fini array callbacks to be run after other atexit + // callbacks are run. So, we register them before running the init + // array callbacks as they can potentially register their own atexit + // callbacks. 
+ __llvm_libc::atexit(&__llvm_libc::call_fini_array_callbacks); + __llvm_libc::call_init_array_callbacks(argc, argv, env); +} +extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void +_start(int argc, char **argv, char **envp, int *ret) { + // Invoke the 'main' function with every active thread that the user launched + // the _start kernel with. __atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED); +} - __llvm_libc::finalize(*ret); +extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void +_end(int retval) { + // To finish the execution we invoke all the callbacks registered via 'atexit' + // and then exit with the appropriate return value. + __llvm_libc::exit(retval); } |