summaryrefslogtreecommitdiff
path: root/libc/startup/gpu/nvptx/start.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'libc/startup/gpu/nvptx/start.cpp')
-rw-r--r--libc/startup/gpu/nvptx/start.cpp78
1 files changed, 23 insertions, 55 deletions
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 9ed755987a5d..83453ae1e47a 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -17,8 +17,6 @@ namespace __llvm_libc {
static cpp::Atomic<uint32_t> lock = 0;
-static cpp::Atomic<uint32_t> count = 0;
-
extern "C" {
// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
// to manually create them and update the globals in the loader implementation.
@@ -31,10 +29,6 @@ uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
using InitCallback = void(int, char **, char **);
using FiniCallback = void(void);
-static uint64_t get_grid_size() {
- return gpu::get_num_threads() * gpu::get_num_blocks();
-}
-
static void call_init_array_callbacks(int argc, char **argv, char **env) {
size_t init_array_size = __init_array_end - __init_array_start;
for (size_t i = 0; i < init_array_size; ++i)
@@ -47,59 +41,33 @@ static void call_fini_array_callbacks() {
reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
}
-// TODO: Put this in a separate kernel and call it with one thread.
-void initialize(int argc, char **argv, char **env, void *in, void *out,
- void *buffer) {
- // We need a single GPU thread to perform the initialization of the global
- // constructors and data. We simply mask off all but a single thread and
- // execute.
- count.fetch_add(1, cpp::MemoryOrder::RELAXED);
- if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
- // We need to set up the RPC client first in case any of the constructors
- // require it.
- rpc::client.reset(gpu::get_lane_size(), &lock, in, out, buffer);
-
- // We want the fini array callbacks to be run after other atexit
- // callbacks are run. So, we register them before running the init
- // array callbacks as they can potentially register their own atexit
- // callbacks.
- // FIXME: The function pointer escaping this TU causes warnings.
- __llvm_libc::atexit(&call_fini_array_callbacks);
- call_init_array_callbacks(argc, argv, env);
- }
-
- // We wait until every single thread launched on the GPU has seen the
- // initialization code. This will get very, very slow for high thread counts,
- // but for testing purposes it is unlikely to matter.
- while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
- rpc::sleep_briefly();
- gpu::sync_threads();
-}
-
-// TODO: Put this in a separate kernel and call it with one thread.
-void finalize(int retval) {
- // We wait until every single thread launched on the GPU has finished
- // executing and reached the finalize region.
- count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
- while (count.load(cpp::MemoryOrder::RELAXED) != 0)
- rpc::sleep_briefly();
- gpu::sync_threads();
- if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
- // Only a single thread should call `exit` here, the rest should gracefully
- // return from the kernel. This is so only one thread calls the destructors
- // registred with 'atexit' above.
- __llvm_libc::exit(retval);
- }
-}
-
} // namespace __llvm_libc
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
-_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
- void *buffer) {
- __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
+_begin(int argc, char **argv, char **env, void *in, void *out, void *buffer) {
+ // We need to set up the RPC client first in case any of the constructors
+ // require it.
+ __llvm_libc::rpc::client.reset(__llvm_libc::gpu::get_lane_size(),
+ &__llvm_libc::lock, in, out, buffer);
+
+ // We want the fini array callbacks to be run after other atexit
+ // callbacks are run. So, we register them before running the init
+ // array callbacks as they can potentially register their own atexit
+ // callbacks.
+ __llvm_libc::atexit(&__llvm_libc::call_fini_array_callbacks);
+ __llvm_libc::call_init_array_callbacks(argc, argv, env);
+}
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
+_start(int argc, char **argv, char **envp, int *ret) {
+ // Invoke the 'main' function with every active thread that the user launched
+ // the _start kernel with.
__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+}
- __llvm_libc::finalize(*ret);
+extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
+_end(int retval) {
+ // To finish the execution we invoke all the callbacks registered via 'atexit'
+ // and then exit with the appropriate return value.
+ __llvm_libc::exit(retval);
}