summaryrefslogtreecommitdiff
path: root/libc
diff options
context:
space:
mode:
authorJoseph Huber <jhuber6@vols.utk.edu>2023-04-28 09:33:44 -0500
committerJoseph Huber <jhuber6@vols.utk.edu>2023-05-04 07:13:00 -0500
commit2e1c0ec6297958f73ca5ed35ce47803ea0f48dba (patch)
tree5ecac614701cb678d6c7623dd9d32b705c7af286 /libc
parentf05ce9045af4a40232c08451cb0aef64b0e673b2 (diff)
downloadllvm-2e1c0ec6297958f73ca5ed35ce47803ea0f48dba.tar.gz
[libc] Support global constructors and destructors on NVPTX
This patch adds the necessary hacks to support global constructors and destructors. This is an incredibly hacky process caused by the primary fact that Nvidia does not provide any binary tools and very little linker support. We first had to emit references to these functions and their priority in D149451. Then we dig them out of the module once it's loaded to manually create the list that the linker should have made for us. This patch also contains a few Nvidia specific hacks, but it passes the test, albeit with a stack size warning from `ptxas` for the callback. But this should be fine given the resource usage of a common test. This also adds a dependency on LLVM to the NVPTX loader, which hopefully doesn't cause problems with our CUDA buildbot. Depends on D149451 Reviewed By: tra Differential Revision: https://reviews.llvm.org/D149527
Diffstat (limited to 'libc')
-rw-r--r--libc/cmake/modules/LLVMLibCTestRules.cmake6
-rw-r--r--libc/startup/gpu/nvptx/CMakeLists.txt2
-rw-r--r--libc/startup/gpu/nvptx/start.cpp78
-rw-r--r--libc/test/IntegrationTest/test.cpp4
-rw-r--r--libc/test/integration/startup/gpu/CMakeLists.txt15
-rw-r--r--libc/test/integration/startup/gpu/init_fini_array_test.cpp2
-rw-r--r--libc/utils/gpu/loader/CMakeLists.txt4
-rw-r--r--libc/utils/gpu/loader/nvptx/CMakeLists.txt6
-rw-r--r--libc/utils/gpu/loader/nvptx/Loader.cpp128
9 files changed, 222 insertions, 23 deletions
diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index b9c05a1d0c8d..a3a34136604d 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -497,12 +497,12 @@ function(add_integration_test test_name)
# The GPU build requires overriding the default CMake triple and architecture.
if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
target_compile_options(${fq_build_target_name} PRIVATE
- -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE} -flto
- --target=${LIBC_GPU_TARGET_TRIPLE})
+ -mcpu=${LIBC_GPU_TARGET_ARCHITECTURE}
+ -flto --target=${LIBC_GPU_TARGET_TRIPLE})
elseif(LIBC_GPU_TARGET_ARCHITECTURE_IS_NVPTX)
get_nvptx_compile_options(nvptx_options ${LIBC_GPU_TARGET_ARCHITECTURE})
target_compile_options(${fq_build_target_name} PRIVATE
- ${nvptx_options}
+ ${nvptx_options} -fno-use-cxa-atexit
--target=${LIBC_GPU_TARGET_TRIPLE})
endif()
diff --git a/libc/startup/gpu/nvptx/CMakeLists.txt b/libc/startup/gpu/nvptx/CMakeLists.txt
index b8a9f49d5be5..49fa489c6129 100644
--- a/libc/startup/gpu/nvptx/CMakeLists.txt
+++ b/libc/startup/gpu/nvptx/CMakeLists.txt
@@ -6,6 +6,8 @@ add_startup_object(
DEPENDS
libc.src.__support.RPC.rpc_client
libc.src.__support.GPU.utils
+ libc.src.stdlib.exit
+ libc.src.stdlib.atexit
COMPILE_OPTIONS
-ffreestanding # To avoid compiler warnings about calling the main function.
-fno-builtin
diff --git a/libc/startup/gpu/nvptx/start.cpp b/libc/startup/gpu/nvptx/start.cpp
index 7b88e30f7f37..fe09666a33de 100644
--- a/libc/startup/gpu/nvptx/start.cpp
+++ b/libc/startup/gpu/nvptx/start.cpp
@@ -8,6 +8,8 @@
#include "src/__support/GPU/utils.h"
#include "src/__support/RPC/rpc_client.h"
+#include "src/stdlib/atexit.h"
+#include "src/stdlib/exit.h"
extern "C" int main(int argc, char **argv, char **envp);
@@ -15,21 +17,79 @@ namespace __llvm_libc {
static cpp::Atomic<uint32_t> lock = 0;
-static cpp::Atomic<uint32_t> init = 0;
+static cpp::Atomic<uint32_t> count = 0;
-void init_rpc(void *in, void *out, void *buffer) {
- // Only a single thread should update the RPC data.
+extern "C" {
+// Nvidia's 'nvlink' linker does not provide these symbols. We instead need
+// to manually create them and update the globals in the loader implementation.
+uintptr_t *__init_array_start [[gnu::visibility("protected")]];
+uintptr_t *__init_array_end [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_start [[gnu::visibility("protected")]];
+uintptr_t *__fini_array_end [[gnu::visibility("protected")]];
+}
+
+using InitCallback = void(int, char **, char **);
+using FiniCallback = void(void);
+
+static uint64_t get_grid_size() {
+ return gpu::get_num_threads() * gpu::get_num_blocks();
+}
+
+static void call_init_array_callbacks(int argc, char **argv, char **env) {
+ size_t init_array_size = __init_array_end - __init_array_start;
+ for (size_t i = 0; i < init_array_size; ++i)
+ reinterpret_cast<InitCallback *>(__init_array_start[i])(argc, argv, env);
+}
+
+static void call_fini_array_callbacks() {
+ size_t fini_array_size = __fini_array_end - __fini_array_start;
+ for (size_t i = 0; i < fini_array_size; ++i)
+ reinterpret_cast<FiniCallback *>(__fini_array_start[i])();
+}
+
+// TODO: Put this in a separate kernel and call it with one thread.
+void initialize(int argc, char **argv, char **env, void *in, void *out,
+ void *buffer) {
+ // We need a single GPU thread to perform the initialization of the global
+ // constructors and data. We simply mask off all but a single thread and
+ // execute.
+ count.fetch_add(1, cpp::MemoryOrder::RELAXED);
if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+ // We need to set up the RPC client first in case any of the constructors
+ // require it.
rpc::client.reset(&lock, in, out, buffer);
- init.store(1, cpp::MemoryOrder::RELAXED);
+
+ // We want the fini array callbacks to be run after other atexit
+ // callbacks are run. So, we register them before running the init
+ // array callbacks as they can potentially register their own atexit
+ // callbacks.
+ // FIXME: The function pointer escaping this TU causes warnings.
+ __llvm_libc::atexit(&call_fini_array_callbacks);
+ call_init_array_callbacks(argc, argv, env);
}
- // Wait until the previous thread signals that the data has been written.
- while (!init.load(cpp::MemoryOrder::RELAXED))
+ // We wait until every single thread launched on the GPU has seen the
+ // initialization code. This will get very, very slow for high thread counts,
+ // but for testing purposes it is unlikely to matter.
+ while (count.load(cpp::MemoryOrder::RELAXED) != get_grid_size())
rpc::sleep_briefly();
+ gpu::sync_threads();
+}
- // Wait for the threads in the block to converge and fence the write.
+// TODO: Put this in a separate kernel and call it with one thread.
+void finalize(int retval) {
+ // We wait until every single thread launched on the GPU has finished
+ // executing and reached the finalize region.
+ count.fetch_sub(1, cpp::MemoryOrder::RELAXED);
+ while (count.load(cpp::MemoryOrder::RELAXED) != 0)
+ rpc::sleep_briefly();
gpu::sync_threads();
+ if (gpu::get_thread_id() == 0 && gpu::get_block_id() == 0) {
+ // Only a single thread should call `exit` here, the rest should gracefully
+ // return from the kernel. This is so only one thread calls the destructors
+  // registered with 'atexit' above.
+ __llvm_libc::exit(retval);
+ }
}
} // namespace __llvm_libc
@@ -37,7 +97,9 @@ void init_rpc(void *in, void *out, void *buffer) {
extern "C" [[gnu::visibility("protected"), clang::nvptx_kernel]] void
_start(int argc, char **argv, char **envp, int *ret, void *in, void *out,
void *buffer) {
- __llvm_libc::init_rpc(in, out, buffer);
+ __llvm_libc::initialize(argc, argv, envp, in, out, buffer);
__atomic_fetch_or(ret, main(argc, argv, envp), __ATOMIC_RELAXED);
+
+ __llvm_libc::finalize(*ret);
}
diff --git a/libc/test/IntegrationTest/test.cpp b/libc/test/IntegrationTest/test.cpp
index 4d2a7f08cc06..e86e0a8d22c8 100644
--- a/libc/test/IntegrationTest/test.cpp
+++ b/libc/test/IntegrationTest/test.cpp
@@ -22,6 +22,7 @@ int memcmp(const void *lhs, const void *rhs, size_t count);
void *memcpy(void *__restrict, const void *__restrict, size_t);
void *memmove(void *dst, const void *src, size_t count);
void *memset(void *ptr, int value, size_t count);
+int atexit(void (*func)(void));
} // namespace __llvm_libc
@@ -44,6 +45,9 @@ void *memset(void *ptr, int value, size_t count) {
return __llvm_libc::memset(ptr, value, count);
}
+// This is needed if the test was compiled with '-fno-use-cxa-atexit'.
+int atexit(void (*func)(void)) { return __llvm_libc::atexit(func); }
+
} // extern "C"
// Integration tests cannot use the SCUDO standalone allocator as SCUDO pulls
diff --git a/libc/test/integration/startup/gpu/CMakeLists.txt b/libc/test/integration/startup/gpu/CMakeLists.txt
index ab3f4c39fe48..754f36d8789c 100644
--- a/libc/test/integration/startup/gpu/CMakeLists.txt
+++ b/libc/test/integration/startup/gpu/CMakeLists.txt
@@ -26,12 +26,9 @@ add_integration_test(
--threads 1
)
-# Constructors are currently only supported on AMDGPU.
-if(LIBC_GPU_TARGET_ARCHITECTURE_IS_AMDGPU)
- add_integration_test(
- init_fini_array_test
- SUITE libc-startup-tests
- SRCS
- init_fini_array_test.cpp
- )
-endif()
+add_integration_test(
+ init_fini_array_test
+ SUITE libc-startup-tests
+ SRCS
+ init_fini_array_test.cpp
+)
diff --git a/libc/test/integration/startup/gpu/init_fini_array_test.cpp b/libc/test/integration/startup/gpu/init_fini_array_test.cpp
index 23064e1e85aa..1e61711f0fc4 100644
--- a/libc/test/integration/startup/gpu/init_fini_array_test.cpp
+++ b/libc/test/integration/startup/gpu/init_fini_array_test.cpp
@@ -53,7 +53,7 @@ __attribute__((destructor(1))) void reset_initval() {
initval = 0;
}
-TEST_MAIN() {
+TEST_MAIN(int argc, char **argv, char **env) {
ASSERT_EQ(global.get(GLOBAL_INDEX), INITVAL_INITIALIZER);
ASSERT_EQ(initval, INITVAL_INITIALIZER);
return 0;
diff --git a/libc/utils/gpu/loader/CMakeLists.txt b/libc/utils/gpu/loader/CMakeLists.txt
index 3f63ef0bc90e..689cf086b476 100644
--- a/libc/utils/gpu/loader/CMakeLists.txt
+++ b/libc/utils/gpu/loader/CMakeLists.txt
@@ -12,7 +12,9 @@ else()
endif()
find_package(CUDAToolkit QUIET)
-if(CUDAToolkit_FOUND)
+# The CUDA loader requires LLVM to traverse the ELF image for symbols.
+find_package(LLVM QUIET)
+if(CUDAToolkit_FOUND AND LLVM_FOUND)
add_subdirectory(nvptx)
else()
message(STATUS "Skipping CUDA loader for gpu target, no CUDA was detected")
diff --git a/libc/utils/gpu/loader/nvptx/CMakeLists.txt b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
index f88914383a98..9e8535792067 100644
--- a/libc/utils/gpu/loader/nvptx/CMakeLists.txt
+++ b/libc/utils/gpu/loader/nvptx/CMakeLists.txt
@@ -1,8 +1,14 @@
add_executable(nvptx_loader Loader.cpp)
add_dependencies(nvptx_loader libc.src.__support.RPC.rpc)
+if(NOT LLVM_ENABLE_RTTI)
+ target_compile_options(nvptx_loader PRIVATE -fno-rtti)
+endif()
+target_include_directories(nvptx_loader PRIVATE ${LLVM_INCLUDE_DIRS})
target_link_libraries(nvptx_loader
PRIVATE
gpu_loader
CUDA::cuda_driver
+ LLVMObject
+ LLVMSupport
)
diff --git a/libc/utils/gpu/loader/nvptx/Loader.cpp b/libc/utils/gpu/loader/nvptx/Loader.cpp
index eb8db7f48572..baf8baaff7cd 100644
--- a/libc/utils/gpu/loader/nvptx/Loader.cpp
+++ b/libc/utils/gpu/loader/nvptx/Loader.cpp
@@ -17,10 +17,18 @@
#include "Server.h"
#include "cuda.h"
+
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <cstring>
+#include <vector>
+
+using namespace llvm;
+using namespace object;
/// The arguments to the '_start' kernel.
struct kernel_args_t {
@@ -51,11 +59,122 @@ static void handle_error(const char *msg) {
exit(EXIT_FAILURE);
}
+// Gets the names of all the globals that contain functions to initialize or
+// deinitialize. We need to do this manually because the NVPTX toolchain does
+// not contain the necessary binary manipulation tools.
+template <typename Alloc>
+Expected<void *> get_ctor_dtor_array(const void *image, const size_t size,
+                                     Alloc allocator, CUmodule binary) {
+  auto mem_buffer = MemoryBuffer::getMemBuffer(
+      StringRef(reinterpret_cast<const char *>(image), size), "image",
+      /*RequiresNullTerminator=*/false);
+  Expected<ELF64LEObjectFile> elf_or_err =
+      ELF64LEObjectFile::create(*mem_buffer);
+  if (!elf_or_err)
+    handle_error(toString(elf_or_err.takeError()).c_str());
+
+  std::vector<std::pair<const char *, uint16_t>> ctors;
+  std::vector<std::pair<const char *, uint16_t>> dtors;
+  // CUDA has no way to iterate over all the symbols so we need to inspect the
+  // ELF directly using the LLVM libraries.
+  for (const auto &symbol : elf_or_err->symbols()) {
+    auto name_or_err = symbol.getName();
+    if (!name_or_err)
+      handle_error(toString(name_or_err.takeError()).c_str());
+
+    // Search for all symbols that contain a constructor or destructor.
+    if (!name_or_err->starts_with("__init_array_object_") &&
+        !name_or_err->starts_with("__fini_array_object_"))
+      continue;
+
+    uint16_t priority;
+    if (name_or_err->rsplit('_').second.getAsInteger(10, priority))
+      handle_error("Invalid priority for constructor or destructor");
+
+    if (name_or_err->starts_with("__init"))
+      ctors.emplace_back(name_or_err->data(), priority);
+    else
+      dtors.emplace_back(name_or_err->data(), priority);
+  }
+  // Lower priority constructors are run before higher ones. The reverse is
+  // true for destructors, so sort them in descending priority order. Note
+  // that 'llvm::reverse' only returns a reversed view without modifying the
+  // container, so an inverted comparator is used instead.
+  llvm::sort(ctors, [](auto x, auto y) { return x.second < y.second; });
+  llvm::sort(dtors, [](auto x, auto y) { return x.second > y.second; });
+
+  // Allocate host pinned memory to make these arrays visible to the GPU.
+  CUdeviceptr *dev_memory = reinterpret_cast<CUdeviceptr *>(allocator(
+      ctors.size() * sizeof(CUdeviceptr) + dtors.size() * sizeof(CUdeviceptr)));
+  uint64_t global_size = 0;
+
+  // Get the address of the global and then store the address of the constructor
+  // function to call in the constructor array.
+  CUdeviceptr *dev_ctors_start = dev_memory;
+  CUdeviceptr *dev_ctors_end = dev_ctors_start + ctors.size();
+  for (uint64_t i = 0; i < ctors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, ctors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_ctors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Get the address of the global and then store the address of the destructor
+  // function to call in the destructor array.
+  CUdeviceptr *dev_dtors_start = dev_ctors_end;
+  CUdeviceptr *dev_dtors_end = dev_dtors_start + dtors.size();
+  for (uint64_t i = 0; i < dtors.size(); ++i) {
+    CUdeviceptr dev_ptr;
+    if (CUresult err =
+            cuModuleGetGlobal(&dev_ptr, &global_size, binary, dtors[i].first))
+      handle_error(err);
+    if (CUresult err =
+            cuMemcpyDtoH(&dev_dtors_start[i], dev_ptr, sizeof(uintptr_t)))
+      handle_error(err);
+  }
+
+  // Obtain the address of the pointers the startup implementation uses to
+  // iterate the constructors and destructors.
+  CUdeviceptr init_start;
+  if (CUresult err = cuModuleGetGlobal(&init_start, &global_size, binary,
+                                       "__init_array_start"))
+    handle_error(err);
+  CUdeviceptr init_end;
+  if (CUresult err = cuModuleGetGlobal(&init_end, &global_size, binary,
+                                       "__init_array_end"))
+    handle_error(err);
+  CUdeviceptr fini_start;
+  if (CUresult err = cuModuleGetGlobal(&fini_start, &global_size, binary,
+                                       "__fini_array_start"))
+    handle_error(err);
+  CUdeviceptr fini_end;
+  if (CUresult err = cuModuleGetGlobal(&fini_end, &global_size, binary,
+                                       "__fini_array_end"))
+    handle_error(err);
+
+  // Copy the pointers to the newly written array to the symbols so the startup
+  // implementation can iterate them.
+  if (CUresult err =
+          cuMemcpyHtoD(init_start, &dev_ctors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(init_end, &dev_ctors_end, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err =
+          cuMemcpyHtoD(fini_start, &dev_dtors_start, sizeof(uintptr_t)))
+    handle_error(err);
+  if (CUresult err = cuMemcpyHtoD(fini_end, &dev_dtors_end, sizeof(uintptr_t)))
+    handle_error(err);
+
+  return dev_memory;
+}
+
int load(int argc, char **argv, char **envp, void *image, size_t size,
const LaunchParameters &params) {
+
if (CUresult err = cuInit(0))
handle_error(err);
-
// Obtain the first device found on the system.
CUdevice device;
if (CUresult err = cuDeviceGet(&device, 0))
@@ -91,6 +210,11 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
handle_error(err);
return dev_ptr;
};
+
+ auto memory_or_err = get_ctor_dtor_array(image, size, allocator, binary);
+ if (!memory_or_err)
+ handle_error(toString(memory_or_err.takeError()).c_str());
+
void *dev_argv = copy_argument_vector(argc, argv, allocator);
if (!dev_argv)
handle_error("Failed to allocate device argv");
@@ -153,6 +277,8 @@ int load(int argc, char **argv, char **envp, void *image, size_t size,
handle_error(err);
// Free the memory allocated for the device.
+ if (CUresult err = cuMemFreeHost(*memory_or_err))
+ handle_error(err);
if (CUresult err = cuMemFree(dev_ret))
handle_error(err);
if (CUresult err = cuMemFreeHost(dev_argv))