//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LIBCXXABI_SRC_INCLUDE_CXA_GUARD_IMPL_H
#define LIBCXXABI_SRC_INCLUDE_CXA_GUARD_IMPL_H

/* cxa_guard_impl.h - Implements the C++ runtime support for function local
 * static guards.
 * The layout of the guard object is the same across ARM and Itanium.
 *
 * The first "guard byte" (which is checked by the compiler) is set only upon
 * the completion of cxa release.
 *
 * The second "init byte" does the rest of the bookkeeping. It tracks if
 * initialization is complete or pending, and if there are waiting threads.
 *
 * If the guard variable is 64-bits and the platforms supplies a 32-bit thread
 * identifier, it is used to detect recursive initialization. The thread ID of
 * the thread currently performing initialization is stored in the second word.
 *
 *  Guard Object Layout:
 * ---------------------------------------------------------------------------
 * | a+0: guard byte | a+1: init byte | a+2: unused ... | a+4: thread-id ... |
 * ---------------------------------------------------------------------------
 *
 * Note that we don't do what the ABI docs suggest (put a mutex in the guard
 * object which we acquire in cxa_guard_acquire and release in
 * cxa_guard_release). Instead we use the init byte to imitate that behaviour,
 * but without actually holding anything mutex related between aquire and
 * release/abort.
 *
 *  Access Protocol:
 *    For each implementation the guard byte is checked and set before accessing
 *    the init byte.
 *
 *  Overall Design:
 *    The implementation was designed to allow each implementation to be tested
 *    independent of the C++ runtime or platform support.
 *
 */

#include "__cxxabi_config.h"
#include "include/atomic_support.h" // from libc++
#if defined(__has_include)
#  if __has_include(<sys/syscall.h>)
#    include <sys/syscall.h>
#  endif
#  if __has_include(<unistd.h>)
#    include <unistd.h>
#  endif
#endif

#include <__threading_support>
#include <cstdint>
#include <cstring>
#include <limits.h>
#include <stdlib.h>

#ifndef _LIBCXXABI_HAS_NO_THREADS
#  if defined(__ELF__) && defined(_LIBCXXABI_LINK_PTHREAD_LIB)
#    pragma comment(lib, "pthread")
#  endif
#endif

#if defined(__clang__)
#  pragma clang diagnostic push
#  pragma clang diagnostic ignored "-Wtautological-pointer-compare"
#elif defined(__GNUC__)
#  pragma GCC diagnostic push
#  pragma GCC diagnostic ignored "-Waddress"
#endif

// To make testing possible, this header is included from both cxa_guard.cpp
// and a number of tests.
//
// For this reason we place everything in an anonymous namespace -- even though
// we're in a header. We want the actual implementation and the tests to have
// unique definitions of the types in this header (since the tests may depend
// on function local statics).
//
// To enforce this either `BUILDING_CXA_GUARD` or `TESTING_CXA_GUARD` must be
// defined when including this file. Only `src/cxa_guard.cpp` should define
// the former.
#ifdef BUILDING_CXA_GUARD
#  include "abort_message.h"
#  define ABORT_WITH_MESSAGE(...) ::abort_message(__VA_ARGS__)
#elif defined(TESTING_CXA_GUARD)
#  define ABORT_WITH_MESSAGE(...) ::abort()
#else
#  error "Either BUILDING_CXA_GUARD or TESTING_CXA_GUARD must be defined"
#endif

#if __has_feature(thread_sanitizer)
extern "C" void __tsan_acquire(void*);
extern "C" void __tsan_release(void*);
#else
#  define __tsan_acquire(addr) ((void)0)
#  define __tsan_release(addr) ((void)0)
#endif

namespace __cxxabiv1 {
// Use an anonymous namespace to ensure that the tests and actual implementation
// have unique definitions of these symbols.
namespace {

//===----------------------------------------------------------------------===//
//                          Misc Utilities
//===----------------------------------------------------------------------===//

template <class T, T (*Init)()>
struct LazyValue {
  LazyValue() : is_init(false) {}

  T& get() {
    if (!is_init) {
      value = Init();
      is_init = true;
    }
    return value;
  }

private:
  T value;
  bool is_init = false;
};

template <class IntType>
class AtomicInt {
public:
  using MemoryOrder = std::__libcpp_atomic_order;

  explicit AtomicInt(IntType* b) : b_(b) {}
  AtomicInt(AtomicInt const&) = delete;
  AtomicInt& operator=(AtomicInt const&) = delete;

  IntType load(MemoryOrder ord) { return std::__libcpp_atomic_load(b_, ord); }
  void store(IntType val, MemoryOrder ord) { std::__libcpp_atomic_store(b_, val, ord); }
  IntType exchange(IntType new_val, MemoryOrder ord) { return std::__libcpp_atomic_exchange(b_, new_val, ord); }
  bool compare_exchange(IntType* expected, IntType desired, MemoryOrder ord_success, MemoryOrder ord_failure) {
    return std::__libcpp_atomic_compare_exchange(b_, expected, desired, ord_success, ord_failure);
  }

private:
  IntType* b_;
};

//===----------------------------------------------------------------------===//
//                       PlatformGetThreadID
//===----------------------------------------------------------------------===//

#if defined(__APPLE__) && defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
uint32_t PlatformThreadID() {
  static_assert(sizeof(mach_port_t) == sizeof(uint32_t), "");
  return static_cast<uint32_t>(pthread_mach_thread_np(std::__libcpp_thread_get_current_id()));
}
#elif defined(SYS_gettid) && defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
uint32_t PlatformThreadID() {
  static_assert(sizeof(pid_t) == sizeof(uint32_t), "");
  return static_cast<uint32_t>(syscall(SYS_gettid));
}
#else
constexpr uint32_t (*PlatformThreadID)() = nullptr;
#endif

//===----------------------------------------------------------------------===//
//                          GuardByte
//===----------------------------------------------------------------------===//

static constexpr uint8_t UNSET = 0;
static constexpr uint8_t COMPLETE_BIT = (1 << 0);
static constexpr uint8_t PENDING_BIT = (1 << 1);
static constexpr uint8_t WAITING_BIT = (1 << 2);

/// Manages reads and writes to the guard byte.
struct GuardByte {
  GuardByte() = delete;
  GuardByte(GuardByte const&) = delete;
  GuardByte& operator=(GuardByte const&) = delete;

  explicit GuardByte(uint8_t* const guard_byte_address) : guard_byte(guard_byte_address) {}

public:
  /// The guard byte portion of cxa_guard_acquire. Returns true if
  /// initialization has already been completed.
  bool acquire() {
    // if guard_byte is non-zero, we have already completed initialization
    // (i.e. release has been called)
    return guard_byte.load(std::_AO_Acquire) != UNSET;
  }

  /// The guard byte portion of cxa_guard_release.
  void release() { guard_byte.store(COMPLETE_BIT, std::_AO_Release); }

  /// The guard byte portion of cxa_guard_abort.
  void abort() {} // Nothing to do

private:
  AtomicInt<uint8_t> guard_byte;
};

//===----------------------------------------------------------------------===//
//                       InitByte Implementations
//===----------------------------------------------------------------------===//
//
// Each initialization byte implementation supports the following methods:
//
//  InitByte(uint8_t* _init_byte_address, uint32_t* _thread_id_address)
//    Construct the InitByte object, initializing our member variables
//
//  bool acquire()
//    Called before we start the initialization. Check if someone else has already started, and if
//    not to signal our intent to start it ourselves. We determine the current status from the init
//    byte, which is one of 4 possible values:
//      COMPLETE:           Initialization was finished by somebody else. Return true.
//      PENDING:            Somebody has started the initialization already, set the WAITING bit,
//                          then wait for the init byte to get updated with a new value.
//      (PENDING|WAITING):  Somebody has started the initialization already, and we're not the
//                          first one waiting. Wait for the init byte to get updated.
//      UNSET:              Initialization hasn't successfully completed, and nobody is currently
//                          performing the initialization. Set the PENDING bit to indicate our
//                          intention to start the initialization, and return false.
//    The return value indicates whether initialization has already been completed.
//
//  void release()
//    Called after successfully completing the initialization. Update the init byte to reflect
//    that, then if anybody else is waiting, wake them up.
//
//  void abort()
//    Called after an error is thrown during the initialization. Reset the init byte to UNSET to
//    indicate that we're no longer performing the initialization, then if anybody is waiting, wake
//    them up so they can try performing the initialization.
//

//===----------------------------------------------------------------------===//
//                    Single Threaded Implementation
//===----------------------------------------------------------------------===//

/// InitByteNoThreads - Doesn't use any inter-thread synchronization when
/// managing reads and writes to the init byte.
struct InitByteNoThreads {
  InitByteNoThreads() = delete;
  InitByteNoThreads(InitByteNoThreads const&) = delete;
  InitByteNoThreads& operator=(InitByteNoThreads const&) = delete;

  explicit InitByteNoThreads(uint8_t* _init_byte_address, uint32_t*) : init_byte_address(_init_byte_address) {}

  /// The init byte portion of cxa_guard_acquire. Returns true if
  /// initialization has already been completed.
  bool acquire() {
    if (*init_byte_address == COMPLETE_BIT)
      return true;
    if (*init_byte_address & PENDING_BIT)
      ABORT_WITH_MESSAGE("__cxa_guard_acquire detected recursive initialization: do you have a function-local static variable whose initialization depends on that function?");
    *init_byte_address = PENDING_BIT;
    return false;
  }

  /// The init byte portion of cxa_guard_release.
  void release() { *init_byte_address = COMPLETE_BIT; }
  /// The init byte portion of cxa_guard_abort.
  void abort() { *init_byte_address = UNSET; }

private:
  /// The address of the byte used during initialization.
  uint8_t* const init_byte_address;
};

//===----------------------------------------------------------------------===//
//                     Global Mutex Implementation
//===----------------------------------------------------------------------===//

struct LibcppMutex;
struct LibcppCondVar;

#ifndef _LIBCXXABI_HAS_NO_THREADS
struct LibcppMutex {
  LibcppMutex() = default;
  LibcppMutex(LibcppMutex const&) = delete;
  LibcppMutex& operator=(LibcppMutex const&) = delete;

  bool lock() { return std::__libcpp_mutex_lock(&mutex); }
  bool unlock() { return std::__libcpp_mutex_unlock(&mutex); }

private:
  friend struct LibcppCondVar;
  std::__libcpp_mutex_t mutex = _LIBCPP_MUTEX_INITIALIZER;
};

struct LibcppCondVar {
  LibcppCondVar() = default;
  LibcppCondVar(LibcppCondVar const&) = delete;
  LibcppCondVar& operator=(LibcppCondVar const&) = delete;

  bool wait(LibcppMutex& mut) { return std::__libcpp_condvar_wait(&cond, &mut.mutex); }
  bool broadcast() { return std::__libcpp_condvar_broadcast(&cond); }

private:
  std::__libcpp_condvar_t cond = _LIBCPP_CONDVAR_INITIALIZER;
};
#else
struct LibcppMutex {};
struct LibcppCondVar {};
#endif // !defined(_LIBCXXABI_HAS_NO_THREADS)

/// InitByteGlobalMutex - Uses a global mutex and condition variable (common to
/// all static local variables) to manage reads and writes to the init byte.
template <class Mutex, class CondVar, Mutex& global_mutex, CondVar& global_cond,
          uint32_t (*GetThreadID)() = PlatformThreadID>
struct InitByteGlobalMutex {

  explicit InitByteGlobalMutex(uint8_t* _init_byte_address, uint32_t* _thread_id_address)
      : init_byte_address(_init_byte_address), thread_id_address(_thread_id_address),
        has_thread_id_support(_thread_id_address != nullptr && GetThreadID != nullptr) {}

public:
  /// The init byte portion of cxa_guard_acquire. Returns true if
  /// initialization has already been completed.
  bool acquire() {
    LockGuard g("__cxa_guard_acquire");
    // Check for possible recursive initialization.
    if (has_thread_id_support && (*init_byte_address & PENDING_BIT)) {
      if (*thread_id_address == current_thread_id.get())
        ABORT_WITH_MESSAGE("__cxa_guard_acquire detected recursive initialization: do you have a function-local static variable whose initialization depends on that function?");
    }

    // Wait until the pending bit is not set.
    while (*init_byte_address & PENDING_BIT) {
      *init_byte_address |= WAITING_BIT;
      global_cond.wait(global_mutex);
    }

    if (*init_byte_address == COMPLETE_BIT)
      return true;

    if (has_thread_id_support)
      *thread_id_address = current_thread_id.get();

    *init_byte_address = PENDING_BIT;
    return false;
  }

  /// The init byte portion of cxa_guard_release.
  void release() {
    bool has_waiting;
    {
      LockGuard g("__cxa_guard_release");
      has_waiting = *init_byte_address & WAITING_BIT;
      *init_byte_address = COMPLETE_BIT;
    }
    if (has_waiting) {
      if (global_cond.broadcast()) {
        ABORT_WITH_MESSAGE("%s failed to broadcast", "__cxa_guard_release");
      }
    }
  }

  /// The init byte portion of cxa_guard_abort.
  void abort() {
    bool has_waiting;
    {
      LockGuard g("__cxa_guard_abort");
      if (has_thread_id_support)
        *thread_id_address = 0;
      has_waiting = *init_byte_address & WAITING_BIT;
      *init_byte_address = UNSET;
    }
    if (has_waiting) {
      if (global_cond.broadcast()) {
        ABORT_WITH_MESSAGE("%s failed to broadcast", "__cxa_guard_abort");
      }
    }
  }

private:
  /// The address of the byte used during initialization.
  uint8_t* const init_byte_address;
  /// An optional address storing an identifier for the thread performing initialization.
  /// It's used to detect recursive initialization.
  uint32_t* const thread_id_address;

  const bool has_thread_id_support;
  LazyValue<uint32_t, GetThreadID> current_thread_id;

private:
  struct LockGuard {
    LockGuard() = delete;
    LockGuard(LockGuard const&) = delete;
    LockGuard& operator=(LockGuard const&) = delete;

    explicit LockGuard(const char* calling_func) : calling_func_(calling_func) {
      if (global_mutex.lock())
        ABORT_WITH_MESSAGE("%s failed to acquire mutex", calling_func_);
    }

    ~LockGuard() {
      if (global_mutex.unlock())
        ABORT_WITH_MESSAGE("%s failed to release mutex", calling_func_);
    }

  private:
    const char* const calling_func_;
  };
};

//===----------------------------------------------------------------------===//
//                         Futex Implementation
//===----------------------------------------------------------------------===//

#if defined(SYS_futex)
void PlatformFutexWait(int* addr, int expect) {
  constexpr int WAIT = 0;
  syscall(SYS_futex, addr, WAIT, expect, 0);
  __tsan_acquire(addr);
}
void PlatformFutexWake(int* addr) {
  constexpr int WAKE = 1;
  __tsan_release(addr);
  syscall(SYS_futex, addr, WAKE, INT_MAX);
}
#else
constexpr void (*PlatformFutexWait)(int*, int) = nullptr;
constexpr void (*PlatformFutexWake)(int*) = nullptr;
#endif

constexpr bool PlatformSupportsFutex() { return +PlatformFutexWait != nullptr; }

/// InitByteFutex - Uses a futex to manage reads and writes to the init byte.
template <void (*Wait)(int*, int) = PlatformFutexWait, void (*Wake)(int*) = PlatformFutexWake,
          uint32_t (*GetThreadIDArg)() = PlatformThreadID>
struct InitByteFutex {

  explicit InitByteFutex(uint8_t* _init_byte_address, uint32_t* _thread_id_address)
      : init_byte(_init_byte_address),
        has_thread_id_support(_thread_id_address != nullptr && GetThreadIDArg != nullptr),
        thread_id(_thread_id_address),
        base_address(reinterpret_cast<int*>(/*_init_byte_address & ~0x3*/ _init_byte_address - 1)) {}

public:
  /// The init byte portion of cxa_guard_acquire. Returns true if
  /// initialization has already been completed.
  bool acquire() {
    while (true) {
      uint8_t last_val = UNSET;
      if (init_byte.compare_exchange(&last_val, PENDING_BIT, std::_AO_Acq_Rel, std::_AO_Acquire)) {
        if (has_thread_id_support) {
          thread_id.store(current_thread_id.get(), std::_AO_Relaxed);
        }
        return false;
      }

      if (last_val == COMPLETE_BIT)
        return true;

      if (last_val & PENDING_BIT) {

        // Check for recursive initialization
        if (has_thread_id_support && thread_id.load(std::_AO_Relaxed) == current_thread_id.get()) {
          ABORT_WITH_MESSAGE("__cxa_guard_acquire detected recursive initialization: do you have a function-local static variable whose initialization depends on that function?");
        }

        if ((last_val & WAITING_BIT) == 0) {
          // This compare exchange can fail for several reasons
          // (1) another thread finished the whole thing before we got here
          // (2) another thread set the waiting bit we were trying to thread
          // (3) another thread had an exception and failed to finish
          if (!init_byte.compare_exchange(&last_val, PENDING_BIT | WAITING_BIT, std::_AO_Acq_Rel, std::_AO_Release)) {
            // (1) success, via someone else's work!
            if (last_val == COMPLETE_BIT)
              return true;

            // (3) someone else, bailed on doing the work, retry from the start!
            if (last_val == UNSET)
              continue;

            // (2) the waiting bit got set, so we are happy to keep waiting
          }
        }
        wait_on_initialization();
      }
    }
  }

  /// The init byte portion of cxa_guard_release.
  void release() {
    uint8_t old = init_byte.exchange(COMPLETE_BIT, std::_AO_Acq_Rel);
    if (old & WAITING_BIT)
      wake_all();
  }

  /// The init byte portion of cxa_guard_abort.
  void abort() {
    if (has_thread_id_support)
      thread_id.store(0, std::_AO_Relaxed);

    uint8_t old = init_byte.exchange(UNSET, std::_AO_Acq_Rel);
    if (old & WAITING_BIT)
      wake_all();
  }

private:
  /// Use the futex to wait on the current guard variable. Futex expects a
  /// 32-bit 4-byte aligned address as the first argument, so we use the 4-byte
  /// aligned address that encompasses the init byte (i.e. the address of the
  /// raw guard object that was passed to __cxa_guard_acquire/release/abort).
  void wait_on_initialization() { Wait(base_address, expected_value_for_futex(PENDING_BIT | WAITING_BIT)); }
  void wake_all() { Wake(base_address); }

private:
  AtomicInt<uint8_t> init_byte;

  const bool has_thread_id_support;
  // Unsafe to use unless has_thread_id_support
  AtomicInt<uint32_t> thread_id;
  LazyValue<uint32_t, GetThreadIDArg> current_thread_id;

  /// the 4-byte-aligned address that encompasses the init byte (i.e. the
  /// address of the raw guard object).
  int* const base_address;

  /// Create the expected integer value for futex `wait(int* addr, int expected)`.
  /// We pass the base address as the first argument, So this function creates
  /// an zero-initialized integer  with `b` copied at the correct offset.
  static int expected_value_for_futex(uint8_t b) {
    int dest_val = 0;
    std::memcpy(reinterpret_cast<char*>(&dest_val) + 1, &b, 1);
    return dest_val;
  }

  static_assert(Wait != nullptr && Wake != nullptr, "");
};

//===----------------------------------------------------------------------===//
//                          GuardObject
//===----------------------------------------------------------------------===//

enum class AcquireResult {
  INIT_IS_DONE,
  INIT_IS_PENDING,
};
constexpr AcquireResult INIT_IS_DONE = AcquireResult::INIT_IS_DONE;
constexpr AcquireResult INIT_IS_PENDING = AcquireResult::INIT_IS_PENDING;

/// Co-ordinates between GuardByte and InitByte.
template <class InitByteT>
struct GuardObject {
  GuardObject() = delete;
  GuardObject(GuardObject const&) = delete;
  GuardObject& operator=(GuardObject const&) = delete;

private:
  GuardByte guard_byte;
  InitByteT init_byte;

public:
  /// ARM Constructor
  explicit GuardObject(uint32_t* raw_guard_object)
      : guard_byte(reinterpret_cast<uint8_t*>(raw_guard_object)),
        init_byte(reinterpret_cast<uint8_t*>(raw_guard_object) + 1, nullptr) {}

  /// Itanium Constructor
  explicit GuardObject(uint64_t* raw_guard_object)
      : guard_byte(reinterpret_cast<uint8_t*>(raw_guard_object)),
        init_byte(reinterpret_cast<uint8_t*>(raw_guard_object) + 1, reinterpret_cast<uint32_t*>(raw_guard_object) + 1) {
  }

  /// Implements __cxa_guard_acquire.
  AcquireResult cxa_guard_acquire() {
    // Use short-circuit evaluation to avoid calling init_byte.acquire when
    // guard_byte.acquire returns true. (i.e. don't call it when we know from
    // the guard byte that initialization has already been completed)
    if (guard_byte.acquire() || init_byte.acquire())
      return INIT_IS_DONE;
    return INIT_IS_PENDING;
  }

  /// Implements __cxa_guard_release.
  void cxa_guard_release() {
    // Update guard byte first, so if somebody is woken up by init_byte.release
    // and comes all the way back around to __cxa_guard_acquire again, they see
    // it as having completed initialization.
    guard_byte.release();
    init_byte.release();
  }

  /// Implements __cxa_guard_abort.
  void cxa_guard_abort() {
    guard_byte.abort();
    init_byte.abort();
  }
};

//===----------------------------------------------------------------------===//
//                          Convenience Classes
//===----------------------------------------------------------------------===//

/// NoThreadsGuard - Manages initialization without performing any inter-thread
/// synchronization.
using NoThreadsGuard = GuardObject<InitByteNoThreads>;

/// GlobalMutexGuard - Manages initialization using a global mutex and
/// condition variable.
template <class Mutex, class CondVar, Mutex& global_mutex, CondVar& global_cond,
          uint32_t (*GetThreadID)() = PlatformThreadID>
using GlobalMutexGuard = GuardObject<InitByteGlobalMutex<Mutex, CondVar, global_mutex, global_cond, GetThreadID>>;

/// FutexGuard - Manages initialization using atomics and the futex syscall for
/// waiting and waking.
template <void (*Wait)(int*, int) = PlatformFutexWait, void (*Wake)(int*) = PlatformFutexWake,
          uint32_t (*GetThreadIDArg)() = PlatformThreadID>
using FutexGuard = GuardObject<InitByteFutex<Wait, Wake, GetThreadIDArg>>;

//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

template <class T>
struct GlobalStatic {
  static T instance;
};
template <class T>
_LIBCPP_CONSTINIT T GlobalStatic<T>::instance = {};

enum class Implementation { NoThreads, GlobalMutex, Futex };

template <Implementation Impl>
struct SelectImplementation;

template <>
struct SelectImplementation<Implementation::NoThreads> {
  using type = NoThreadsGuard;
};

template <>
struct SelectImplementation<Implementation::GlobalMutex> {
  using type = GlobalMutexGuard<LibcppMutex, LibcppCondVar, GlobalStatic<LibcppMutex>::instance,
                                GlobalStatic<LibcppCondVar>::instance, PlatformThreadID>;
};

template <>
struct SelectImplementation<Implementation::Futex> {
  using type = FutexGuard<PlatformFutexWait, PlatformFutexWake, PlatformThreadID>;
};

// TODO(EricWF): We should prefer the futex implementation when available. But
// it should be done in a separate step from adding the implementation.
constexpr Implementation CurrentImplementation =
#if defined(_LIBCXXABI_HAS_NO_THREADS)
    Implementation::NoThreads;
#elif defined(_LIBCXXABI_USE_FUTEX)
    Implementation::Futex;
#else
    Implementation::GlobalMutex;
#endif

static_assert(CurrentImplementation != Implementation::Futex || PlatformSupportsFutex(),
              "Futex selected but not supported");

using SelectedImplementation = SelectImplementation<CurrentImplementation>::type;

} // end namespace
} // end namespace __cxxabiv1

#if defined(__clang__)
#  pragma clang diagnostic pop
#elif defined(__GNUC__)
#  pragma GCC diagnostic pop
#endif

#endif // LIBCXXABI_SRC_INCLUDE_CXA_GUARD_IMPL_H