// Copyright 2020 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "base/threading/hang_watcher.h" #include #include #include #include "base/bind.h" #include "base/callback_helpers.h" #include "base/debug/alias.h" #include "base/debug/crash_logging.h" #include "base/debug/dump_without_crashing.h" #include "base/feature_list.h" #include "base/no_destructor.h" #include "base/strings/string_number_conversions.h" #include "base/synchronization/lock.h" #include "base/synchronization/waitable_event.h" #include "base/threading/platform_thread.h" #include "base/threading/thread_checker.h" #include "base/threading/thread_restrictions.h" #include "base/time/default_tick_clock.h" #include "base/time/time.h" #include "build/build_config.h" namespace base { // static constexpr base::Feature HangWatcher::kEnableHangWatcher{ "EnableHangWatcher", base::FEATURE_DISABLED_BY_DEFAULT}; constexpr base::TimeDelta HangWatchScope::kDefaultHangWatchTime = base::TimeDelta::FromSeconds(10); namespace { HangWatcher* g_instance = nullptr; } constexpr const char* kThreadName = "HangWatcher"; // The time that the HangWatcher thread will sleep for between calls to // Monitor(). Increasing or decreasing this does not modify the type of hangs // that can be detected. It instead increases the probability that a call to // Monitor() will happen at the right time to catch a hang. This has to be // balanced with power/cpu use concerns as busy looping would catch amost all // hangs but present unacceptable overhead. const base::TimeDelta kMonitoringPeriod = base::TimeDelta::FromSeconds(10); HangWatchScope::HangWatchScope(TimeDelta timeout) { internal::HangWatchState* current_hang_watch_state = internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get(); DCHECK(timeout >= base::TimeDelta()) << "Negative timeouts are invalid."; // TODO(crbug.com/1034046): Remove when all threads using HangWatchScope are // monitored. Thread is not monitored, noop. if (!current_hang_watch_state) { return; } DCHECK(current_hang_watch_state) << "A scope can only be used on a thread that " "registered for hang watching with HangWatcher::RegisterThread."; #if DCHECK_IS_ON() previous_scope_ = current_hang_watch_state->GetCurrentHangWatchScope(); current_hang_watch_state->SetCurrentHangWatchScope(this); #endif // TODO(crbug.com/1034046): Check whether we are over deadline already for the // previous scope here by issuing only one TimeTicks::Now() and resuing the // value. previous_deadline_ = current_hang_watch_state->GetDeadline(); TimeTicks deadline = TimeTicks::Now() + timeout; current_hang_watch_state->SetDeadline(deadline); } HangWatchScope::~HangWatchScope() { DCHECK_CALLED_ON_VALID_THREAD(thread_checker_); internal::HangWatchState* current_hang_watch_state = internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get(); // TODO(crbug.com/1034046): Remove when all threads using HangWatchScope are // monitored. Thread is not monitored, noop. if (!current_hang_watch_state) { return; } // If a hang is currently being captured we should block here so execution // stops and the relevant stack frames are recorded. base::HangWatcher::GetInstance()->BlockIfCaptureInProgress(); #if DCHECK_IS_ON() // Verify that no Scope was destructed out of order. DCHECK_EQ(this, current_hang_watch_state->GetCurrentHangWatchScope()); current_hang_watch_state->SetCurrentHangWatchScope(previous_scope_); #endif // Reset the deadline to the value it had before entering this scope. current_hang_watch_state->SetDeadline(previous_deadline_); // TODO(crbug.com/1034046): Log when a HangWatchScope exits after its deadline // and that went undetected by the HangWatcher. } HangWatcher::HangWatcher() : monitor_period_(kMonitoringPeriod), should_monitor_(WaitableEvent::ResetPolicy::AUTOMATIC), thread_(this, kThreadName), tick_clock_(base::DefaultTickClock::GetInstance()) { // |thread_checker_| should not be bound to the constructing thread. DETACH_FROM_THREAD(hang_watcher_thread_checker_); should_monitor_.declare_only_used_while_idle(); DCHECK(!g_instance); g_instance = this; } HangWatcher::~HangWatcher() { DCHECK_EQ(g_instance, this); DCHECK(watch_states_.empty()); g_instance = nullptr; Stop(); } void HangWatcher::Start() { thread_.Start(); } void HangWatcher::Stop() { keep_monitoring_.store(false, std::memory_order_relaxed); should_monitor_.Signal(); thread_.Join(); } bool HangWatcher::IsWatchListEmpty() { AutoLock auto_lock(watch_state_lock_); return watch_states_.empty(); } void HangWatcher::Wait() { while (true) { // Amount by which the actual time spent sleeping can deviate from // the target time and still be considered timely. constexpr base::TimeDelta kWaitDriftTolerance = base::TimeDelta::FromMilliseconds(100); const base::TimeTicks time_before_wait = tick_clock_->NowTicks(); // Sleep until next scheduled monitoring or until signaled. const bool was_signaled = should_monitor_.TimedWait(monitor_period_); if (after_wait_callback_) after_wait_callback_.Run(time_before_wait); const base::TimeTicks time_after_wait = tick_clock_->NowTicks(); const base::TimeDelta wait_time = time_after_wait - time_before_wait; const bool wait_was_normal = wait_time <= (monitor_period_ + kWaitDriftTolerance); if (!wait_was_normal) { // If the time spent waiting was too high it might indicate the machine is // very slow or that that it went to sleep. In any case we can't trust the // hang watch scopes that are currently live. Update the ignore threshold // to make sure they don't trigger a hang on subsequent monitors then keep // waiting. base::AutoLock auto_lock(watch_state_lock_); // Find the latest deadline among the live watch states. They might change // atomically while iterating but that's fine because if they do that // means the new HangWatchScope was constructed very soon after the // abnormal sleep happened and might be affected by the root cause still. // Ignoring it is cautious and harmless. base::TimeTicks latest_deadline; for (const auto& state : watch_states_) { base::TimeTicks deadline = state->GetDeadline(); if (deadline > latest_deadline) { latest_deadline = deadline; } } deadline_ignore_threshold_ = latest_deadline; } // Stop waiting. if (wait_was_normal || was_signaled) return; } } void HangWatcher::Run() { // Monitor() should only run on |thread_|. Bind |thread_checker_| here to make // sure of that. DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_); while (keep_monitoring_.load(std::memory_order_relaxed)) { Wait(); if (!IsWatchListEmpty() && keep_monitoring_.load(std::memory_order_relaxed)) { Monitor(); if (after_monitor_closure_for_testing_) { after_monitor_closure_for_testing_.Run(); } } } } // static HangWatcher* HangWatcher::GetInstance() { return g_instance; } // static void HangWatcher::RecordHang() { base::debug::DumpWithoutCrashing(); // Inhibit code folding. const int line_number = __LINE__; base::debug::Alias(&line_number); } ScopedClosureRunner HangWatcher::RegisterThread() { AutoLock auto_lock(watch_state_lock_); watch_states_.push_back( internal::HangWatchState::CreateHangWatchStateForCurrentThread()); return ScopedClosureRunner(BindOnce(&HangWatcher::UnregisterThread, Unretained(HangWatcher::GetInstance()))); } base::TimeTicks HangWatcher::WatchStateSnapShot::GetHighestDeadline() const { DCHECK(!hung_watch_state_copies_.empty()); // Since entries are sorted in increasing order the last entry is the largest // one. return hung_watch_state_copies_.back().deadline; } HangWatcher::WatchStateSnapShot::WatchStateSnapShot( const HangWatchStates& watch_states, base::TimeTicks snapshot_time, base::TimeTicks deadline_ignore_threshold) : snapshot_time_(snapshot_time) { // Initial copy of the values. for (const auto& watch_state : watch_states) { base::TimeTicks deadline = watch_state.get()->GetDeadline(); if (deadline <= deadline_ignore_threshold) { hung_watch_state_copies_.clear(); return; } // Only copy hung threads. if (deadline <= snapshot_time) { hung_watch_state_copies_.push_back( WatchStateCopy{deadline, watch_state.get()->GetThreadID()}); } } // Sort |hung_watch_state_copies_| by order of decreasing hang severity so the // most severe hang is first in the list. std::sort(hung_watch_state_copies_.begin(), hung_watch_state_copies_.end(), [](const WatchStateCopy& lhs, const WatchStateCopy& rhs) { return lhs.deadline < rhs.deadline; }); } HangWatcher::WatchStateSnapShot::WatchStateSnapShot( const WatchStateSnapShot& other) = default; HangWatcher::WatchStateSnapShot::~WatchStateSnapShot() = default; std::string HangWatcher::WatchStateSnapShot::PrepareHungThreadListCrashKey() const { // Build a crash key string that contains the ids of the hung threads. constexpr char kSeparator{'|'}; std::string list_of_hung_thread_ids; // Add as many thread ids to the crash key as possible. for (const WatchStateCopy& copy : hung_watch_state_copies_) { std::string fragment = base::NumberToString(copy.thread_id) + kSeparator; if (list_of_hung_thread_ids.size() + fragment.size() < static_cast(debug::CrashKeySize::Size256)) { list_of_hung_thread_ids += fragment; } else { // Respect the by priority ordering of thread ids in the crash key by // stopping the construction as soon as one does not fit. This avoids // including lesser priority ids while omitting more important ones. break; } } return list_of_hung_thread_ids; } HangWatcher::WatchStateSnapShot HangWatcher::GrabWatchStateSnapshotForTesting() const { WatchStateSnapShot snapshot(watch_states_, base::TimeTicks::Now(), deadline_ignore_threshold_); return snapshot; } void HangWatcher::Monitor() { DCHECK_CALLED_ON_VALID_THREAD(hang_watcher_thread_checker_); AutoLock auto_lock(watch_state_lock_); // If all threads unregistered since this function was invoked there's // nothing to do anymore. if (watch_states_.empty()) return; const base::TimeTicks now = base::TimeTicks::Now(); // See if any thread hung. We're holding |watch_state_lock_| so threads // can't register or unregister but their deadline still can change // atomically. This is fine. Detecting a hang is generally best effort and // if a thread resumes from hang in the time it takes to move on to // capturing then its ID will be absent from the crash keys. bool any_thread_hung = std::any_of( watch_states_.cbegin(), watch_states_.cend(), [this, now](const std::unique_ptr& state) { base::TimeTicks deadline = state->GetDeadline(); return deadline > deadline_ignore_threshold_ && deadline < now; }); // If at least a thread is hung we need to capture. if (any_thread_hung) CaptureHang(now); } void HangWatcher::CaptureHang(base::TimeTicks capture_time) { capture_in_progress.store(true, std::memory_order_relaxed); base::AutoLock scope_lock(capture_lock_); WatchStateSnapShot watch_state_snapshot(watch_states_, capture_time, deadline_ignore_threshold_); // The hung thread(s) could detected at the start of Monitor() could have // moved on from their scopes. If that happened and there are no more hung // threads then abort capture. std::string list_of_hung_thread_ids = watch_state_snapshot.PrepareHungThreadListCrashKey(); if (list_of_hung_thread_ids.empty()) return; #if not defined(OS_NACL) static debug::CrashKeyString* crash_key = AllocateCrashKeyString( "list-of-hung-threads", debug::CrashKeySize::Size256); debug::ScopedCrashKeyString list_of_hung_threads_crash_key_string( crash_key, list_of_hung_thread_ids); #endif // To avoid capturing more than one hang that blames a subset of the same // threads it's necessary to keep track of what is the furthest deadline // that contributed to declaring a hang. Only once // all threads have deadlines past this point can we be sure that a newly // discovered hang is not directly related. // Example: // ********************************************************************** // Timeline A : L------1-------2----------3-------4----------N----------- // Timeline B : -------2----------3-------4----------L----5------N------- // Timeline C : L----------------------------5------6----7---8------9---N // ********************************************************************** // In the example when a Monitor() happens during timeline A // |deadline_ignore_threshold_| (L) is at time zero and deadlines (1-4) // are before Now() (N) . A hang is captured and L is updated. During // the next Monitor() (timeline B) a new deadline is over but we can't // capture a hang because deadlines 2-4 are still live and already counted // toward a hang. During a third monitor (timeline C) all live deadlines // are now after L and a second hang can be recorded. base::TimeTicks latest_expired_deadline = watch_state_snapshot.GetHighestDeadline(); if (on_hang_closure_for_testing_) on_hang_closure_for_testing_.Run(); else RecordHang(); // Update after running the actual capture. deadline_ignore_threshold_ = latest_expired_deadline; capture_in_progress.store(false, std::memory_order_relaxed); } void HangWatcher::SetAfterMonitorClosureForTesting( base::RepeatingClosure closure) { DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_); after_monitor_closure_for_testing_ = std::move(closure); } void HangWatcher::SetOnHangClosureForTesting(base::RepeatingClosure closure) { DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_); on_hang_closure_for_testing_ = std::move(closure); } void HangWatcher::SetMonitoringPeriodForTesting(base::TimeDelta period) { DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_); monitor_period_ = period; } void HangWatcher::SetAfterWaitCallbackForTesting( RepeatingCallback callback) { DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_); after_wait_callback_ = callback; } void HangWatcher::SignalMonitorEventForTesting() { DCHECK_CALLED_ON_VALID_THREAD(constructing_thread_checker_); should_monitor_.Signal(); } void HangWatcher::StopMonitoringForTesting() { keep_monitoring_.store(false, std::memory_order_relaxed); } void HangWatcher::SetTickClockForTesting(const base::TickClock* tick_clock) { tick_clock_ = tick_clock; } void HangWatcher::BlockIfCaptureInProgress() { // Makes a best-effort attempt to block execution if a hang is currently being // captured.Only block on |capture_lock| if |capture_in_progress| hints that // it's already held to avoid serializing all threads on this function when no // hang capture is in-progress. if (capture_in_progress.load(std::memory_order_relaxed)) { base::AutoLock hang_lock(capture_lock_); } } void HangWatcher::UnregisterThread() { AutoLock auto_lock(watch_state_lock_); internal::HangWatchState* current_hang_watch_state = internal::HangWatchState::GetHangWatchStateForCurrentThread()->Get(); auto it = std::find_if(watch_states_.cbegin(), watch_states_.cend(), [current_hang_watch_state]( const std::unique_ptr& state) { return state.get() == current_hang_watch_state; }); // Thread should be registered to get unregistered. DCHECK(it != watch_states_.end()); watch_states_.erase(it); } namespace internal { // |deadline_| starts at Max() to avoid validation problems // when setting the first legitimate value. HangWatchState::HangWatchState() : thread_id_(PlatformThread::CurrentId()) { // There should not exist a state object for this thread already. DCHECK(!GetHangWatchStateForCurrentThread()->Get()); // Bind the new instance to this thread. GetHangWatchStateForCurrentThread()->Set(this); } HangWatchState::~HangWatchState() { DCHECK_CALLED_ON_VALID_THREAD(thread_checker_); DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), this); GetHangWatchStateForCurrentThread()->Set(nullptr); #if DCHECK_IS_ON() // Destroying the HangWatchState should not be done if there are live // HangWatchScopes. DCHECK(!current_hang_watch_scope_); #endif } // static std::unique_ptr HangWatchState::CreateHangWatchStateForCurrentThread() { // Allocate a watch state object for this thread. std::unique_ptr hang_state = std::make_unique(); // Setting the thread local worked. DCHECK_EQ(GetHangWatchStateForCurrentThread()->Get(), hang_state.get()); // Transfer ownership to caller. return hang_state; } TimeTicks HangWatchState::GetDeadline() const { return deadline_.load(std::memory_order_relaxed); } void HangWatchState::SetDeadline(TimeTicks deadline) { DCHECK_CALLED_ON_VALID_THREAD(thread_checker_); deadline_.store(deadline, std::memory_order_relaxed); } bool HangWatchState::IsOverDeadline() const { return TimeTicks::Now() > deadline_.load(std::memory_order_relaxed); } #if DCHECK_IS_ON() void HangWatchState::SetCurrentHangWatchScope(HangWatchScope* scope) { DCHECK_CALLED_ON_VALID_THREAD(thread_checker_); current_hang_watch_scope_ = scope; } HangWatchScope* HangWatchState::GetCurrentHangWatchScope() { DCHECK_CALLED_ON_VALID_THREAD(thread_checker_); return current_hang_watch_scope_; } #endif // static ThreadLocalPointer* HangWatchState::GetHangWatchStateForCurrentThread() { static NoDestructor> hang_watch_state; return hang_watch_state.get(); } PlatformThreadId HangWatchState::GetThreadID() const { return thread_id_; } } // namespace internal } // namespace base