/** * Copyright (C) 2018-present MongoDB, Inc. * * This program is free software: you can redistribute it and/or modify * it under the terms of the Server Side Public License, version 1, * as published by MongoDB, Inc. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * Server Side Public License for more details. * * You should have received a copy of the Server Side Public License * along with this program. If not, see * . * * As a special exception, the copyright holders give permission to link the * code of portions of this program with the OpenSSL library under certain * conditions as described in each individual source file and distribute * linked combinations including the program with the OpenSSL library. You * must comply with the Server Side Public License in all respects for * all of the code used other than as permitted herein. If you modify file(s) * with this exception, you may extend this exception to your version of the * file(s), but you are not obligated to do so. If you do not wish to do so, * delete this exception statement from your version. If you delete this * exception statement from all source files in the program, then also delete * it in the license file. */ #pragma once #include #include #include #include "mongo/base/status.h" #include "mongo/base/status_with.h" #include "mongo/db/jsobj.h" #include "mongo/platform/atomic_word.h" #include "mongo/platform/mutex.h" #include "mongo/stdx/unordered_map.h" #include "mongo/util/assert_util.h" #include "mongo/util/cancellation.h" #include "mongo/util/duration.h" #include "mongo/util/interruptible.h" #include "mongo/util/string_map.h" namespace mongo { /** * * A FailPoint is a hook mechanism allowing testing behavior to occur at prearranged * execution points in the server code. They can be activated and deactivated, and * configured to hold data. * * A FailPoint is usually defined by the MONGO_FAIL_POINT_DEFINE(name) macro, * which arranges for it to be added to the global failpoint registry. * * A FailPoint object can have unusual lifetime semantics. It can be marked * `immortal`, so that its internal state is never destroyed. This is possible * because FailPoint is designed to have only trivially destructible nonstatic * data members, and we can choose not to manually destroy the internal state * object. This enables server code that is instrumented by an immortal * static-duration FailPoint to remain valid even during process shutdown. * * Sample use: * * // Defined somewhere: * MONGO_FAIL_POINT_DEFINE(failPoint); * * bool somewhereInTheCode() { * ... do some stuff ... * // The failpoint artificially changes the return value of this function when active. * if (MONGO_unlikely(failPoint.shouldFail())) * return false; * return true; * } * * - or to implement more complex scenarios, use execute/executeIf - * * bool somewhereInTheCode() { * failPoint.execute([&](const BSONObj& data) { * // The bad things happen here, and can read the injected 'data'. * }); * return true; * } * * // scoped() is another way to do it, where lambda isn't suitable, e.g. to cause * // a return/continue/break to control the enclosing function. * for (auto& user : users) { * // The failpoint can be activated and given a user name, to skip that user. * if (auto sfp = failPoint.scoped(); MONGO_unlikely(sfp.isActive())) { * if (sfp.getData()["user"] == user.name()) { * continue; * } * } * processOneUser(user); * } * * // Rendered compactly with scopedIf where the data serves as an activation filter. * for (auto& user : users) { * if (MONGO_unlikely(failPoint.scopedIf([&](auto&& o) { * return o["user"] == user.name(); * }).isActive())) { * continue; * } * processOneUser(user); * } * * The `scopedIf` and `executeIf` members have an advantage over `scoped` and `execute`. They * only affect the `FailPoint` activation counters (relevant to the `nTimes` and `skip` modes) * if the predicate is true. * * A FailPoint can be configured remotely by a database command. * See `src/mongo/db/commands/fail_point_cmd.cpp`. * */ class FailPoint { public: using ValType = unsigned; enum Mode { off, alwaysOn, random, nTimes, skip }; struct ModeOptions { Mode mode; ValType val; BSONObj extra; }; // long long values are able to be appended to BSON. If this is using declaration is changed, // please make sure that the new type is also BSON-compatible. using EntryCountT = long long; using PredicateFunction = std::function; private: class Impl { private: enum class AlreadyCounted : bool {}; static constexpr auto _kWaitGranularity = Milliseconds(100); static constexpr auto _kActiveBit = ValType{ValType{1} << 31}; public: class LockHandle { public: LockHandle(Impl* impl, bool hit) : _impl(impl), _hit(hit) {} ~LockHandle() { if (MONGO_unlikely(_impl)) _impl->_unlock(); } LockHandle(const LockHandle&) = delete; LockHandle& operator=(const LockHandle&) = delete; LockHandle(LockHandle&& o) noexcept : _impl{std::exchange(o._impl, nullptr)}, _hit{std::exchange(o._hit, false)} {} LockHandle& operator=(LockHandle&&) = delete; /** * Returns true if this LockHandle associated with a FailPoint, and * the lock outcome was a "hit". `lockHandle.isActive()` generally * means the block of FailPoint special behavior should execute. */ bool isActive() const { return MONGO_unlikely(_hit); } /** * Returns true if the fail point is still enabled. * * This function does not increment the underlying counter. Note that the fail point * may have been changed in various ways while a LockHandle is held: * - The fail point may be in the process of mutation which toggles to disabled until * LockHandles are released. * - The fail point may have the modes "activationProbability", "skip", or * "times". */ bool isStillEnabled() const { return _impl->_shouldFail(AlreadyCounted{true}, PredicateFunction{}); } /** May only be called if isActive() is true. */ const BSONObj& getData() const { invariant(_impl, "getData without holding failpoint lock"); return _impl->_data; } private: Impl* _impl = nullptr; bool _hit = false; //< True if this represents a tryLock "hit". }; Impl(std::string name) : _name(std::move(name)) {} template bool shouldFail(Pred&& pred) { return _shouldFail(AlreadyCounted{false}, pred); } EntryCountT setMode(Mode mode, ValType val = 0, BSONObj extra = {}); EntryCountT waitForTimesEntered(Interruptible* interruptible, EntryCountT targetTimesEntered) const; BSONObj toBSON() const; template LockHandle tryLock(Pred&& pred) { return _tryLock(AlreadyCounted{false}, pred); } /** See `FailPoint::pauseWhileSet`. */ void pauseWhileSet(Interruptible* interruptible) { auto alreadyCounted = AlreadyCounted{false}; while (MONGO_unlikely(_shouldFail(alreadyCounted, nullptr))) { interruptible->sleepFor(_kWaitGranularity); alreadyCounted = AlreadyCounted{true}; } } /** See `FailPoint::pauseWhileSetAndNotCanceled`. */ void pauseWhileSetAndNotCanceled(Interruptible* interruptible, const CancellationToken& token) { auto alreadyCounted = AlreadyCounted{false}; while (MONGO_unlikely(_shouldFail(alreadyCounted, nullptr))) { uassert( ErrorCodes::Interrupted, "Failpoint has been canceled", !token.isCanceled()); interruptible->sleepFor(_kWaitGranularity); alreadyCounted = AlreadyCounted{true}; } } const std::string& getName() const { return _name; } private: void _enable() { _fpInfo.fetchAndBitOr(_kActiveBit); } void _disable() { _fpInfo.fetchAndBitAnd(~_kActiveBit); } /** No default parameters. No-Frills shouldFail implementation. */ template bool _shouldFail(AlreadyCounted alreadyCounted, Pred&& pred) { return _tryLock(alreadyCounted, pred).isActive(); } /** * Release a FailPoint lock previously acquired with `_tryLock`. * Used only by `~LockHandle`. */ void _unlock() { _fpInfo.subtractAndFetch(1); } /** * Attempt to access the fail point. If FailPoint is disabled, it * cannot be accessed and this call will return a disengaged and * inactive LockHandle. * * After successfully locking it, however, the caller will have * received either a hit or a miss, observable by calling * `result.isActive()`. If true, then caller may further access the * associated `const BSONObj&` payload with `result.getData()`. * * If `pred` is callable, `pred(data)` is invoked with the FailPoint * BSON data payload. If it returns false, it specifies a user-defined * Failpoint miss. In response, this function will return an inactive * LockHandle. * * Otherwise the FailPoint determines whether this lock operation * outcome is a hit or a miss based on the FailPoint's configured Mode. * * Unless `alreadyCounted` is true, such a hit will also increment * `_hitCount` as a side effect. This complication enables the * `pauseWhileSet` loop to evaluate the failpoint multiple times while * only counting the first of those hits in terms of the `_hitCount`. */ template LockHandle _tryLock(AlreadyCounted alreadyCounted, Pred&& pred) { if (MONGO_likely((_fpInfo.loadRelaxed() & _kActiveBit) == 0)) return LockHandle{nullptr, false}; // Fast path if ((_fpInfo.addAndFetch(1) & _kActiveBit) == 0) return LockHandle{this, false}; // Took a reference to disabled in data race. // Slow path. Wrap in `std::function` to deal with nullptr_t // or other predicates that are not bool-convertible. auto predWrap = PredicateFunction(std::move(pred)); // The caller-supplied predicate, if provided, can force a miss that // bypasses the `_evaluateByMode()` call. bool bypass = predWrap && !predWrap(_data); bool hit = bypass ? false : _evaluateByMode(); if (hit && alreadyCounted == AlreadyCounted{false}) _hitCount.addAndFetch(1); return LockHandle{this, hit}; } /** * Use the configured mode to determine hit or miss. * Return true to indicate a hit */ bool _evaluateByMode(); // Bit layout: // 31: tells whether this fail point is active. // 0~30: ref counter: # of outstanding LockHandles. AtomicWord _fpInfo{0}; /* Number of times this has been locked with a `hit` result. */ AtomicWord _hitCount{0}; // Invariant: These should be read only if _kActiveBit of _fpInfo is set. Mode _mode{off}; AtomicWord _modeValue{0}; BSONObj _data; const std::string _name; // protects _mode, _modeValue, _data mutable Mutex _modMutex = MONGO_MAKE_LATCH("FailPoint::_modMutex"); }; public: /** * An object representing a FailPoint's interaction with the code it is * instrumenting. Users don't create these. They are only used within the * execute and executeIf functions and returned by the scoped() and * scopedIf() functions. * * If the FailPoint access attempt does not acquire a reference to the * FailPoint, the returned LockHandle will be disengaged. Otherwise, it * holds a reference to its associated FailPoint, ensuring that FailPoint's * state doesn't change while a LockHandle is attached to it. * * Even an engaged LockHandle (holds a reference to a FailPoint) * can still have `isActive()==false`. * * LockHandle `isActive()`, then `getData()` may be called on it to * retrieve injected data from the associated FailPoint. * * Ex: * if (auto scoped = failPoint.scoped(); scoped.isActive()) { * const BSONObj& data = scoped.getData(); * // failPoint injects some behavior, informed by `data`. * } */ using LockHandle = Impl::LockHandle; /** * Explicitly resets the seed used for the PRNG in this thread. If not called on a thread, * an instance of SecureRandom is used to seed the PRNG. */ static void setThreadPRNGSeed(int32_t seed); /** * Parses the {mode, val, extra} from the BSON. * obj = { * mode: modeElem // required * data: extra // optional payload to inject into the FailPoint intercept site. * } * where `modeElem` is one of: * "off" * "alwaysOn" * {"times" : val} // active for the next val calls * {"skip" : val} // skip calls, activate on and after call number (val+1). * {"activationProbability" : val} // val is in interval [0.0, 1.0] */ static StatusWith parseBSON(const BSONObj& obj); /** * FailPoint state can be kept alive during shutdown by setting `immortal` true. * The usual macro definition does this, but FailPoint unit tests do not. */ explicit FailPoint(std::string name, bool immortal = false); FailPoint(const FailPoint&) = delete; FailPoint& operator=(const FailPoint&) = delete; /** * If this FailPoint was constructed as `immortal` (FailPoints defined by * MONGO_FAIL_POINT_DEFINE are immortal), this destructor does nothing. In * that case the FailPoint (and the code it is instrumenting) can operate * normally while the process shuts down. */ ~FailPoint(); const std::string& getName() const { return _impl()->getName(); } /** * Returns true if fail point is active. * * @param pred see `executeIf` for more information. * * Calls to `shouldFail` should be placed inside MONGO_unlikely for performance. * if (MONGO_unlikely(failpoint.shouldFail())) ... */ template bool shouldFail(Pred&& pred) { return _impl()->shouldFail(pred); } bool shouldFail() { return shouldFail(nullptr); } /** * Changes the settings of this fail point. This will turn off the FailPoint and * wait for all references on this FailPoint to go away before modifying it. * * @param mode new mode * @param val unsigned having different interpretations depending on the mode: * * - off, alwaysOn: ignored * - random: static_cast(std::numeric_limits::max() * p), where * where p is the probability that any given evaluation of the failpoint should * activate. * - nTimes: the number of times this fail point will be active when * #shouldFail/#execute/#scoped are called. * - skip: will become active and remain active after * #shouldFail/#execute/#scoped are called this number of times. * * @param extra arbitrary BSON object that can be stored to this fail point * that can be referenced afterwards with #getData. Defaults to an empty * document. * * @returns the number of times the fail point has been entered so far. */ EntryCountT setMode(Mode mode, ValType val = 0, BSONObj extra = {}) { return _impl()->setMode(std::move(mode), std::move(val), std::move(extra)); } EntryCountT setMode(ModeOptions opt) { return setMode(std::move(opt.mode), std::move(opt.val), std::move(opt.extra)); } /** * Waits until the fail point has been entered the desired number of times. * * @param targetTimesEntered the number of times the fail point has been entered. * * @returns the number of times the fail point has been entered so far. */ EntryCountT waitForTimesEntered(EntryCountT targetTimesEntered) const noexcept { return waitForTimesEntered(Interruptible::notInterruptible(), targetTimesEntered); } /** * Like `waitForTimesEntered`, but interruptible via the `interruptible->sleepFor` mechanism. * See `mongo::Interruptible::sleepFor`. */ EntryCountT waitForTimesEntered(Interruptible* interruptible, EntryCountT targetTimesEntered) const { return _impl()->waitForTimesEntered(interruptible, targetTimesEntered); } /** * @returns a BSON object showing the current mode and data stored. */ BSONObj toBSON() const { return _impl()->toBSON(); } /** * Create a LockHandle from this FailPoint. * The returned object will be active if the failpoint is active. * If it's active, the returned object can be used to access FailPoint data. */ LockHandle scoped() { return scopedIf(nullptr); } /** * Create a LockHandle from this FailPoint. * If `pred(payload)` is true, then the returned object is active and the * FailPoint's activation count is altered (relevant to e.g. the `nTimes` mode). If the * predicate is false, an inactive LockHandle is returned and this FailPoint's mode is not * modified at all. * If it's active, the returned object can be used to access FailPoint data. * The `pred` should be callable like a `bool pred(const BSONObj&)`. */ template LockHandle scopedIf(Pred&& pred) { return _impl()->tryLock(pred); } template void execute(F&& f) { return executeIf(f, nullptr); } /** * If `pred(payload)` is true, then `f(payload)` is executed and the FailPoint's * activation count is altered (relevant to e.g. the `nTimes` mode). Otherwise, `f` * is not executed and this FailPoint's mode is not altered (e.g. `nTimes` isn't * consumed). * The `pred` should be callable like a `bool pred(const BSONObj&)`. */ template void executeIf(F&& f, Pred&& pred) { auto sfp = scopedIf(pred); if (MONGO_unlikely(sfp.isActive())) { std::forward(f)(sfp.getData()); } } /** * Take short `_kWaitGranularity` pauses for as long as the FailPoint is * active. Though this makes several accesses to `shouldFail()`, it counts * as only one increment in the FailPoint `nTimes` counter. */ void pauseWhileSet() { pauseWhileSet(Interruptible::notInterruptible()); } /** * Like `pauseWhileSet`, but interruptible via the `interruptible->sleepFor` mechanism. See * `mongo::Interruptible::sleepFor`. */ void pauseWhileSet(Interruptible* interruptible) { _impl()->pauseWhileSet(interruptible); } /** * Like `pauseWhileSet`, but will also unpause as soon as the cancellation token is canceled. * This method will throw if the token is canceled, to match the behavior when the * Interruptible* is interrupted. */ void pauseWhileSetAndNotCanceled(Interruptible* interruptible, const CancellationToken& token) { _impl()->pauseWhileSetAndNotCanceled(interruptible, token); } private: const Impl* _rawImpl() const { return reinterpret_cast(&_implStorage); } Impl* _rawImpl() { return const_cast(std::as_const(*this)._rawImpl()); // Reuse const overload } const Impl* _impl() const { // Relaxed: such violations can only occur during single-threaded static initialization. invariant(_ready.loadRelaxed(), "Use of uninitialized FailPoint"); return _rawImpl(); } Impl* _impl() { return const_cast(std::as_const(*this)._impl()); // Reuse const overload } const bool _immortal; /** * True only when `_impl()` should succeed. * We exploit zero-initialization of statics to detect use-before-init. */ AtomicWord _ready; std::aligned_storage_t _implStorage; }; class FailPointRegistry { public: FailPointRegistry(); /** * Adds a new fail point to this registry. Duplicate names are not allowed. * * @return the status code under these circumstances: * OK - if successful. * 51006 - if the given name already exists in this registry. * CannotMutateObject - if this registry is already frozen. */ Status add(FailPoint* failPoint); /** * @return a registered FailPoint, or nullptr if it was not registered. */ FailPoint* find(StringData name) const; /** * Freezes this registry from being modified. */ void freeze(); /** * Creates a new FailPointServerParameter for each failpoint in the registry. This allows the * failpoint to be set on the command line via --setParameter, but is only allowed when * running with '--setParameter enableTestCommands=1'. */ void registerAllFailPointsAsServerParameters(); /** * Sets all registered FailPoints to Mode::off. Used primarily during unit test cleanup to * reset the state of all FailPoints set by the unit test. Does not prevent FailPoints from * being enabled again after. */ void disableAllFailpoints(); private: bool _frozen; StringMap _fpMap; }; /** * A scope guard that enables a named FailPoint on construction and disables it on destruction. */ class FailPointEnableBlock { public: explicit FailPointEnableBlock(StringData failPointName); FailPointEnableBlock(StringData failPointName, BSONObj data); explicit FailPointEnableBlock(FailPoint* failPoint); FailPointEnableBlock(FailPoint* failPoint, BSONObj data); ~FailPointEnableBlock(); FailPointEnableBlock(const FailPointEnableBlock&) = delete; FailPointEnableBlock& operator=(const FailPointEnableBlock&) = delete; // Const access to the underlying FailPoint const FailPoint* failPoint() const { return _failPoint; } // Const access to the underlying FailPoint const FailPoint* operator->() const { return failPoint(); } // Return the value of timesEntered() when the block was entered auto initialTimesEntered() const { return _initialTimesEntered; } private: FailPoint* const _failPoint; FailPoint::EntryCountT _initialTimesEntered; }; /** * Set a fail point in the global registry to a given value via BSON * @return the number of times the fail point has been entered so far. * @throw DBException corresponding to ErrorCodes::FailPointSetFailed if no failpoint * called failPointName exists. */ FailPoint::EntryCountT setGlobalFailPoint(const std::string& failPointName, const BSONObj& cmdObj); /** * Registration object for FailPoint. Its static-initializer registers FailPoint `fp` * into the `globalFailPointRegistry()` under the specified `name`. */ class FailPointRegisterer { public: explicit FailPointRegisterer(FailPoint* fp); }; FailPointRegistry& globalFailPointRegistry(); /** * Convenience macro for defining a fail point and registering it. * Must be used at namespace scope, not at local (inside a function) or class scope. * Never use in header files, only .cpp files. */ #define MONGO_FAIL_POINT_DEFINE(fp) \ ::mongo::FailPoint fp(#fp, true); /* An immortal FailPoint */ \ ::mongo::FailPointRegisterer fp##failPointRegisterer(&fp); } // namespace mongo