From 997b1c3718b378e4096fb708e054644690b38c4d Mon Sep 17 00:00:00 2001 From: Randolph Tan Date: Wed, 19 Sep 2012 16:32:42 -0400 Subject: SERVER-5175 Need "failpoints" system to facilitate testing core server Step 1: Implement and test the FailPoint class --- src/mongo/SConscript | 7 ++ src/mongo/db/fail_point_manager.cpp | 155 ++++++++++++++++++++++++ src/mongo/db/fail_point_manager.h | 51 ++++++++ src/mongo/util/fail_point.cpp | 130 ++++++++++++++++++++ src/mongo/util/fail_point.h | 185 +++++++++++++++++++++++++++++ src/mongo/util/fail_point_test.cpp | 230 ++++++++++++++++++++++++++++++++++++ 6 files changed, 758 insertions(+) create mode 100644 src/mongo/db/fail_point_manager.cpp create mode 100644 src/mongo/db/fail_point_manager.h create mode 100644 src/mongo/util/fail_point.cpp create mode 100644 src/mongo/util/fail_point.h create mode 100644 src/mongo/util/fail_point_test.cpp (limited to 'src') diff --git a/src/mongo/SConscript b/src/mongo/SConscript index c2c66783fb1..62ee03f409f 100644 --- a/src/mongo/SConscript +++ b/src/mongo/SConscript @@ -99,12 +99,15 @@ commonFiles = [ "pch.cpp", "client/distlock.cpp", ] +env.StaticLibrary("fail_point", "util/fail_point.cpp", LIBDEPS=["foundation", "bson"]) + env.StaticLibrary('mongocommon', commonFiles, LIBDEPS=['bson', 'foundation', 'md5', 'stacktrace', 'stringutils', + 'fail_point', '$BUILD_DIR/third_party/pcrecpp', '$BUILD_DIR/third_party/murmurhash3/murmurhash3', '$BUILD_DIR/third_party/shim_boost'],) @@ -353,6 +356,10 @@ env.CppUnitTest( "balancer_policy_test" , [ "s/balancer_policy_tests.cpp" ] , LIBDEPS=["mongoscore", "coreshard","mongocommon","coreserver","coredb","dbcmdline","mongodandmongos"] , NO_CRUTCH=True) +# Should only need stuff from util, unittest and platform +env.CppUnitTest("fail_point_test", [ "util/fail_point_test.cpp" ], + LIBDEPS=["fail_point"]) + serverOnlyFiles += [ "s/d_logic.cpp", "s/d_writeback.cpp", "s/d_migrate.cpp", diff --git a/src/mongo/db/fail_point_manager.cpp 
b/src/mongo/db/fail_point_manager.cpp
new file mode 100644
index 00000000000..0a17fb7d9e2
--- /dev/null
+++ b/src/mongo/db/fail_point_manager.cpp
@@ -0,0 +1,166 @@
+/*
+ *    Copyright (C) 2012 10gen Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "fail_point_manager.h"
+
+#include "mongo/db/commands.h"
+
+namespace mongo {
+    /**
+     * Command for modifying installed fail points.
+     *
+     * Format
+     * {
+     *    faultInject: <string>, // name of the fail point.
+     *    mode: <string|Object>, // the new mode to set. Can have one of the
+     *        following format:
+     *
+     *        1. 'off' - disable fail point.
+     *        2. 'alwaysOn' - fail point is always active.
+     *        3. { period: <n> } - n should be within the range of a 32 bit signed
+     *            integer and this would be the approximate period for every activation.
+     *            For example, for { period: 120 }, the probability of the fail point to
+     *            be activated is 1 in 120. NOT YET SUPPORTED.
+     *        4. { times: <n> } - n should be positive and within the range of a 32 bit
+     *            signed integer and this is the number of passes on the fail point will
+     *            remain activated.
+     *
+     *    data: <Object> // optional arbitrary object to store.
+     * }
+     */
+    class FaultInjectCmd: public Command {
+    public:
+        FaultInjectCmd(): Command("faultInject") {}
+
+        virtual bool slaveOk() const {
+            return true;
+        }
+
+        virtual LockType locktype() const {
+            return NONE;
+        }
+
+        virtual bool adminOnly() const {
+            return true;
+        }
+
+        virtual void help(stringstream& h) const {
+            h << "modifies the settings of a fail point";
+        }
+
+        /**
+         * Looks up the fail point named by the first element of cmdObj, parses the
+         * "mode" (and optional "data") fields and applies them with FailPoint::setMode.
+         *
+         * @return false with errmsg set when the fail point is unknown or the mode
+         *     cannot be parsed; true once the new settings have been applied.
+         */
+        bool run(const string& dbname,
+                 BSONObj& cmdObj,
+                 int,
+                 string& errmsg,
+                 BSONObjBuilder& result,
+                 bool fromRepl) {
+            const string failPointName(cmdObj.firstElement().str());
+            FailPointRegistry* registry = FailPointManager::getRegistry();
+            FailPoint* failPoint = registry->getFailPoint(failPointName);
+
+            if (failPoint == NULL) {
+                errmsg = failPointName + " not found";
+                return false;
+            }
+
+            FailPoint::Mode mode = FailPoint::alwaysOn;
+            FailPoint::ValType val = 0;
+
+            const BSONElement modeElem(cmdObj["mode"]);
+            if (modeElem.eoo()) {
+                // NOTE: when "mode" is absent the alwaysOn default set above is applied.
+                // TODO: return error or display state?
+            }
+            else if (modeElem.type() == String) {
+                const string modeStr(modeElem.valuestr());
+
+                if (modeStr == "off") {
+                    mode = FailPoint::off;
+                }
+                else if (modeStr == "alwaysOn") {
+                    mode = FailPoint::alwaysOn;
+                }
+                else {
+                    errmsg = "unknown mode: " + modeStr;
+                    return false;
+                }
+            }
+            else if (modeElem.type() == Object) {
+                const BSONObj modeObj(modeElem.Obj());
+
+                if (modeObj.hasField("times")) {
+                    mode = FailPoint::nTimes;
+                    const int intVal = modeObj["times"].numberInt();
+
+                    if (intVal <= 0) {
+                        errmsg = "times should be positive";
+                        return false;
+                    }
+
+                    val = intVal;
+                }
+                else if (modeObj.hasField("period")) {
+                    mode = FailPoint::random;
+
+                    // TODO: implement
+                    errmsg = "random is not yet supported";
+                    return false;
+                }
+                else {
+                    errmsg = "invalid mode object";
+                    return false;
+                }
+            }
+            else {
+                errmsg = "invalid mode format";
+                return false;
+            }
+
+            if (cmdObj.hasField("data")) {
+                const BSONObj dataObj(cmdObj["data"].Obj());
+                failPoint->setMode(mode, val, dataObj);
+            }
+            else {
+                failPoint->setMode(mode, val);
+            }
+
+            return true;
+        }
+    };
+
+    scoped_ptr<FaultInjectCmd> faultInjectCmd;
+
+    // NOTE(review): MONGO_FP_DECLARE installers in other translation units reach this
+    // object through getRegistry() during static initialization; confirm that it is
+    // guaranteed to be constructed before they run.
+    FailPointRegistry FailPointManager::_fpRegistry;
+
+    FailPointRegistry* FailPointManager::getRegistry() {
+        return &_fpRegistry;
+    }
+
+    void FailPointManager::init() {
+        faultInjectCmd.reset(new FaultInjectCmd);
+        _fpRegistry.freeze();
+    }
+}
diff --git a/src/mongo/db/fail_point_manager.h b/src/mongo/db/fail_point_manager.h
new file mode 100644
index 00000000000..47358eec760
--- /dev/null
+++ b/src/mongo/db/fail_point_manager.h
@@ -0,0 +1,51 @@
+/*
+ *    Copyright (C) 2012 10gen Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mongo/util/fail_point.h"
+
+namespace mongo {
+    class FailPointManager {
+    public:
+        /**
+         * @return the global fail point registry.
+         */
+        static FailPointRegistry* getRegistry();
+
+        /**
+         * Installs the faultInject command.
+         *
+         * Note: not thread-safe
+         */
+        static void init();
+
+    private:
+        static FailPointRegistry _fpRegistry;
+    };
+
+    class FailPointInstaller {  // registers failPoint under name on construction
+    public:
+        FailPointInstaller(FailPointRegistry* fpRegistry,
+                           const string& name,
+                           FailPoint* failPoint) {
+            fpRegistry->addFailPoint(name, failPoint);
+        }
+    };
+
+#define MONGO_FP_DECLARE(fp) FailPoint fp; \
+    FailPointInstaller install_##fp(FailPointManager::getRegistry(), #fp, &fp);
+}
diff --git a/src/mongo/util/fail_point.cpp b/src/mongo/util/fail_point.cpp
new file mode 100644
index 00000000000..390fdb58bff
--- /dev/null
+++ b/src/mongo/util/fail_point.cpp
@@ -0,0 +1,147 @@
+/*
+ *    Copyright (C) 2012 10gen Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "mongo/util/fail_point.h"
+
+#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/time_support.h"
+
+using mongoutils::str::stream;
+
+namespace mongo {
+    FailPoint::FailPoint():
+        _fpInfo(0),
+        _mode(off),
+        _timesOrPeriod(0),
+        _modMutex("failPointMutex") {
+    }
+
+    void FailPoint::shouldFailCloseBlock() {
+        _fpInfo.subtractAndFetch(1);
+    }
+
+    void FailPoint::setMode(Mode mode, ValType val, const BSONObj& extra) {
+        /**
+         * Outline:
+         *
+         * 0. Validates the mode so that a rejected call leaves the fail point untouched
+         * 1. Deactivates fail point to enter write-only mode
+         * 2. Waits for all current readers of the fail point to finish
+         * 3. Sets the new mode.
+         * 4. Re-activates the fail point unless the new mode is off
+         */
+
+        // Step 0
+        uassert(16442, stream() << "mode not supported " << static_cast<int>(mode),
+                mode >= off && mode < numModes);
+
+        scoped_lock scoped(_modMutex);
+
+        // Step 1
+        disableFailPoint();
+
+        // Step 2
+        while (_fpInfo.load() != 0) {
+            sleepmillis(50);
+        }
+
+        // Step 3
+        _mode = mode;
+        _timesOrPeriod.store(val);
+
+        _data = extra.copy();
+
+        // Step 4
+        if (_mode != off) {
+            // Set ACTIVE_BIT with a compare-and-swap loop instead of a plain store: a
+            // reader that saw the fail point active before step 1 may bump the
+            // reference counter after step 2, and store(ACTIVE_BIT) would erase that
+            // increment, making the reader's later decrement underflow the counter.
+            ValType currentVal = _fpInfo.load();
+            ValType expectedCurrentVal;
+
+            do {
+                expectedCurrentVal = currentVal;
+                currentVal = _fpInfo.compareAndSwap(expectedCurrentVal,
+                                                    expectedCurrentVal | ACTIVE_BIT);
+            } while (expectedCurrentVal != currentVal);
+        }
+    }
+
+    const BSONObj& FailPoint::getData() const {
+        return _data;
+    }
+
+    void FailPoint::disableFailPoint() {
+        // TODO: Better to replace with a bitwise AND, once available for AU32
+        ValType currentVal = _fpInfo.load();
+        ValType expectedCurrentVal;
+        ValType newVal;
+
+        do {
+            expectedCurrentVal = currentVal;
+            newVal = expectedCurrentVal & REF_COUNTER_MASK;
+            currentVal = _fpInfo.compareAndSwap(expectedCurrentVal, newVal);
+        } while (expectedCurrentVal != currentVal);
+    }
+
+    FailPoint::RetCode FailPoint::slowShouldFailOpenBlock() {
+        ValType localFpInfo = _fpInfo.addAndFetch(1);
+
+        if ((localFpInfo & ACTIVE_BIT) == 0) {
+            return slowOff;
+        }
+
+        switch (_mode) {
+        case alwaysOn:
+            return slowOn;
+
+        case random:
+            // TODO: randomly determine if should be active or not
+            error() << "FailPoint Mode random is not yet supported." << endl;
+            fassertFailed(16443);
+
+        case nTimes:
+            {
+                AtomicInt32::WordType newVal = _timesOrPeriod.subtractAndFetch(1);
+
+                if (newVal <= 0) {
+                    disableFailPoint();
+                }
+
+                // Callers that raced past the last allowed activation (or a fail point
+                // configured with 0 times) see a negative count and must not fire.
+                return newVal >= 0 ? slowOn : slowOff;
+            }
+
+        default:
+            error() << "FailPoint Mode not supported: " << static_cast<int>(_mode) << endl;
+            fassertFailed(16444);
+        }
+    }
+
+    ScopedFailPoint::ScopedFailPoint(FailPoint* failPoint):
+        _failPoint(failPoint),
+        _once(false),
+        _shouldClose(false) {
+    }
+
+    ScopedFailPoint::~ScopedFailPoint() {
+        if (_shouldClose) {
+            _failPoint->shouldFailCloseBlock();
+        }
+    }
+}
diff --git a/src/mongo/util/fail_point.h b/src/mongo/util/fail_point.h
new file mode 100644
index 00000000000..7862db9e39e
--- /dev/null
+++ b/src/mongo/util/fail_point.h
@@ -0,0 +1,185 @@
+/*
+ *    Copyright (C) 2012 10gen Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include "mongo/db/jsobj.h"
+#include "mongo/platform/atomic_word.h"
+#include "mongo/util/concurrency/mutex.h"
+
+namespace mongo {
+    /**
+     * A simple thread-safe fail point implementation that can be activated and
+     * deactivated, as well as embed temporary data into it.
+     *
+     * The fail point has a static instance, which is represented by a FailPoint
+     * object, and dynamic instance, which are all the threads in between
+     * shouldFailOpenBlock and shouldFailCloseBlock.
+     *
+     * Sample use:
+     * // Declared somewhere:
+     * FailPoint makeBadThingsHappen;
+     *
+     * // Somewhere in the code
+     * return false || MONGO_FAIL_POINT(makeBadThingsHappen);
+     *
+     * or
+     *
+     * // Somewhere in the code
+     * MONGO_FAIL_POINT_BLOCK(makeBadThingsHappen) {
+     *     const BSONObj& data = makeBadThingsHappen.getData();
+     *     // Do something
+     * }
+     *
+     * Invariants:
+     *
+     * 1. Always refer to _fpInfo first to check if failPoint is active or not before
+     *    entering fail point or modifying fail point.
+     * 2. Client visible fail point states are read-only when active.
+     */
+    class FailPoint {
+    public:
+        typedef AtomicUInt32::WordType ValType;
+        enum Mode { off, alwaysOn, random, nTimes, numModes };
+        enum RetCode { fastOff = 0, slowOff, slowOn };
+
+        FailPoint();
+
+        /**
+         * Note: This is not side-effect free - it can change the state to OFF after calling.
+         *
+         * @return true if fail point is active.
+         */
+        inline bool shouldFail() {
+            RetCode ret = shouldFailOpenBlock();
+
+            if (MONGO_likely(ret == fastOff)) {
+                return false;
+            }
+
+            shouldFailCloseBlock();
+            return ret == slowOn;
+        }
+
+        /**
+         * Checks whether fail point is active and increments the reference counter without
+         * decrementing it. Must call shouldFailCloseBlock afterwards when the return value
+         * is not fastOff. Otherwise, this will remain read-only forever.
+         *
+         * @return slowOn if fail point is active, fastOff or slowOff otherwise.
+         */
+        inline RetCode shouldFailOpenBlock() {
+            // TODO: optimization - use unordered load once available
+            if (MONGO_likely((_fpInfo.load() & ACTIVE_BIT) == 0)) {
+                return fastOff;
+            }
+
+            return slowShouldFailOpenBlock();
+        }
+
+        /**
+         * Decrements the reference counter.
+         * @see #shouldFailOpenBlock
+         */
+        void shouldFailCloseBlock();
+
+        /**
+         * Changes the settings of this fail point. This will turn off the fail point
+         * and waits for all dynamic instances referencing this fail point to go away before
+         * actually modifying the settings.
+         *
+         * @param mode the new mode for this fail point.
+         * @param val the value that can have different usage depending on the mode:
+         *
+         *     - off, alwaysOn: ignored
+         *     - random: ignored for now, this mode is NOT YET SUPPORTED
+         *     - nTimes: the number of times this fail point will be active when
+         *         #shouldFail or #shouldFailOpenBlock is called.
+         *
+         * @param extra arbitrary BSON object that can be stored to this fail point
+         *     that can be referenced afterwards with #getData. Defaults to an empty
+         *     document.
+         */
+        void setMode(Mode mode, ValType val = 0, const BSONObj& extra = BSONObj());
+
+        /**
+         * @return the stored BSONObj in this fail point. Note that this cannot be safely
+         *     read if this fail point is off.
+         */
+        const BSONObj& getData() const;
+
+    private:
+        static const ValType ACTIVE_BIT = 1U << 31;
+        static const ValType REF_COUNTER_MASK = ~ACTIVE_BIT;
+
+        // Bit layout:
+        // 31: tells whether this fail point is active.
+        // 0~30: unsigned ref counter for active dynamic instances.
+        AtomicUInt32 _fpInfo;
+
+        // Invariant: These should be read only if ACTIVE_BIT of _fpInfo is set.
+        Mode _mode;
+        AtomicInt32 _timesOrPeriod;
+        BSONObj _data;
+
+        // protects _mode, _timesOrPeriod, _data
+        mutex _modMutex;
+
+        /**
+         * Disables this fail point by clearing ACTIVE_BIT; the ref counter is preserved.
+         */
+        void disableFailPoint();
+
+        /**
+         * slow path for #shouldFailOpenBlock
+         */
+        RetCode slowShouldFailOpenBlock();
+    };
+
+    /**
+     * Helper class for making sure that FailPoint#shouldFailCloseBlock is called when
+     * FailPoint#shouldFailOpenBlock was called.
+     */
+    class ScopedFailPoint {
+    public:
+        ScopedFailPoint(FailPoint* failPoint);
+        ~ScopedFailPoint();
+
+        /**
+         * @return true if fail point is on. This will be true at most once.
+         */
+        inline bool isActive() {
+            if (_once) {
+                return false;
+            }
+
+            _once = true;
+
+            FailPoint::RetCode ret = _failPoint->shouldFailOpenBlock();
+            _shouldClose = ret != FailPoint::fastOff;
+            return ret == FailPoint::slowOn;
+        }
+
+    private:
+        FailPoint* _failPoint;
+        bool _once;
+        bool _shouldClose;
+    };
+
+    #define MONGO_FAIL_POINT(symbol) MONGO_unlikely(symbol.shouldFail())
+    #define MONGO_FAIL_POINT_BLOCK(symbol) for (mongo::ScopedFailPoint scopedFP(&symbol); \
+        MONGO_unlikely(scopedFP.isActive()); )
+}
diff --git a/src/mongo/util/fail_point_test.cpp b/src/mongo/util/fail_point_test.cpp
new file mode 100644
index 00000000000..c9fc8b3e33c
--- /dev/null
+++ b/src/mongo/util/fail_point_test.cpp
@@ -0,0 +1,230 @@
+/**
+ *    Copyright (C) 2012 10gen Inc.
+ *
+ *    This program is free software: you can redistribute it and/or modify
+ *    it under the terms of the GNU Affero General Public License, version 3,
+ *    as published by the Free Software Foundation.
+ *
+ *    This program is distributed in the hope that it will be useful,
+ *    but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *    GNU Affero General Public License for more details.
+ *
+ *    You should have received a copy of the GNU Affero General Public License
+ *    along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <boost/thread/thread.hpp>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include "mongo/util/fail_point.h"
+#include "mongo/util/time_support.h"
+#include "mongo/unittest/unittest.h"
+
+using mongo::FailPoint;
+
+namespace mongo_test {
+    TEST(FailPoint, InitialState) {
+        FailPoint failPoint;
+        ASSERT_FALSE(failPoint.shouldFail());
+        ASSERT(failPoint.getData().isEmpty());
+        ASSERT_FALSE(failPoint.shouldFail());
+    }
+
+    TEST(FailPoint, AlwaysOn) {
+        FailPoint failPoint;
+        failPoint.setMode(FailPoint::alwaysOn);
+        ASSERT(failPoint.shouldFail());
+        ASSERT(failPoint.getData().isEmpty());
+
+        for (size_t x = 0; x < 50; x++) {
+            ASSERT(failPoint.shouldFail());
+        }
+    }
+
+    TEST(FailPoint, NTimes) {
+        FailPoint failPoint;
+        failPoint.setMode(FailPoint::nTimes, 4);
+        ASSERT(failPoint.shouldFail());
+        ASSERT(failPoint.shouldFail());
+        ASSERT(failPoint.shouldFail());
+        ASSERT(failPoint.shouldFail());
+
+        for (size_t x = 0; x < 50; x++) {
+            ASSERT_FALSE(failPoint.shouldFail());
+        }
+    }
+
+    TEST(FailPoint, BlockOff) {
+        FailPoint failPoint;
+        bool called = false;
+
+        MONGO_FAIL_POINT_BLOCK(failPoint) {
+            called = true;
+        }
+
+        ASSERT_FALSE(called);
+    }
+
+    TEST(FailPoint, BlockAlwaysOn) {
+        FailPoint failPoint;
+        failPoint.setMode(FailPoint::alwaysOn);
+        bool called = false;
+
+        MONGO_FAIL_POINT_BLOCK(failPoint) {
+            called = true;
+        }
+
+        ASSERT(called);
+    }
+
+    TEST(FailPoint, BlockNTimes) {
+        FailPoint failPoint;
+        failPoint.setMode(FailPoint::nTimes, 1);
+        size_t counter = 0;
+
+        for (size_t x = 0; x < 10; x++) {
+            MONGO_FAIL_POINT_BLOCK(failPoint) {
+                counter++;
+            }
+        }
+
+        ASSERT_EQUALS(1U, counter);
+    }
+
+    TEST(FailPoint, BlockWithException) {
+        FailPoint failPoint;
+        failPoint.setMode(FailPoint::alwaysOn);
+        bool threw = false;
+
+        try {
+            MONGO_FAIL_POINT_BLOCK(failPoint) {
+                throw std::logic_error("BlockWithException threw");
+            }
+        }
+        catch (const std::logic_error &) {
+            threw = true;
+        }
+
+        ASSERT(threw);
+        // This will get into an infinite loop if reference counter was not
+        // properly decremented
+        failPoint.setMode(FailPoint::off);
+    }
+
+    TEST(FailPoint, SetGetParam) {
+        FailPoint failPoint;
+        failPoint.setMode(FailPoint::alwaysOn, 0, BSON("x" << 20));
+
+        MONGO_FAIL_POINT_BLOCK(failPoint) {
+            ASSERT_EQUALS(20, failPoint.getData()["x"].numberInt());
+        }
+    }
+
+    TEST(FailPoint, SetInvalidMode) {
+        FailPoint failPoint;
+
+        ASSERT_THROWS(failPoint.setMode(static_cast<FailPoint::Mode>(9999)),
+                      mongo::UserException);
+        ASSERT_FALSE(failPoint.shouldFail());
+
+        ASSERT_THROWS(failPoint.setMode(static_cast<FailPoint::Mode>(-1)),
+                      mongo::UserException);
+        ASSERT_FALSE(failPoint.shouldFail());
+    }
+
+    class FailPointStress: public mongo::unittest::Test {
+    public:
+        FailPointStress(): _tasks(NULL) {
+        }
+
+        void setUp() {
+            _fp.setMode(FailPoint::alwaysOn, 0, BSON("a" << 44));
+        }
+
+        void tearDown() {
+            // Note: This can loop indefinitely if reference counter was off
+            _fp.setMode(FailPoint::off, 0, BSON("a" << 66));
+        }
+
+        void startTest() {
+            verify(_tasks == NULL);
+
+            _tasks = new boost::thread_group();
+            _tasks->add_thread(new boost::thread(blockTask, &_fp));
+            _tasks->add_thread(new boost::thread(blockWithExceptionTask, &_fp));
+            _tasks->add_thread(new boost::thread(simpleTask, &_fp));
+            _tasks->add_thread(new boost::thread(flipTask, &_fp));
+        }
+
+        void stopTest() {
+            _tasks->interrupt_all();
+            _tasks->join_all();
+            delete _tasks;
+            _tasks = NULL;
+        }
+
+    private:
+        static void blockTask(FailPoint* failPoint) {
+            while (true) {
+                MONGO_FAIL_POINT_BLOCK((*failPoint)) {
+                    const mongo::BSONObj& data = failPoint->getData();
+                    ASSERT_EQUALS(44, data["a"].numberInt());
+                }
+
+                boost::this_thread::interruption_point();
+            }
+        }
+
+        static void blockWithExceptionTask(FailPoint* failPoint) {
+            while (true) {
+                try {
+                    MONGO_FAIL_POINT_BLOCK((*failPoint)) {
+                        const mongo::BSONObj& data = failPoint->getData();
+                        ASSERT_EQUALS(44, data["a"].numberInt());
+                        throw std::logic_error("blockWithExceptionTask threw");
+                    }
+                }
+                catch (const std::logic_error&) {
+                }
+
+                boost::this_thread::interruption_point();
+            }
+        }
+
+        static void simpleTask(FailPoint* failPoint) {
+            while (true) {
+                if (MONGO_FAIL_POINT((*failPoint))) {
+                    const mongo::BSONObj& data = failPoint->getData();
+                    ASSERT_EQUALS(44, data["a"].numberInt());
+                }
+
+                boost::this_thread::interruption_point();
+            }
+        }
+
+        static void flipTask(FailPoint* failPoint) {
+            while (true) {
+                if(failPoint->shouldFail()) {
+                    failPoint->setMode(FailPoint::off, 0);
+                }
+                else {
+                    failPoint->setMode(FailPoint::alwaysOn, 0, BSON("a" << 44));
+                }
+
+                boost::this_thread::interruption_point();
+            }
+        }
+
+        FailPoint _fp;
+        boost::thread_group* _tasks;
+    };
+
+    TEST_F(FailPointStress, Basic) {
+        startTest();
+        mongo::sleepsecs(120);
+        stopTest();
+    }
+}
-- 
cgit v1.2.1