diff options
author | Scott Hernandez <scotthernandez@gmail.com> | 2015-04-27 14:51:06 -0400 |
---|---|---|
committer | Scott Hernandez <scotthernandez@gmail.com> | 2015-06-01 13:08:46 -0400 |
commit | 2d1b0186696872e3c15b61fb354cb09f9d34c563 (patch) | |
tree | c6b21f1d91108130e54baefb67f3537c8975dbed | |
parent | b4f2474724b62ff50ccbb3e4e3de6f17cdf83eb4 (diff) | |
download | mongo-2d1b0186696872e3c15b61fb354cb09f9d34c563.tar.gz |
SERVER-17807: reset state before starting initial sync attempt
(cherry picked from commit 2b0ad1da1d85b034bb4d958707755a74b2c54e96)
-rw-r--r-- | jstests/noPassthrough/initial_sync_cloner_dups.js | 15 | ||||
-rw-r--r-- | src/mongo/db/repl/bgsync.cpp | 9 | ||||
-rw-r--r-- | src/mongo/db/repl/bgsync.h | 6 | ||||
-rw-r--r-- | src/mongo/db/repl/rs_initialsync.cpp | 23 |
4 files changed, 49 insertions, 4 deletions
diff --git a/jstests/noPassthrough/initial_sync_cloner_dups.js b/jstests/noPassthrough/initial_sync_cloner_dups.js index 050b14c4f39..6fec3b49ca9 100644 --- a/jstests/noPassthrough/initial_sync_cloner_dups.js +++ b/jstests/noPassthrough/initial_sync_cloner_dups.js @@ -76,6 +76,14 @@ var secondary = replTest.add({}); replTest.reInitiate(); secondary.setSlaveOk(); +// This fail point will cause the first initial sync to fail, and leave an op in the buffer to +// verify the fix from SERVER-17807 +print("=================== failpoint enabled =============="); +printjson(assert.commandWorked(secondary.getDB("admin").adminCommand( + { configureFailPoint: 'failInitSyncWithBufferedEntriesLeft', + mode: {times: 1}} ))); +printjson(assert.commandWorked(secondary.getDB("admin").adminCommand( { resync:true } ))); + // NOTE: This is here to prevent false negatives, but it is racy and dependent on magic numbers. // Removed the assertion because it was too flaky. Printing a warning instead (dan) jsTestLog("making sure we dropped some dups"); @@ -87,10 +95,13 @@ if (!droppedDups) { jsTestLog("Warning: Test did not trigger duplicate documents, this run will be a false negative"); } -jsTestLog("stoping writes and waiting for replica set to coalesce") +jsTestLog("stopping writes and waiting for replica set to coalesce") primary.getDB('test').stop.insert({}); worker.join(); -replTest.awaitReplication(); // Make sure all writes have hit secondary. 
+//make sure all secondaries are caught up, after init sync +reconnect(secondary.getDB("test")); +replTest.awaitSecondaryNodes(); +replTest.awaitReplication(2*60*1000); jsTestLog("check that secondary has correct counts"); var secondaryColl = secondary.getDB('test').getCollection('cloner'); diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp index 54f33578a32..6e26475642f 100644 --- a/src/mongo/db/repl/bgsync.cpp +++ b/src/mongo/db/repl/bgsync.cpp @@ -481,6 +481,10 @@ namespace { return _lastAppliedHash; } + void BackgroundSync::clearBuffer() { + _buffer.clear(); + } + void BackgroundSync::setLastAppliedHash(long long newHash) { boost::lock_guard<boost::mutex> lck(_mutex); _lastAppliedHash = newHash; @@ -533,6 +537,11 @@ namespace { _initialSyncRequestedFlag = value; } + void BackgroundSync::pushTestOpToBuffer(const BSONObj& op) { + boost::lock_guard<boost::mutex> lock(_mutex); + _buffer.push(op); + } + } // namespace repl } // namespace mongo diff --git a/src/mongo/db/repl/bgsync.h b/src/mongo/db/repl/bgsync.h index 8a36e572abe..2952879f246 100644 --- a/src/mongo/db/repl/bgsync.h +++ b/src/mongo/db/repl/bgsync.h @@ -108,6 +108,9 @@ namespace repl { void setLastAppliedHash(long long oldH); void loadLastAppliedHash(OperationContext* txn); + // Clears any fetched and buffered oplog entries. 
+ void clearBuffer(); + bool getInitialSyncRequestedFlag(); void setInitialSyncRequestedFlag(bool value); @@ -119,6 +122,9 @@ namespace repl { return _indexPrefetchConfig; } + + // Testing related stuff + void pushTestOpToBuffer(const BSONObj& op); private: static BackgroundSync *s_instance; // protects creation of s_instance diff --git a/src/mongo/db/repl/rs_initialsync.cpp b/src/mongo/db/repl/rs_initialsync.cpp index 6f3ab4bb417..86265d0e601 100644 --- a/src/mongo/db/repl/rs_initialsync.cpp +++ b/src/mongo/db/repl/rs_initialsync.cpp @@ -49,6 +49,7 @@ #include "mongo/db/repl/oplogreader.h" #include "mongo/db/repl/replication_coordinator_global.h" #include "mongo/util/exit.h" +#include "mongo/util/fail_point_service.h" #include "mongo/util/log.h" #include "mongo/util/mongoutils/str.h" @@ -59,6 +60,9 @@ namespace { using std::list; using std::string; + // Failpoint which fails initial sync and leaves one oplog entry in the buffer. + MONGO_FP_DECLARE(failInitSyncWithBufferedEntriesLeft); + /** * Truncates the oplog (removes any documents) and resets internal variables that were * originally initialized or affected by using values from the oplog at startup time. These @@ -69,6 +73,9 @@ namespace { void truncateAndResetOplog(OperationContext* txn, ReplicationCoordinator* replCoord, BackgroundSync* bgsync) { + // Clear minvalid + setMinValid(txn, OpTime()); + AutoGetDb autoDb(txn, "local", MODE_X); massert(28585, "no local database found", autoDb.getDb()); invariant(txn->lockState()->isCollectionLockedForMode(rsoplog, MODE_X)); @@ -81,6 +88,9 @@ namespace { // because the bgsync thread, while running, may update the blacklist. replCoord->resetMyLastOptime(); bgsync->stop(); + bgsync->setLastAppliedHash(0); + bgsync->clearBuffer(); + replCoord->clearSyncSourceBlacklist(); // Truncate the oplog in case there was a prior initial sync that failed. 
@@ -205,6 +215,15 @@ namespace { OplogReader* r) { const OpTime startOpTime = getGlobalReplicationCoordinator()->getMyLastOptime(); BSONObj lastOp; + + // If the fail point is set, exit failing. + if (MONGO_FAIL_POINT(failInitSyncWithBufferedEntriesLeft)) { + log() << "adding fake oplog entry to buffer."; + BackgroundSync::get()->pushTestOpToBuffer( + BSON("ts" << startOpTime << "v" << 1 << "op" << "n")); + return false; + } + try { // It may have been a long time since we last used this connection to // query the oplog, depending on the size of the databases we needed to clone. @@ -250,10 +269,9 @@ namespace { } } catch (const DBException&) { - log() << "replSet initial sync failed during oplog application phase, and will retry"; - getGlobalReplicationCoordinator()->resetMyLastOptime(); BackgroundSync::get()->setLastAppliedHash(0); + warning() << "initial sync failed during oplog application phase, and will retry"; sleepsecs(5); return false; @@ -326,6 +344,7 @@ namespace { OperationContextImpl txn; ReplicationCoordinator* replCoord(getGlobalReplicationCoordinator()); + // reset state for initial sync truncateAndResetOplog(&txn, replCoord, bgsync); OplogReader r; |