author    Scott Hernandez <scotthernandez@gmail.com>  2015-04-27 14:51:06 -0400
committer Scott Hernandez <scotthernandez@gmail.com>  2015-06-01 13:08:46 -0400
commit    2d1b0186696872e3c15b61fb354cb09f9d34c563 (patch)
tree      c6b21f1d91108130e54baefb67f3537c8975dbed
parent    b4f2474724b62ff50ccbb3e4e3de6f17cdf83eb4 (diff)
download  mongo-2d1b0186696872e3c15b61fb354cb09f9d34c563.tar.gz
SERVER-17807: reset state before starting initial sync attempt
(cherry picked from commit 2b0ad1da1d85b034bb4d958707755a74b2c54e96)
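In outline, the bug was that a failed initial sync attempt could leave state behind (buffered oplog entries, minValid, the last-applied hash) that the next attempt would then trust. The fix resets all of that before each attempt. A minimal sketch of the retry shape, with illustrative stand-in functions rather than the actual server code:

    #include <iostream>

    // Hypothetical stand-ins for the steps this patch touches.
    void resetSyncState() {
        // clear buffered oplog entries, minValid, last-applied hash, ...
        std::cout << "state reset" << std::endl;
    }

    bool attemptInitialSync() {
        return true;  // pretend this attempt succeeded
    }

    int main() {
        // Reset *before* every attempt, so leftovers from a failed
        // attempt can never be applied by the next one.
        bool done = false;
        while (!done) {
            resetSyncState();
            done = attemptInitialSync();
        }
        return 0;
    }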
-rw-r--r--  jstests/noPassthrough/initial_sync_cloner_dups.js  | 15
-rw-r--r--  src/mongo/db/repl/bgsync.cpp                       |  9
-rw-r--r--  src/mongo/db/repl/bgsync.h                         |  6
-rw-r--r--  src/mongo/db/repl/rs_initialsync.cpp               | 23
4 files changed, 49 insertions, 4 deletions
diff --git a/jstests/noPassthrough/initial_sync_cloner_dups.js b/jstests/noPassthrough/initial_sync_cloner_dups.js
index 050b14c4f39..6fec3b49ca9 100644
--- a/jstests/noPassthrough/initial_sync_cloner_dups.js
+++ b/jstests/noPassthrough/initial_sync_cloner_dups.js
@@ -76,6 +76,14 @@ var secondary = replTest.add({});
replTest.reInitiate();
secondary.setSlaveOk();
+// This fail point will cause the first initial sync to fail, and leave an op in the buffer to
+// verify the fix from SERVER-17807
+print("=================== failpoint enabled ==============");
+printjson(assert.commandWorked(secondary.getDB("admin").adminCommand(
+ { configureFailPoint: 'failInitSyncWithBufferedEntriesLeft',
+ mode: {times: 1}} )));
+printjson(assert.commandWorked(secondary.getDB("admin").adminCommand( { resync:true } )));
+
// NOTE: This is here to prevent false negatives, but it is racy and dependent on magic numbers.
// Removed the assertion because it was too flaky. Printing a warning instead (dan)
jsTestLog("making sure we dropped some dups");
@@ -87,10 +95,13 @@ if (!droppedDups) {
jsTestLog("Warning: Test did not trigger duplicate documents, this run will be a false negative");
}
-jsTestLog("stoping writes and waiting for replica set to coalesce")
+jsTestLog("stopping writes and waiting for replica set to coalesce")
primary.getDB('test').stop.insert({});
worker.join();
-replTest.awaitReplication(); // Make sure all writes have hit secondary.
+// Make sure all secondaries are caught up after initial sync.
+reconnect(secondary.getDB("test"));
+replTest.awaitSecondaryNodes();
+replTest.awaitReplication(2*60*1000);
jsTestLog("check that secondary has correct counts");
var secondaryColl = secondary.getDB('test').getCollection('cloner');
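The test relies on failpoint mode {times: 1}: the failpoint fires on exactly the first check and then disarms itself, so the first initial sync attempt fails and the retry runs normally. A self-contained sketch of that once-armed behavior (an illustration, not the server's FailPoint class):

    #include <atomic>
    #include <iostream>

    // A flag armed to fire a fixed number of times, mimicking
    // configureFailPoint with mode: {times: 1}.
    std::atomic<int> timesLeft{1};

    bool shouldFail() {
        int n = timesLeft.load();
        // Fire only while armed, decrementing atomically.
        return n > 0 && timesLeft.compare_exchange_strong(n, n - 1);
    }

    int main() {
        std::cout << shouldFail() << std::endl;  // 1: first attempt fails
        std::cout << shouldFail() << std::endl;  // 0: the retry proceeds
        return 0;
    }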
diff --git a/src/mongo/db/repl/bgsync.cpp b/src/mongo/db/repl/bgsync.cpp
index 54f33578a32..6e26475642f 100644
--- a/src/mongo/db/repl/bgsync.cpp
+++ b/src/mongo/db/repl/bgsync.cpp
@@ -481,6 +481,10 @@ namespace {
return _lastAppliedHash;
}
+ void BackgroundSync::clearBuffer() {
+ _buffer.clear();
+ }
+
void BackgroundSync::setLastAppliedHash(long long newHash) {
boost::lock_guard<boost::mutex> lck(_mutex);
_lastAppliedHash = newHash;
@@ -533,6 +537,11 @@ namespace {
_initialSyncRequestedFlag = value;
}
+ void BackgroundSync::pushTestOpToBuffer(const BSONObj& op) {
+ boost::lock_guard<boost::mutex> lock(_mutex);
+ _buffer.push(op);
+ }
+
} // namespace repl
} // namespace mongo
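The two methods added here are small: clearBuffer() drops whatever a failed attempt left in the fetch buffer, and pushTestOpToBuffer() lets the failpoint path seed the buffer so the test can prove the clear actually happens. A minimal sketch of such a buffer (illustrative only; the server's _buffer is its own synchronized queue, not this class):

    #include <mutex>
    #include <queue>
    #include <string>

    class OpBuffer {
    public:
        // Drop all buffered ops; called before each initial sync attempt.
        void clear() {
            std::lock_guard<std::mutex> lk(_mutex);
            std::queue<std::string>().swap(_queue);
        }

        // Test hook: seed the buffer with an arbitrary op.
        void push(const std::string& op) {
            std::lock_guard<std::mutex> lk(_mutex);
            _queue.push(op);
        }

    private:
        std::mutex _mutex;
        std::queue<std::string> _queue;
    };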
diff --git a/src/mongo/db/repl/bgsync.h b/src/mongo/db/repl/bgsync.h
index 8a36e572abe..2952879f246 100644
--- a/src/mongo/db/repl/bgsync.h
+++ b/src/mongo/db/repl/bgsync.h
@@ -108,6 +108,9 @@ namespace repl {
void setLastAppliedHash(long long oldH);
void loadLastAppliedHash(OperationContext* txn);
+ // Clears any fetched and buffered oplog entries.
+ void clearBuffer();
+
bool getInitialSyncRequestedFlag();
void setInitialSyncRequestedFlag(bool value);
@@ -119,6 +122,9 @@ namespace repl {
return _indexPrefetchConfig;
}
+
+ // Testing-related methods
+ void pushTestOpToBuffer(const BSONObj& op);
private:
static BackgroundSync *s_instance;
// protects creation of s_instance
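The rs_initialsync.cpp changes below use the server's failpoint facility, which the patch itself demonstrates: declare once with MONGO_FP_DECLARE, check with MONGO_FAIL_POINT, and toggle at runtime via the configureFailPoint admin command (as the jstest above does). The general pattern, with an illustrative failpoint name:

    #include "mongo/util/fail_point_service.h"

    namespace mongo {
    namespace repl {

        // Declared once; off by default until a test enables it.
        MONGO_FP_DECLARE(myExampleFailPoint);

        bool doStep() {
            // True only while the failpoint is active.
            if (MONGO_FAIL_POINT(myExampleFailPoint)) {
                return false;  // take the induced failure path
            }
            return true;
        }

    } // namespace repl
    } // namespace mongo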
diff --git a/src/mongo/db/repl/rs_initialsync.cpp b/src/mongo/db/repl/rs_initialsync.cpp
index 6f3ab4bb417..86265d0e601 100644
--- a/src/mongo/db/repl/rs_initialsync.cpp
+++ b/src/mongo/db/repl/rs_initialsync.cpp
@@ -49,6 +49,7 @@
#include "mongo/db/repl/oplogreader.h"
#include "mongo/db/repl/replication_coordinator_global.h"
#include "mongo/util/exit.h"
+#include "mongo/util/fail_point_service.h"
#include "mongo/util/log.h"
#include "mongo/util/mongoutils/str.h"
@@ -59,6 +60,9 @@ namespace {
using std::list;
using std::string;
+ // Failpoint which fails initial sync and leaves an oplog entry in the buffer.
+ MONGO_FP_DECLARE(failInitSyncWithBufferedEntriesLeft);
+
/**
* Truncates the oplog (removes any documents) and resets internal variables that were
* originally initialized or affected by using values from the oplog at startup time. These
@@ -69,6 +73,9 @@ namespace {
void truncateAndResetOplog(OperationContext* txn,
ReplicationCoordinator* replCoord,
BackgroundSync* bgsync) {
+ // Clear minvalid
+ setMinValid(txn, OpTime());
+
AutoGetDb autoDb(txn, "local", MODE_X);
massert(28585, "no local database found", autoDb.getDb());
invariant(txn->lockState()->isCollectionLockedForMode(rsoplog, MODE_X));
@@ -81,6 +88,9 @@ namespace {
// because the bgsync thread, while running, may update the blacklist.
replCoord->resetMyLastOptime();
bgsync->stop();
+ bgsync->setLastAppliedHash(0);
+ bgsync->clearBuffer();
+
replCoord->clearSyncSourceBlacklist();
// Truncate the oplog in case there was a prior initial sync that failed.
@@ -205,6 +215,15 @@ namespace {
OplogReader* r) {
const OpTime startOpTime = getGlobalReplicationCoordinator()->getMyLastOptime();
BSONObj lastOp;
+
+ // If the failpoint is set, fail this attempt.
+ if (MONGO_FAIL_POINT(failInitSyncWithBufferedEntriesLeft)) {
+ log() << "adding fake oplog entry to buffer.";
+ BackgroundSync::get()->pushTestOpToBuffer(
+ BSON("ts" << startOpTime << "v" << 1 << "op" << "n"));
+ return false;
+ }
+
try {
// It may have been a long time since we last used this connection to
// query the oplog, depending on the size of the databases we needed to clone.
@@ -250,10 +269,9 @@ namespace {
}
}
catch (const DBException&) {
- log() << "replSet initial sync failed during oplog application phase, and will retry";
-
getGlobalReplicationCoordinator()->resetMyLastOptime();
BackgroundSync::get()->setLastAppliedHash(0);
+ warning() << "initial sync failed during oplog application phase, and will retry";
sleepsecs(5);
return false;
@@ -326,6 +344,7 @@ namespace {
OperationContextImpl txn;
ReplicationCoordinator* replCoord(getGlobalReplicationCoordinator());
+ // reset state for initial sync
truncateAndResetOplog(&txn, replCoord, bgsync);
OplogReader r;