summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Eric Milkie <milkie@10gen.com> 2013-05-01 15:33:08 -0400
committer Eric Milkie <milkie@10gen.com> 2013-05-01 15:33:08 -0400
commit 16a08039c41107970e226e6a15e3eb683f4dae20 (patch)
tree d387a1ab27560725f220d536cda3af92cd6d2f88
parent 08e4b8cebaa96c023830e5460925f549c13804af (diff)
download mongo-16a08039c41107970e226e6a15e3eb683f4dae20.tar.gz
SERVER-9528 retry network failures during document copy at initial sync time
-rw-r--r-- jstests/replsets/replset9.js 23
-rw-r--r-- src/mongo/db/oplog.cpp 54
2 files changed, 64 insertions, 13 deletions
diff --git a/jstests/replsets/replset9.js b/jstests/replsets/replset9.js
index c1d73c9fce9..a1dc59f0459 100644
--- a/jstests/replsets/replset9.js
+++ b/jstests/replsets/replset9.js
@@ -5,7 +5,7 @@ var rt = new ReplSetTest( { name : "replset9tests" , nodes: 1, oplogSize: 100 }
var nodes = rt.startSet();
rt.initiate();
var master = rt.getMaster();
-var bigstring = Array(1000).toString();
+var bigstring = Array(5000).toString();
var md = master.getDB( 'd' );
var mdc = md[ 'c' ];
@@ -32,7 +32,26 @@ md.getLastError();
// add a secondary; start cloning
var slave = rt.add();
-rt.reInitiate();
+(function reinitiate() {
+ var master = rt.nodes[0];
+ var c = master.getDB("local")['system.replset'].findOne();
+ var config = rt.getReplSetConfig();
+ config.version = c.version + 1;
+ var admin = master.getDB("admin");
+ var cmd = {};
+ var cmdKey = 'replSetReconfig';
+ var timeout = timeout || 30000;
+ cmd[cmdKey] = config;
+ printjson(cmd);
+
+ jsTest.attempt({context:rt, timeout: timeout, desc: "reinitiate replica set"}, function() {
+ var result = admin.runCommand(cmd);
+ printjson(result);
+ return result['ok'] == 1;
+ });
+})();
+
+
print ("initiation complete!");
var sc = slave.getDB( 'd' )[ 'c' ];
slave.setSlaveOk();
diff --git a/src/mongo/db/oplog.cpp b/src/mongo/db/oplog.cpp
index e1e6e370d4f..870601c9b4b 100644
--- a/src/mongo/db/oplog.cpp
+++ b/src/mongo/db/oplog.cpp
@@ -695,19 +695,51 @@ namespace mongo {
return BSONObj();
}
- uassert(15916, str::stream() << "Can no longer connect to initial sync source: " << hn, missingObjReader.connect(hn));
+ const int retryMax = 3;
+ for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
+ if (retryCount != 1) {
+ // if we are retrying, sleep a bit to let the network possibly recover
+ sleepsecs(retryCount * retryCount);
+ }
+ try {
+ bool ok = missingObjReader.connect(hn);
+ if (!ok) {
+ warning() << "network problem detected while connecting to the "
+ << "sync source, attempt " << retryCount << " of "
+ << retryMax << endl;
+ continue; // try again
+ }
+ }
+ catch (const SocketException& exc) {
+ warning() << "network problem detected while connecting to the "
+ << "sync source, attempt " << retryCount << " of "
+ << retryMax << endl;
+ continue; // try again
+ }
- // might be more than just _id in the update criteria
- BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
- BSONObj missingObj;
- try {
- missingObj = missingObjReader.findOne(ns, query);
- } catch(DBException& e) {
- log() << "replication assertion fetching missing object: " << e.what() << endl;
- throw;
- }
+ // might be more than just _id in the update criteria
+ BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
+ BSONObj missingObj;
+ try {
+ missingObj = missingObjReader.findOne(ns, query);
+ }
+ catch (const SocketException& exc) {
+ warning() << "network problem detected while fetching a missing document from the "
+ << "sync source, attempt " << retryCount << " of "
+ << retryMax << endl;
+ continue; // try again
+ }
+ catch (DBException& e) {
+ log() << "replication assertion fetching missing object: " << e.what() << endl;
+ throw;
+ }
- return missingObj;
+ // success!
+ return missingObj;
+ }
+ // retry count exceeded
+ msgasserted(15916,
+ str::stream() << "Can no longer connect to initial sync source: " << hn);
}
bool Sync::shouldRetry(const BSONObj& o) {