diff options
author | Eric Milkie <milkie@10gen.com> | 2013-05-01 15:33:08 -0400 |
---|---|---|
committer | Eric Milkie <milkie@10gen.com> | 2013-05-01 15:33:08 -0400 |
commit | 16a08039c41107970e226e6a15e3eb683f4dae20 (patch) | |
tree | d387a1ab27560725f220d536cda3af92cd6d2f88 | |
parent | 08e4b8cebaa96c023830e5460925f549c13804af (diff) | |
download | mongo-16a08039c41107970e226e6a15e3eb683f4dae20.tar.gz |
SERVER-9528 retry network failures during document copy at initial sync time
-rw-r--r-- | jstests/replsets/replset9.js | 23 | ||||
-rw-r--r-- | src/mongo/db/oplog.cpp | 54 |
2 files changed, 64 insertions, 13 deletions
```diff
diff --git a/jstests/replsets/replset9.js b/jstests/replsets/replset9.js
index c1d73c9fce9..a1dc59f0459 100644
--- a/jstests/replsets/replset9.js
+++ b/jstests/replsets/replset9.js
@@ -5,7 +5,7 @@ var rt = new ReplSetTest( { name : "replset9tests" , nodes: 1, oplogSize: 100 }
 var nodes = rt.startSet();
 rt.initiate();
 var master = rt.getMaster();
-var bigstring = Array(1000).toString();
+var bigstring = Array(5000).toString();
 var md = master.getDB( 'd' );
 var mdc = md[ 'c' ];
 
@@ -32,7 +32,26 @@ md.getLastError();
 
 // add a secondary; start cloning
 var slave = rt.add();
-rt.reInitiate();
+(function reinitiate() {
+    var master = rt.nodes[0];
+    var c = master.getDB("local")['system.replset'].findOne();
+    var config = rt.getReplSetConfig();
+    config.version = c.version + 1;
+    var admin = master.getDB("admin");
+    var cmd = {};
+    var cmdKey = 'replSetReconfig';
+    var timeout = timeout || 30000;
+    cmd[cmdKey] = config;
+    printjson(cmd);
+
+    jsTest.attempt({context:rt, timeout: timeout, desc: "reinitiate replica set"}, function() {
+        var result = admin.runCommand(cmd);
+        printjson(result);
+        return result['ok'] == 1;
+    });
+})();
+
+
 print ("initiation complete!");
 var sc = slave.getDB( 'd' )[ 'c' ];
 slave.setSlaveOk();
diff --git a/src/mongo/db/oplog.cpp b/src/mongo/db/oplog.cpp
index e1e6e370d4f..870601c9b4b 100644
--- a/src/mongo/db/oplog.cpp
+++ b/src/mongo/db/oplog.cpp
@@ -695,19 +695,51 @@ namespace mongo {
             return BSONObj();
         }
 
-        uassert(15916, str::stream() << "Can no longer connect to initial sync source: " << hn, missingObjReader.connect(hn));
+        const int retryMax = 3;
+        for (int retryCount = 1; retryCount <= retryMax; ++retryCount) {
+            if (retryCount != 1) {
+                // if we are retrying, sleep a bit to let the network possibly recover
+                sleepsecs(retryCount * retryCount);
+            }
+            try {
+                bool ok = missingObjReader.connect(hn);
+                if (!ok) {
+                    warning() << "network problem detected while connecting to the "
+                              << "sync source, attempt " << retryCount << " of "
+                              << retryMax << endl;
+                    continue; // try again
+                }
+            }
+            catch (const SocketException& exc) {
+                warning() << "network problem detected while connecting to the "
+                          << "sync source, attempt " << retryCount << " of "
+                          << retryMax << endl;
+                continue; // try again
+            }
 
-        // might be more than just _id in the update criteria
-        BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
-        BSONObj missingObj;
-        try {
-            missingObj = missingObjReader.findOne(ns, query);
-        } catch(DBException& e) {
-            log() << "replication assertion fetching missing object: " << e.what() << endl;
-            throw;
-        }
+            // might be more than just _id in the update criteria
+            BSONObj query = BSONObjBuilder().append(o.getObjectField("o2")["_id"]).obj();
+            BSONObj missingObj;
+            try {
+                missingObj = missingObjReader.findOne(ns, query);
+            }
+            catch (const SocketException& exc) {
+                warning() << "network problem detected while fetching a missing document from the "
+                          << "sync source, attempt " << retryCount << " of "
+                          << retryMax << endl;
+                continue; // try again
+            }
+            catch (DBException& e) {
+                log() << "replication assertion fetching missing object: " << e.what() << endl;
+                throw;
+            }
 
-        return missingObj;
+            // success!
+            return missingObj;
+        }
+        // retry count exceeded
+        msgasserted(15916,
+                    str::stream() << "Can no longer connect to initial sync source: " << hn);
     }
 
     bool Sync::shouldRetry(const BSONObj& o) {
```