author     Kaloian Manassiev <kaloian.manassiev@mongodb.com>  2015-12-30 17:01:04 -0500
committer  Kaloian Manassiev <kaloian.manassiev@mongodb.com>  2015-12-30 17:09:27 -0500
commit     715e9e1cdc618dad480a7a1a73458daf6ea9ce0f
tree       95ee80f3e51d3218647bc6fb013dec7f3f735297
parent     5d2d6e209acd862324612c7f9c41d65940f8dcba
Revert "SERVER-22027 Sharding should not retry killed operations"
This reverts commit 5d2d6e209acd862324612c7f9c41d65940f8dcba.
-rw-r--r--  jstests/sharding/auth_add_shard.js                                             |  86
-rw-r--r--  jstests/sharding/features3.js                                                  |  32
-rw-r--r--  jstests/sharding/mongos_rs_shard_failure_tolerance.js                          |   9
-rw-r--r--  src/mongo/base/error_codes.err                                                 |   6
-rw-r--r--  src/mongo/db/operation_context.cpp                                             |   9
-rw-r--r--  src/mongo/db/operation_context.h                                               |  40
-rw-r--r--  src/mongo/db/operation_context_impl.cpp                                        |   6
-rw-r--r--  src/mongo/db/repl/replication_coordinator_external_state_impl.cpp              |   4
-rw-r--r--  src/mongo/db/service_context.h                                                 |   2
-rw-r--r--  src/mongo/db/service_context_d.cpp                                             |  38
-rw-r--r--  src/mongo/db/service_context_d.h                                               |  15
-rw-r--r--  src/mongo/db/service_context_noop.cpp                                          |   3
-rw-r--r--  src/mongo/db/service_context_noop.h                                            |   2
-rw-r--r--  src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp                    |   2
-rw-r--r--  src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp  |   4
-rw-r--r--  src/mongo/s/client/shard_registry.cpp                                          |   9
-rw-r--r--  src/mongo/s/query/async_results_merger.cpp                                     |  21
17 files changed, 153 insertions(+), 135 deletions(-)
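
Note (illustration only, not part of the patch): after this revert, OperationContext goes back to a plain boolean kill flag. markKilled() takes no error code, isKillPending() only reports whether the operation was killed, and checkForInterruptNoAssert() surfaces every kill as ErrorCodes::Interrupted; the kill-code variant (markKilled(ErrorCodes::Error) / getKillStatus()) removed below was introduced by the reverted commit. A standalone C++ sketch of that shape, using stand-in types (OperationContextSketch, Error, and Status here are illustrative, not MongoDB source):

// Standalone illustration (not MongoDB source): a stand-in for the boolean
// kill flag that this revert restores on OperationContext.
#include <atomic>
#include <cassert>
#include <iostream>
#include <string>

// Minimal stand-ins for ErrorCodes/Status, just enough to mirror the API shape.
enum class Error { OK, Interrupted };

struct Status {
    Error code;
    std::string reason;
    bool isOK() const { return code == Error::OK; }
};

class OperationContextSketch {
public:
    // After the revert, markKilled() takes no kill code; it only sets a flag.
    void markKilled() {
        _killPending.store(1);
    }

    // Reports whether the operation was killed; the specific reason is no
    // longer recorded on the operation context.
    bool isKillPending() const {
        return _killPending.load(std::memory_order_relaxed);
    }

    // Mirrors checkForInterruptNoAssert(): every kill surfaces as Interrupted.
    Status checkForInterruptNoAssert() const {
        if (isKillPending()) {
            return {Error::Interrupted, "operation was interrupted"};
        }
        return {Error::OK, ""};
    }

private:
    std::atomic<int> _killPending{0};
};

int main() {
    OperationContextSketch opCtx;
    assert(opCtx.checkForInterruptNoAssert().isOK());

    opCtx.markKilled();  // e.g. killOp, or replica set step-down killing user operations
    Status s = opCtx.checkForInterruptNoAssert();
    assert(!s.isOK());
    std::cout << s.reason << std::endl;  // "operation was interrupted"
    return 0;
}
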
diff --git a/jstests/sharding/auth_add_shard.js b/jstests/sharding/auth_add_shard.js
index ef2e5dfa760..a20f34034cd 100644
--- a/jstests/sharding/auth_add_shard.js
+++ b/jstests/sharding/auth_add_shard.js
@@ -1,8 +1,8 @@
-// SERVER-5124
-// The puporse of this test is to test authentication when adding/removing a shard. The test sets
-// up a sharded system, then adds/removes a shard.
-(function() {
-'use strict';
+
+/* SERVER-5124
+ * The puporse of this test is to test authentication when adding/removing a shard
+ * The test sets up a sharded system, then adds/remove a shard.
+ */
// login method to login into the database
function login(userObj) {
@@ -11,89 +11,89 @@ function login(userObj) {
}
// admin user object
-var adminUser = { db: "admin", username: "foo", password: "bar" };
+adminUser = { db : "admin", username : "foo", password : "bar" };
//set up a 2 shard cluster with keyfile
-var st = new ShardingTest({ name: "auth_add_shard1", shards: 1,
- mongos: 1, keyFile: "jstests/libs/key1" });
+var st = new ShardingTest( { name : "auth_add_shard1", shards : 1,
+ mongos : 1, keyFile : "jstests/libs/key1" } )
-var mongos = st.s0;
-var admin = mongos.getDB("admin");
+var mongos = st.s0
+var admin = mongos.getDB("admin")
print("1 shard system setup");
//add the admin user
print("adding user");
-mongos.getDB(adminUser.db).createUser({ user: adminUser.username,
- pwd: adminUser.password,
- roles: jsTest.adminUserRoles});
+mongos.getDB(adminUser.db).createUser({user: adminUser.username, pwd: adminUser.password, roles: jsTest.adminUserRoles});
//login as admin user
login(adminUser);
-assert.eq(1, st.config.shards.count() , "initial server count wrong");
+st.stopBalancer();
+assert.eq( 1, st.config.shards.count() , "initial server count wrong" );
//start a mongod with NO keyfile
var conn = MongoRunner.runMongod({});
-print(conn);
+print (conn);
// --------------- Test 1 --------------------
-// Add shard to the existing cluster (should fail because it was added without a keyfile)
-printjson(assert.commandFailed(admin.runCommand({ addShard: conn.host })));
-
+// Add shard to the existing cluster
+var result = admin.runCommand( {addShard : conn.host} );
+printjson(result);
+// make sure the shard wasn't added
+assert.eq(result.ok, 0, "added shard without keyfile");
// stop mongod
-MongoRunner.stopMongod(conn);
+MongoRunner.stopMongod( conn );
//--------------- Test 2 --------------------
//start mongod again, this time with keyfile
-var conn = MongoRunner.runMongod({keyFile: "jstests/libs/key1"});
+var conn = MongoRunner.runMongod( {keyFile : "jstests/libs/key1"} );
//try adding the new shard
-printjson(assert.commandWorked(admin.runCommand({ addShard: conn.host })));
+var result = admin.runCommand( {addShard : conn.host} );
+printjson(result);
+//make sure the shard was added successfully
+assert.eq(result.ok, 1, "failed to add shard with keyfile");
//Add some data
var db = mongos.getDB("foo");
-var collA = mongos.getCollection("foo.bar");
+var collA = mongos.getCollection("foo.bar")
// enable sharding on a collection
-assert.commandWorked(admin.runCommand({ enableSharding: "" + collA.getDB() }));
-st.ensurePrimaryShard("foo", "shard0000");
+printjson( admin.runCommand( { enableSharding : "" + collA.getDB() } ) )
+printjson( admin.runCommand( { movePrimary : "foo", to : "shard0000" } ) );
-assert.commandWorked(admin.runCommand({ shardCollection: "" + collA, key: { _id: 1 } }));
+admin.runCommand( { shardCollection : "" + collA, key : { _id : 1 } } )
// add data to the sharded collection
-for (var i = 0; i < 4; i++) {
- db.bar.save({ _id: i });
- assert.commandWorked(admin.runCommand({ split: "" + collA, middle: { _id: i } }));
+for (i=0; i<4; i++) {
+ db.bar.save({_id:i});
+ printjson(admin.runCommand( { split : "" + collA, middle : { _id : i } }) )
}
-
// move a chunk
-assert.commandWorked(admin.runCommand({ moveChunk: "foo.bar", find: { _id: 1 }, to: "shard0001" }));
+printjson( admin.runCommand( { moveChunk : "foo.bar", find : { _id : 1 }, to : "shard0001" }) )
//verify the chunk was moved
-admin.runCommand({ flushRouterConfig: 1 });
-
-var config = mongos.getDB("config");
-config.printShardingStatus(true);
+admin.runCommand( { flushRouterConfig : 1 } )
+var config = mongos.getDB("config")
+config.printShardingStatus(true)
// start balancer before removing the shard
st.startBalancer();
//--------------- Test 3 --------------------
// now drain the shard
-printjson(assert.commandWorked(admin.runCommand({removeShard: conn.host})));
+var result = admin.runCommand( {removeShard : conn.host} );
+printjson(result);
+assert.eq(result.ok, 1, "failed to start draining shard");
// give it some time to drain
assert.soon(function() {
- var result = admin.runCommand({removeShard: conn.host});
+ var result = admin.runCommand( {removeShard : conn.host} );
printjson(result);
-
- return result.ok && result.state == "completed";
+ return result.ok && result.state == "completed"
}, "failed to drain shard completely", 5 * 60 * 1000)
-assert.eq(1, st.config.shards.count() , "removed server still appears in count");
-
-MongoRunner.stopMongod(conn);
+assert.eq( 1, st.config.shards.count() , "removed server still appears in count" );
+MongoRunner.stopMongod( conn );
st.stop();
-
-})();
diff --git a/jstests/sharding/features3.js b/jstests/sharding/features3.js
index 5e256676c73..84c857e644c 100644
--- a/jstests/sharding/features3.js
+++ b/jstests/sharding/features3.js
@@ -29,9 +29,8 @@ s.startBalancer();
// insert 10k small documents into the sharded collection
var bulk = dbForTest.foo.initializeUnorderedBulkOp();
-for (var i = 0; i < numDocs; i++) {
+for (i = 0; i < numDocs; i++)
bulk.insert({ _id: i });
-}
assert.writeOK(bulk.execute());
var x = dbForTest.foo.stats();
@@ -54,9 +53,12 @@ assert(!x.sharded, "XXX3: " + tojson(x));
// fork shell and start querying the data
var start = new Date();
+// TODO: Still potential problem when our sampling of current ops misses when $where is active -
+// solution is to increase sleep time
+var whereKillSleepTime = 10000;
var parallelCommand =
"db.foo.find(function() { " +
- " sleep(1000); " +
+ " sleep( " + whereKillSleepTime + " ); " +
" return false; " +
"}).itcount(); ";
@@ -66,22 +68,17 @@ var awaitShell = startParallelShell(parallelCommand, s.s.port);
print("done forking shell at: " + Date());
// Get all current $where operations
-function getInProgWhereOps() {
+function getMine(printInprog) {
var inprog = dbForTest.currentOp().inprog;
+ if (printInprog)
+ printjson(inprog);
// Find all the where queries
var myProcs = [];
- inprog.forEach(function(op) {
- if (op.query && op.query.filter && op.query.filter.$where) {
- myProcs.push(op);
+ for (var x = 0; x < inprog.length; x++) {
+ if (inprog[x].query && inprog[x].query.filter && inprog[x].query.filter.$where) {
+ myProcs.push(inprog[x]);
}
- });
-
- if (myProcs.length == 0) {
- print('No $where operations found: ' + tojson(inprog));
- }
- else {
- print('Found ' + myProcs.length + ' $where operations: ' + tojson(myProcs));
}
return myProcs;
@@ -89,11 +86,16 @@ function getInProgWhereOps() {
var curOpState = 0; // 0 = not found, 1 = killed
var killTime = null;
+var i = 0;
var mine;
assert.soon(function() {
// Get all the current operations
- mine = getInProgWhereOps();
+ mine = getMine(true); // SERVER-8794: print all operations
+
+ // get curren tops, but only print out operations before we see a $where op has started
+ // mine = getMine(curOpState == 0 && i > 20);
+ i++;
// Wait for the queries to start (one per shard, so 2 total)
if (curOpState == 0 && mine.length == 2) {
diff --git a/jstests/sharding/mongos_rs_shard_failure_tolerance.js b/jstests/sharding/mongos_rs_shard_failure_tolerance.js
index b5117439925..6cd99a1bbc3 100644
--- a/jstests/sharding/mongos_rs_shard_failure_tolerance.js
+++ b/jstests/sharding/mongos_rs_shard_failure_tolerance.js
@@ -10,10 +10,11 @@
// sequence), idle (connection is connected but not used before a shard change), and new
// (connection connected after shard change).
//
-(function() {
-'use strict';
-var st = new ShardingTest({ shards: 3, mongos: 1, other: { rs: true, rsOptions: { nodes: 2 } } });
+var options = {rs : true, rsOptions : { nodes : 2 }};
+
+var st = new ShardingTest({shards : 3, mongos : 1, other : options});
+st.stopBalancer();
var mongos = st.s0;
var admin = mongos.getDB( "admin" );
@@ -411,6 +412,6 @@ assert.writeError(mongosConnNew.getCollection( collUnsharded.toString() ).insert
gc(); // Clean up new connections
+jsTest.log("DONE!");
st.stop();
-})();
diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err
index 43fdefa1c50..faed0dbb14a 100644
--- a/src/mongo/base/error_codes.err
+++ b/src/mongo/base/error_codes.err
@@ -160,7 +160,6 @@ error_code("CannotGrowDocumentInCappedNamespace", 10003)
error_code("DuplicateKey", 11000)
error_code("InterruptedAtShutdown", 11600)
error_code("Interrupted", 11601)
-error_code("InterruptedDueToReplStateChange", 11602)
error_code("OutOfDiskSpace", 14031 )
error_code("KeyTooLong", 17280);
error_code("BackgroundOperationInProgressForDatabase", 12586);
@@ -174,10 +173,7 @@ error_code("PrepareConfigsFailed", 13104);
# SERVER-21428 explains the extra changes needed if error_class components change.
error_class("NetworkError", ["HostUnreachable", "HostNotFound", "NetworkTimeout"])
-error_class("Interruption", ["Interrupted",
- "InterruptedAtShutdown",
- "InterruptedDueToReplStateChange",
- "ExceededTimeLimit"])
+error_class("Interruption", ["Interrupted", "InterruptedAtShutdown", "ExceededTimeLimit"])
error_class("NotMasterError", ["NotMaster", "NotMasterNoSlaveOk"])
error_class("StaleShardingError",
["RecvStaleConfig", "SendStaleConfig", "StaleShardVersion", "StaleEpoch"])
diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp
index 1434ee5ddc7..1b79dadcb50 100644
--- a/src/mongo/db/operation_context.cpp
+++ b/src/mongo/db/operation_context.cpp
@@ -46,13 +46,12 @@ Client* OperationContext::getClient() const {
return _client;
}
-void OperationContext::markKilled(ErrorCodes::Error killCode) {
- invariant(killCode != ErrorCodes::OK);
- _killCode.compareAndSwap(ErrorCodes::OK, killCode);
+void OperationContext::markKilled() {
+ _killPending.store(1);
}
-ErrorCodes::Error OperationContext::getKillStatus() const {
- return _killCode.loadRelaxed();
+bool OperationContext::isKillPending() const {
+ return _killPending.loadRelaxed();
}
} // namespace mongo
diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h
index 3a7e3165dda..a146a8f7f14 100644
--- a/src/mongo/db/operation_context.h
+++ b/src/mongo/db/operation_context.h
@@ -77,6 +77,7 @@ public:
*/
virtual RecoveryUnit* recoveryUnit() const = 0;
+
/**
* Returns the RecoveryUnit (same return value as recoveryUnit()) but the caller takes
* ownership of the returned RecoveryUnit, and the OperationContext instance relinquishes
@@ -182,33 +183,23 @@ public:
virtual bool writesAreReplicated() const = 0;
/**
- * Marks this operation as killed so that subsequent calls to checkForInterrupt and
- * checkForInterruptNoAssert by the thread executing the operation will start returning the
- * specified error code.
+ * Marks this operation as killed.
*
- * If multiple threads kill the same operation with different codes, only the first code will
- * be preserved.
+ * Subsequent calls to checkForInterrupt and checkForInterruptNoAssert by the thread
+ * executing the operation will indicate that the operation has been killed.
*
- * May be called by any thread that has locked the Client owning this operation context.
+ * May be called by any thread that has locked the Client owning this operation context,
+ * or by the thread executing on behalf of this operation context.
*/
- void markKilled(ErrorCodes::Error killCode = ErrorCodes::Interrupted);
+ void markKilled();
/**
- * Returns the code passed to markKilled if this operation context has been killed previously
- * or ErrorCodes::OK otherwise.
+ * Returns true if markKilled has been called on this operation context.
*
- * May be called by any thread that has locked the Client owning this operation context, or
- * without lock by the thread executing on behalf of this operation context.
- */
- ErrorCodes::Error getKillStatus() const;
-
- /**
- * Shortcut method, which checks whether getKillStatus returns a non-OK value. Has the same
- * concurrency rules as getKillStatus.
+ * May be called by any thread that has locked the Client owning this operation context,
+ * or by the thread executing on behalf of this operation context.
*/
- bool isKillPending() const {
- return getKillStatus() != ErrorCodes::OK;
- }
+ bool isKillPending() const;
protected:
OperationContext(Client* client, unsigned int opId, Locker* locker);
@@ -220,14 +211,11 @@ private:
Client* const _client;
const unsigned int _opId;
- // Not owned.
+ // The lifetime of locker is managed by subclasses of OperationContext, so it is not
+ // safe to access _locker in the destructor of OperationContext.
Locker* const _locker;
- // Follows the values of ErrorCodes::Error. The default value is 0 (OK), which means the
- // operation is not killed. If killed, it will contain a specific code. This value changes only
- // once from OK to some kill code.
- AtomicWord<ErrorCodes::Error> _killCode{ErrorCodes::OK};
-
+ AtomicInt32 _killPending{0};
WriteConcernOptions _writeConcern;
};
diff --git a/src/mongo/db/operation_context_impl.cpp b/src/mongo/db/operation_context_impl.cpp
index b7733958bc2..f2c0166876f 100644
--- a/src/mongo/db/operation_context_impl.cpp
+++ b/src/mongo/db/operation_context_impl.cpp
@@ -140,7 +140,6 @@ uint64_t OperationContextImpl::getRemainingMaxTimeMicros() const {
MONGO_FP_DECLARE(checkForInterruptFail);
namespace {
-
// Helper function for checkForInterrupt fail point. Decides whether the operation currently
// being run by the given Client meet the (probabilistic) conditions for interruption as
// specified in the fail point info.
@@ -195,9 +194,8 @@ Status OperationContextImpl::checkForInterruptNoAssert() {
}
}
- const auto killStatus = getKillStatus();
- if (killStatus != ErrorCodes::OK) {
- return Status(killStatus, "operation was interrupted");
+ if (isKillPending()) {
+ return Status(ErrorCodes::Interrupted, "operation was interrupted");
}
return Status::OK();
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 52b0fee436c..c42b80a605d 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -350,8 +350,8 @@ void ReplicationCoordinatorExternalStateImpl::closeConnections() {
}
void ReplicationCoordinatorExternalStateImpl::killAllUserOperations(OperationContext* txn) {
- ServiceContext* environment = txn->getServiceContext();
- environment->killAllUserOperations(txn, ErrorCodes::InterruptedDueToReplStateChange);
+ ServiceContext* environment = getGlobalServiceContext();
+ environment->killAllUserOperations(txn);
}
void ReplicationCoordinatorExternalStateImpl::clearShardingState() {
diff --git a/src/mongo/db/service_context.h b/src/mongo/db/service_context.h
index 4e63046726f..6f9aa36a22f 100644
--- a/src/mongo/db/service_context.h
+++ b/src/mongo/db/service_context.h
@@ -284,7 +284,7 @@ public:
* Kills all operations that have a Client that is associated with an incoming user
* connection, except for the one associated with txn.
*/
- virtual void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) = 0;
+ virtual void killAllUserOperations(const OperationContext* txn) = 0;
/**
* Registers a listener to be notified each time an op is killed.
diff --git a/src/mongo/db/service_context_d.cpp b/src/mongo/db/service_context_d.cpp
index 470dcb927a5..3e5fd2d5be7 100644
--- a/src/mongo/db/service_context_d.cpp
+++ b/src/mongo/db/service_context_d.cpp
@@ -220,9 +220,21 @@ bool ServiceContextMongoD::getKillAllOperations() {
return _globalKill;
}
-void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx,
- ErrorCodes::Error killCode) {
- opCtx->markKilled(killCode);
+bool ServiceContextMongoD::_killOperationsAssociatedWithClientAndOpId_inlock(Client* client,
+ unsigned int opId) {
+ OperationContext* opCtx = client->getOperationContext();
+ if (!opCtx) {
+ return false;
+ }
+ if (opCtx->getOpID() != opId) {
+ return false;
+ }
+ _killOperation_inlock(opCtx);
+ return true;
+}
+
+void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx) {
+ opCtx->markKilled();
for (const auto listener : _killOpListeners) {
try {
@@ -236,10 +248,8 @@ void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx,
bool ServiceContextMongoD::killOperation(unsigned int opId) {
for (LockedClientsCursor cursor(this); Client* client = cursor.next();) {
stdx::lock_guard<Client> lk(*client);
-
- OperationContext* opCtx = client->getOperationContext();
- if (opCtx && opCtx->getOpID() == opId) {
- _killOperation_inlock(opCtx, ErrorCodes::Interrupted);
+ bool found = _killOperationsAssociatedWithClientAndOpId_inlock(client, opId);
+ if (found) {
return true;
}
}
@@ -247,8 +257,7 @@ bool ServiceContextMongoD::killOperation(unsigned int opId) {
return false;
}
-void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn,
- ErrorCodes::Error killCode) {
+void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn) {
for (LockedClientsCursor cursor(this); Client* client = cursor.next();) {
if (!client->isFromUserConnection()) {
// Don't kill system operations.
@@ -257,11 +266,16 @@ void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn,
stdx::lock_guard<Client> lk(*client);
OperationContext* toKill = client->getOperationContext();
+ if (!toKill) {
+ continue;
+ }
- // Don't kill ourself.
- if (toKill && toKill->getOpID() != txn->getOpID()) {
- _killOperation_inlock(toKill, killCode);
+ if (toKill->getOpID() == txn->getOpID()) {
+ // Don't kill ourself.
+ continue;
}
+
+ _killOperation_inlock(toKill);
}
}
diff --git a/src/mongo/db/service_context_d.h b/src/mongo/db/service_context_d.h
index dc834197e9e..0c560ff17f4 100644
--- a/src/mongo/db/service_context_d.h
+++ b/src/mongo/db/service_context_d.h
@@ -67,7 +67,7 @@ public:
bool killOperation(unsigned int opId) override;
- void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) override;
+ void killAllUserOperations(const OperationContext* txn) override;
void registerKillOpListener(KillOpListenerInterface* listener) override;
@@ -79,11 +79,22 @@ private:
std::unique_ptr<OperationContext> _newOpCtx(Client* client) override;
/**
+ * Kills the active operation on "client" if that operation is associated with operation id
+ * "opId".
+ *
+ * Returns true if an operation was killed.
+ *
+ * Must only be called by a thread owning both this service context's mutex and the
+ * client's.
+ */
+ bool _killOperationsAssociatedWithClientAndOpId_inlock(Client* client, unsigned int opId);
+
+ /**
* Kills the given operation.
*
* Caller must own the service context's _mutex.
*/
- void _killOperation_inlock(OperationContext* opCtx, ErrorCodes::Error killCode);
+ void _killOperation_inlock(OperationContext* opCtx);
bool _globalKill;
diff --git a/src/mongo/db/service_context_noop.cpp b/src/mongo/db/service_context_noop.cpp
index fd94b35db89..65184906442 100644
--- a/src/mongo/db/service_context_noop.cpp
+++ b/src/mongo/db/service_context_noop.cpp
@@ -79,8 +79,7 @@ bool ServiceContextNoop::killOperation(unsigned int opId) {
return false;
}
-void ServiceContextNoop::killAllUserOperations(const OperationContext* txn,
- ErrorCodes::Error killCode) {}
+void ServiceContextNoop::killAllUserOperations(const OperationContext* txn) {}
void ServiceContextNoop::registerKillOpListener(KillOpListenerInterface* listener) {}
diff --git a/src/mongo/db/service_context_noop.h b/src/mongo/db/service_context_noop.h
index 62e6a169896..0e74b61b7a2 100644
--- a/src/mongo/db/service_context_noop.h
+++ b/src/mongo/db/service_context_noop.h
@@ -49,7 +49,7 @@ public:
bool killOperation(unsigned int opId) override;
- void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) override;
+ void killAllUserOperations(const OperationContext* txn) override;
void setKillAllOperations() override;
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
index 5686476e1a3..5943cc6e255 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
@@ -389,6 +389,8 @@ StatusWith<OpTimePair<DatabaseType>> CatalogManagerReplicaSet::_fetchDatabaseMet
StatusWith<OpTimePair<CollectionType>> CatalogManagerReplicaSet::getCollection(
OperationContext* txn, const std::string& collNs) {
+ auto configShard = grid.shardRegistry()->getShard(txn, "config");
+
auto statusFind = _exhaustiveFindOnConfig(txn,
kConfigReadSelector,
NamespaceString(CollectionType::ConfigNS),
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
index 9bf44eaa09c..b21ff63961f 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
@@ -93,7 +93,7 @@ TEST_F(InsertRetryTest, RetryOnInterruptedAndNetworkErrorSuccess) {
onCommand([&](const RemoteCommandRequest& request) {
ASSERT_EQ(request.target, kTestHosts[0]);
configTargeter()->setFindHostReturnValue({kTestHosts[1]});
- return Status(ErrorCodes::InterruptedDueToReplStateChange, "Interruption");
+ return Status(ErrorCodes::Interrupted, "Interruption");
});
onCommand([&](const RemoteCommandRequest& request) {
@@ -271,7 +271,7 @@ TEST_F(UpdateRetryTest, OperationInterruptedDueToPrimaryStepDown) {
auto writeErrDetail = stdx::make_unique<WriteErrorDetail>();
writeErrDetail->setIndex(0);
- writeErrDetail->setErrCode(ErrorCodes::InterruptedDueToReplStateChange);
+ writeErrDetail->setErrCode(ErrorCodes::Interrupted);
writeErrDetail->setErrMessage("Operation interrupted");
response.addToErrDetails(writeErrDetail.release());
diff --git a/src/mongo/s/client/shard_registry.cpp b/src/mongo/s/client/shard_registry.cpp
index e5b91dda83a..55121f42209 100644
--- a/src/mongo/s/client/shard_registry.cpp
+++ b/src/mongo/s/client/shard_registry.cpp
@@ -126,7 +126,6 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kNotMasterErrors{ErrorCodes::N
const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
ErrorCodes::NotMaster,
ErrorCodes::NotMasterNoSlaveOk,
- ErrorCodes::NotMasterOrSecondary,
// If write concern failed to be satisfied on the remote server, this most probably means that
// some of the secondary nodes were unreachable or otherwise unresponsive, so the call is safe
// to be retried if idempotency can be guaranteed.
@@ -134,7 +133,10 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
ErrorCodes::HostUnreachable,
ErrorCodes::HostNotFound,
ErrorCodes::NetworkTimeout,
- ErrorCodes::InterruptedDueToReplStateChange};
+ // This set includes interrupted because replica set step down kills all server operations
+ // before it closes connections so it may happen that the caller actually receives the
+ // interruption.
+ ErrorCodes::Interrupted};
ShardRegistry::ShardRegistry(std::unique_ptr<RemoteCommandTargeterFactory> targeterFactory,
std::unique_ptr<executor::TaskExecutorPool> executorPool,
@@ -780,8 +782,7 @@ StatusWith<ShardRegistry::CommandResponse> ShardRegistry::_runCommandWithMetadat
void ShardRegistry::updateReplSetMonitor(const std::shared_ptr<RemoteCommandTargeter>& targeter,
const HostAndPort& remoteHost,
const Status& remoteCommandStatus) {
- if (ErrorCodes::isNotMasterError(remoteCommandStatus.code()) ||
- (remoteCommandStatus == ErrorCodes::InterruptedDueToReplStateChange)) {
+ if (ErrorCodes::isNotMasterError(remoteCommandStatus.code())) {
targeter->markHostNotMaster(remoteHost);
} else if (ErrorCodes::isNetworkError(remoteCommandStatus.code())) {
targeter->markHostUnreachable(remoteHost);
diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp
index c82de6a3bbe..8b23528a04a 100644
--- a/src/mongo/s/query/async_results_merger.cpp
+++ b/src/mongo/s/query/async_results_merger.cpp
@@ -51,6 +51,15 @@ namespace {
// Maximum number of retries for network and replication notMaster errors (per host).
const int kMaxNumFailedHostRetryAttempts = 3;
+/**
+ * Returns whether a particular error code returned from the initial cursor establishment should
+ * be retried.
+ */
+bool isPerShardRetriableError(ErrorCodes::Error err) {
+ return (ShardRegistry::kAllRetriableErrors.count(err) ||
+ err == ErrorCodes::NotMasterOrSecondary);
+}
+
} // namespace
AsyncResultsMerger::AsyncResultsMerger(executor::TaskExecutor* executor,
@@ -429,7 +438,8 @@ void AsyncResultsMerger::handleBatchResponse(
if (!cursorResponseStatus.isOK()) {
// Notify the shard registry of the failure.
if (remote.shardId) {
- auto shard = grid.shardRegistry()->getShardNoReload(*remote.shardId);
+ // TODO: Pass down an OperationContext* to use here.
+ auto shard = grid.shardRegistry()->getShard(nullptr, *remote.shardId);
if (!shard) {
remote.status = Status(cursorResponseStatus.getStatus().code(),
str::stream() << "Could not find shard " << *remote.shardId
@@ -443,10 +453,7 @@ void AsyncResultsMerger::handleBatchResponse(
// If the error is retriable, schedule another request.
if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
- ShardRegistry::kAllRetriableErrors.count(cursorResponseStatus.getStatus().code())) {
- LOG(1) << "Initial cursor establishment failed with retriable error and will be retried"
- << causedBy(cursorResponseStatus.getStatus());
-
+ isPerShardRetriableError(cursorResponseStatus.getStatus().code())) {
++remote.retryCount;
// Since we potentially updated the targeter that the last host it chose might be
@@ -634,13 +641,13 @@ Status AsyncResultsMerger::RemoteCursorData::resolveShardIdToHostAndPort(
invariant(shardId);
invariant(!cursorId);
- const auto shard = grid.shardRegistry()->getShardNoReload(*shardId);
+ // TODO: Pass down an OperationContext* to use here.
+ const auto shard = grid.shardRegistry()->getShard(nullptr, *shardId);
if (!shard) {
return Status(ErrorCodes::ShardNotFound,
str::stream() << "Could not find shard " << *shardId);
}
- // TODO: Pass down an OperationContext* to use here.
auto findHostStatus = shard->getTargeter()->findHost(
readPref, RemoteCommandTargeter::selectFindHostMaxWaitTime(nullptr));
if (!findHostStatus.isOK()) {