-rw-r--r-- | jstests/replsets/stepdown3.js | 8
-rw-r--r-- | jstests/sharding/auth_add_shard.js | 86
-rw-r--r-- | jstests/sharding/features3.js | 33
-rw-r--r-- | jstests/sharding/mongos_rs_shard_failure_tolerance.js | 9
-rw-r--r-- | src/mongo/base/error_codes.err | 6
-rw-r--r-- | src/mongo/db/operation_context.cpp | 9
-rw-r--r-- | src/mongo/db/operation_context.h | 40
-rw-r--r-- | src/mongo/db/operation_context_impl.cpp | 6
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_external_state_impl.cpp | 4
-rw-r--r-- | src/mongo/db/service_context.h | 2
-rw-r--r-- | src/mongo/db/service_context_d.cpp | 38
-rw-r--r-- | src/mongo/db/service_context_d.h | 15
-rw-r--r-- | src/mongo/db/service_context_noop.cpp | 3
-rw-r--r-- | src/mongo/db/service_context_noop.h | 2
-rw-r--r-- | src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp | 2
-rw-r--r-- | src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp | 4
-rw-r--r-- | src/mongo/s/client/shard_registry.cpp | 9
-rw-r--r-- | src/mongo/s/query/async_results_merger.cpp | 21
18 files changed, 142 insertions, 155 deletions
diff --git a/jstests/replsets/stepdown3.js b/jstests/replsets/stepdown3.js
index 9c3cf3d9544..65fa9d2df08 100644
--- a/jstests/replsets/stepdown3.js
+++ b/jstests/replsets/stepdown3.js
@@ -2,6 +2,8 @@
 //
 // This test requires the fsync command to force a secondary to be stale.
 // @tags: [requires_fsync]
+(function() {
+'use strict';
 
 var replTest = new ReplSetTest({ name: 'testSet', nodes: 2 });
 var nodes = replTest.startSet();
@@ -37,8 +39,8 @@ print("getlasterror; should assert or return an error, depending on timing");
 var gleFunction = function() {
     var result = master.getDB("test").runCommand({getLastError : 1, w: 2 , wtimeout :30000 });
     if (result.errmsg === "not master" ||
-        result.code == 10107 ||
-        result.code == 11601 /*interrupted*/ ) {
+        result.code == ErrorCodes.NotMaster ||
+        result.code == ErrorCodes.InterruptedDueToReplStateChange) {
         throw new Error("satisfy assert.throws()");
     }
     print("failed to throw exception; GLE returned: ");
@@ -54,3 +56,5 @@ assert.neq(0, exitCode, "expected replSetStepDown to close the shell's connectio
 // unlock and shut down
 printjson(locked.getDB("admin").fsyncUnlock());
 replTest.stopSet();
+
+})();
diff --git a/jstests/sharding/auth_add_shard.js b/jstests/sharding/auth_add_shard.js
index a20f34034cd..ef2e5dfa760 100644
--- a/jstests/sharding/auth_add_shard.js
+++ b/jstests/sharding/auth_add_shard.js
@@ -1,8 +1,8 @@
-
-/* SERVER-5124
- * The puporse of this test is to test authentication when adding/removing a shard
- * The test sets up a sharded system, then adds/remove a shard.
- */
+// SERVER-5124
+// The puporse of this test is to test authentication when adding/removing a shard. The test sets
+// up a sharded system, then adds/removes a shard.
+(function() {
+'use strict';
 
 // login method to login into the database
 function login(userObj) {
@@ -11,89 +11,89 @@ function login(userObj) {
 }
 
 // admin user object
-adminUser = { db : "admin", username : "foo", password : "bar" };
+var adminUser = { db: "admin", username: "foo", password: "bar" };
 
 //set up a 2 shard cluster with keyfile
-var st = new ShardingTest( { name : "auth_add_shard1", shards : 1,
-                             mongos : 1, keyFile : "jstests/libs/key1" } )
+var st = new ShardingTest({ name: "auth_add_shard1", shards: 1,
+                            mongos: 1, keyFile: "jstests/libs/key1" });
 
-var mongos = st.s0
-var admin = mongos.getDB("admin")
+var mongos = st.s0;
+var admin = mongos.getDB("admin");
 
 print("1 shard system setup");
 
 //add the admin user
 print("adding user");
-mongos.getDB(adminUser.db).createUser({user: adminUser.username, pwd: adminUser.password, roles: jsTest.adminUserRoles});
+mongos.getDB(adminUser.db).createUser({ user: adminUser.username,
+                                        pwd: adminUser.password,
+                                        roles: jsTest.adminUserRoles});
 
 //login as admin user
 login(adminUser);
 
-st.stopBalancer();
-assert.eq( 1, st.config.shards.count() , "initial server count wrong" );
+assert.eq(1, st.config.shards.count() , "initial server count wrong");
 
 //start a mongod with NO keyfile
 var conn = MongoRunner.runMongod({});
-print (conn);
+print(conn);
 
 // --------------- Test 1 --------------------
-// Add shard to the existing cluster
-var result = admin.runCommand( {addShard : conn.host} );
-printjson(result);
-// make sure the shard wasn't added
-assert.eq(result.ok, 0, "added shard without keyfile");
+// Add shard to the existing cluster (should fail because it was added without a keyfile)
+printjson(assert.commandFailed(admin.runCommand({ addShard: conn.host })));
+
 // stop mongod
-MongoRunner.stopMongod( conn );
+MongoRunner.stopMongod(conn);
 
 //--------------- Test 2 --------------------
 //start mongod again, this time with keyfile
-var conn = MongoRunner.runMongod( {keyFile : "jstests/libs/key1"} );
+var conn = MongoRunner.runMongod({keyFile: "jstests/libs/key1"});
 
 //try adding the new shard
-var result = admin.runCommand( {addShard : conn.host} );
-printjson(result);
-//make sure the shard was added successfully
-assert.eq(result.ok, 1, "failed to add shard with keyfile");
+printjson(assert.commandWorked(admin.runCommand({ addShard: conn.host })));
 
 //Add some data
 var db = mongos.getDB("foo");
-var collA = mongos.getCollection("foo.bar")
+var collA = mongos.getCollection("foo.bar");
 
 // enable sharding on a collection
-printjson( admin.runCommand( { enableSharding : "" + collA.getDB() } ) )
-printjson( admin.runCommand( { movePrimary : "foo", to : "shard0000" } ) );
+assert.commandWorked(admin.runCommand({ enableSharding: "" + collA.getDB() }));
+st.ensurePrimaryShard("foo", "shard0000");
 
-admin.runCommand( { shardCollection : "" + collA, key : { _id : 1 } } )
+assert.commandWorked(admin.runCommand({ shardCollection: "" + collA, key: { _id: 1 } }));
 
 // add data to the sharded collection
-for (i=0; i<4; i++) {
-    db.bar.save({_id:i});
-    printjson(admin.runCommand( { split : "" + collA, middle : { _id : i } }) )
+for (var i = 0; i < 4; i++) {
+    db.bar.save({ _id: i });
+    assert.commandWorked(admin.runCommand({ split: "" + collA, middle: { _id: i } }));
 }
+
 // move a chunk
-printjson( admin.runCommand( { moveChunk : "foo.bar", find : { _id : 1 }, to : "shard0001" }) )
+assert.commandWorked(admin.runCommand({ moveChunk: "foo.bar", find: { _id: 1 }, to: "shard0001" }));
 
 //verify the chunk was moved
-admin.runCommand( { flushRouterConfig : 1 } )
-var config = mongos.getDB("config")
-config.printShardingStatus(true)
+admin.runCommand({ flushRouterConfig: 1 });
+
+var config = mongos.getDB("config");
+config.printShardingStatus(true);
 
 // start balancer before removing the shard
 st.startBalancer();
 
 //--------------- Test 3 --------------------
 // now drain the shard
-var result = admin.runCommand( {removeShard : conn.host} );
-printjson(result);
-assert.eq(result.ok, 1, "failed to start draining shard");
+printjson(assert.commandWorked(admin.runCommand({removeShard: conn.host})));
 
 // give it some time to drain
 assert.soon(function() {
-    var result = admin.runCommand( {removeShard : conn.host} );
+    var result = admin.runCommand({removeShard: conn.host});
     printjson(result);
-    return result.ok && result.state == "completed"
+
+    return result.ok && result.state == "completed";
 }, "failed to drain shard completely", 5 * 60 * 1000)
 
-assert.eq( 1, st.config.shards.count() , "removed server still appears in count" );
+assert.eq(1, st.config.shards.count() , "removed server still appears in count");
+
+MongoRunner.stopMongod(conn);
 
-MongoRunner.stopMongod( conn );
 st.stop();
+
+})();
diff --git a/jstests/sharding/features3.js b/jstests/sharding/features3.js
index 84c857e644c..03a5cedc25a 100644
--- a/jstests/sharding/features3.js
+++ b/jstests/sharding/features3.js
@@ -29,8 +29,9 @@ s.startBalancer();
 
 // insert 10k small documents into the sharded collection
 var bulk = dbForTest.foo.initializeUnorderedBulkOp();
-for (i = 0; i < numDocs; i++)
+for (var i = 0; i < numDocs; i++) {
     bulk.insert({ _id: i });
+}
 assert.writeOK(bulk.execute());
 
 var x = dbForTest.foo.stats();
@@ -53,12 +54,10 @@ assert(!x.sharded, "XXX3: " + tojson(x));
 
 // fork shell and start querying the data
 var start = new Date();
 
-// TODO: Still potential problem when our sampling of current ops misses when $where is active -
-- solution is to increase sleep time
-var whereKillSleepTime = 10000;
+var whereKillSleepTime = 1000;
 
 var parallelCommand = "db.foo.find(function() { " +
-                      "    sleep( " + whereKillSleepTime + " ); " +
+                      "    sleep(" + whereKillSleepTime + "); " +
                       "    return false; " +
                       "}).itcount(); ";
 
@@ -68,17 +67,22 @@ var awaitShell = startParallelShell(parallelCommand, s.s.port);
 print("done forking shell at: " + Date());
 
 // Get all current $where operations
-function getMine(printInprog) {
+function getInProgWhereOps() {
     var inprog = dbForTest.currentOp().inprog;
-    if (printInprog)
-        printjson(inprog);
 
     // Find all the where queries
     var myProcs = [];
-    for (var x = 0; x < inprog.length; x++) {
-        if (inprog[x].query && inprog[x].query.filter && inprog[x].query.filter.$where) {
-            myProcs.push(inprog[x]);
+    inprog.forEach(function(op) {
+        if (op.query && op.query.filter && op.query.filter.$where) {
+            myProcs.push(op);
         }
+    });
+
+    if (myProcs.length == 0) {
+        print('No $where operations found: ' + tojson(inprog));
+    }
+    else {
+        print('Found ' + myProcs.length + ' $where operations: ' + tojson(myProcs));
     }
 
     return myProcs;
@@ -86,16 +90,11 @@ function getMine(printInprog) {
 var curOpState = 0; // 0 = not found, 1 = killed
 var killTime = null;
-var i = 0;
 var mine;
 
 assert.soon(function() {
     // Get all the current operations
-    mine = getMine(true); // SERVER-8794: print all operations
-
-    // get curren tops, but only print out operations before we see a $where op has started
-    // mine = getMine(curOpState == 0 && i > 20);
-    i++;
+    mine = getInProgWhereOps();
 
     // Wait for the queries to start (one per shard, so 2 total)
     if (curOpState == 0 && mine.length == 2) {
diff --git a/jstests/sharding/mongos_rs_shard_failure_tolerance.js b/jstests/sharding/mongos_rs_shard_failure_tolerance.js
index 6cd99a1bbc3..b5117439925 100644
--- a/jstests/sharding/mongos_rs_shard_failure_tolerance.js
+++ b/jstests/sharding/mongos_rs_shard_failure_tolerance.js
@@ -10,11 +10,10 @@
 // sequence), idle (connection is connected but not used before a shard change), and new
 // (connection connected after shard change).
 //
+(function() {
+'use strict';
 
-var options = {rs : true, rsOptions : { nodes : 2 }};
-
-var st = new ShardingTest({shards : 3, mongos : 1, other : options});
-st.stopBalancer();
+var st = new ShardingTest({ shards: 3, mongos: 1, other: { rs: true, rsOptions: { nodes: 2 } } });
 
 var mongos = st.s0;
 var admin = mongos.getDB( "admin" );
@@ -412,6 +411,6 @@ assert.writeError(mongosConnNew.getCollection( collUnsharded.toString() ).insert
 
 gc(); // Clean up new connections
 
-jsTest.log("DONE!");
 st.stop();
+
+})();
diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err
index faed0dbb14a..43fdefa1c50 100644
--- a/src/mongo/base/error_codes.err
+++ b/src/mongo/base/error_codes.err
@@ -160,6 +160,7 @@ error_code("CannotGrowDocumentInCappedNamespace", 10003)
 error_code("DuplicateKey", 11000)
 error_code("InterruptedAtShutdown", 11600)
 error_code("Interrupted", 11601)
+error_code("InterruptedDueToReplStateChange", 11602)
 error_code("OutOfDiskSpace", 14031 )
 error_code("KeyTooLong", 17280);
 error_code("BackgroundOperationInProgressForDatabase", 12586);
@@ -173,7 +174,10 @@ error_code("PrepareConfigsFailed", 13104);
 
 # SERVER-21428 explains the extra changes needed if error_class components change.
error_class("NetworkError", ["HostUnreachable", "HostNotFound", "NetworkTimeout"]) -error_class("Interruption", ["Interrupted", "InterruptedAtShutdown", "ExceededTimeLimit"]) +error_class("Interruption", ["Interrupted", + "InterruptedAtShutdown", + "InterruptedDueToReplStateChange", + "ExceededTimeLimit"]) error_class("NotMasterError", ["NotMaster", "NotMasterNoSlaveOk"]) error_class("StaleShardingError", ["RecvStaleConfig", "SendStaleConfig", "StaleShardVersion", "StaleEpoch"]) diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp index 1b79dadcb50..1434ee5ddc7 100644 --- a/src/mongo/db/operation_context.cpp +++ b/src/mongo/db/operation_context.cpp @@ -46,12 +46,13 @@ Client* OperationContext::getClient() const { return _client; } -void OperationContext::markKilled() { - _killPending.store(1); +void OperationContext::markKilled(ErrorCodes::Error killCode) { + invariant(killCode != ErrorCodes::OK); + _killCode.compareAndSwap(ErrorCodes::OK, killCode); } -bool OperationContext::isKillPending() const { - return _killPending.loadRelaxed(); +ErrorCodes::Error OperationContext::getKillStatus() const { + return _killCode.loadRelaxed(); } } // namespace mongo diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h index a146a8f7f14..3a7e3165dda 100644 --- a/src/mongo/db/operation_context.h +++ b/src/mongo/db/operation_context.h @@ -77,7 +77,6 @@ public: */ virtual RecoveryUnit* recoveryUnit() const = 0; - /** * Returns the RecoveryUnit (same return value as recoveryUnit()) but the caller takes * ownership of the returned RecoveryUnit, and the OperationContext instance relinquishes @@ -183,23 +182,33 @@ public: virtual bool writesAreReplicated() const = 0; /** - * Marks this operation as killed. + * Marks this operation as killed so that subsequent calls to checkForInterrupt and + * checkForInterruptNoAssert by the thread executing the operation will start returning the + * specified error code. * - * Subsequent calls to checkForInterrupt and checkForInterruptNoAssert by the thread - * executing the operation will indicate that the operation has been killed. + * If multiple threads kill the same operation with different codes, only the first code will + * be preserved. * - * May be called by any thread that has locked the Client owning this operation context, - * or by the thread executing on behalf of this operation context. + * May be called by any thread that has locked the Client owning this operation context. */ - void markKilled(); + void markKilled(ErrorCodes::Error killCode = ErrorCodes::Interrupted); /** - * Returns true if markKilled has been called on this operation context. + * Returns the code passed to markKilled if this operation context has been killed previously + * or ErrorCodes::OK otherwise. * - * May be called by any thread that has locked the Client owning this operation context, - * or by the thread executing on behalf of this operation context. + * May be called by any thread that has locked the Client owning this operation context, or + * without lock by the thread executing on behalf of this operation context. + */ + ErrorCodes::Error getKillStatus() const; + + /** + * Shortcut method, which checks whether getKillStatus returns a non-OK value. Has the same + * concurrency rules as getKillStatus. 
     */
-    bool isKillPending() const;
+    bool isKillPending() const {
+        return getKillStatus() != ErrorCodes::OK;
+    }
 
 protected:
     OperationContext(Client* client, unsigned int opId, Locker* locker);
@@ -211,11 +220,14 @@ private:
     Client* const _client;
     const unsigned int _opId;
 
-    // The lifetime of locker is managed by subclasses of OperationContext, so it is not
-    // safe to access _locker in the destructor of OperationContext.
+    // Not owned.
     Locker* const _locker;
 
-    AtomicInt32 _killPending{0};
+    // Follows the values of ErrorCodes::Error. The default value is 0 (OK), which means the
+    // operation is not killed. If killed, it will contain a specific code. This value changes only
+    // once from OK to some kill code.
+    AtomicWord<ErrorCodes::Error> _killCode{ErrorCodes::OK};
+
     WriteConcernOptions _writeConcern;
 };
diff --git a/src/mongo/db/operation_context_impl.cpp b/src/mongo/db/operation_context_impl.cpp
index f2c0166876f..b7733958bc2 100644
--- a/src/mongo/db/operation_context_impl.cpp
+++ b/src/mongo/db/operation_context_impl.cpp
@@ -140,6 +140,7 @@ uint64_t OperationContextImpl::getRemainingMaxTimeMicros() const {
 MONGO_FP_DECLARE(checkForInterruptFail);
 
 namespace {
+
 // Helper function for checkForInterrupt fail point. Decides whether the operation currently
 // being run by the given Client meet the (probabilistic) conditions for interruption as
 // specified in the fail point info.
@@ -194,8 +195,9 @@ Status OperationContextImpl::checkForInterruptNoAssert() {
         }
     }
 
-    if (isKillPending()) {
-        return Status(ErrorCodes::Interrupted, "operation was interrupted");
+    const auto killStatus = getKillStatus();
+    if (killStatus != ErrorCodes::OK) {
+        return Status(killStatus, "operation was interrupted");
     }
 
     return Status::OK();
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index c42b80a605d..52b0fee436c 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -350,8 +350,8 @@ void ReplicationCoordinatorExternalStateImpl::closeConnections() {
 }
 
 void ReplicationCoordinatorExternalStateImpl::killAllUserOperations(OperationContext* txn) {
-    ServiceContext* environment = getGlobalServiceContext();
-    environment->killAllUserOperations(txn);
+    ServiceContext* environment = txn->getServiceContext();
+    environment->killAllUserOperations(txn, ErrorCodes::InterruptedDueToReplStateChange);
 }
 
 void ReplicationCoordinatorExternalStateImpl::clearShardingState() {
diff --git a/src/mongo/db/service_context.h b/src/mongo/db/service_context.h
index 6f9aa36a22f..4e63046726f 100644
--- a/src/mongo/db/service_context.h
+++ b/src/mongo/db/service_context.h
@@ -284,7 +284,7 @@ public:
     * Kills all operations that have a Client that is associated with an incoming user
     * connection, except for the one associated with txn.
     */
-    virtual void killAllUserOperations(const OperationContext* txn) = 0;
+    virtual void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) = 0;
 
    /**
    * Registers a listener to be notified each time an op is killed.
diff --git a/src/mongo/db/service_context_d.cpp b/src/mongo/db/service_context_d.cpp
index 3e5fd2d5be7..470dcb927a5 100644
--- a/src/mongo/db/service_context_d.cpp
+++ b/src/mongo/db/service_context_d.cpp
@@ -220,21 +220,9 @@ bool ServiceContextMongoD::getKillAllOperations() {
     return _globalKill;
 }
 
-bool ServiceContextMongoD::_killOperationsAssociatedWithClientAndOpId_inlock(Client* client,
-                                                                             unsigned int opId) {
-    OperationContext* opCtx = client->getOperationContext();
-    if (!opCtx) {
-        return false;
-    }
-    if (opCtx->getOpID() != opId) {
-        return false;
-    }
-    _killOperation_inlock(opCtx);
-    return true;
-}
-
-void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx) {
-    opCtx->markKilled();
+void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx,
+                                                 ErrorCodes::Error killCode) {
+    opCtx->markKilled(killCode);
 
     for (const auto listener : _killOpListeners) {
         try {
@@ -248,8 +236,10 @@ void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx) {
 bool ServiceContextMongoD::killOperation(unsigned int opId) {
     for (LockedClientsCursor cursor(this); Client* client = cursor.next();) {
         stdx::lock_guard<Client> lk(*client);
-        bool found = _killOperationsAssociatedWithClientAndOpId_inlock(client, opId);
-        if (found) {
+
+        OperationContext* opCtx = client->getOperationContext();
+        if (opCtx && opCtx->getOpID() == opId) {
+            _killOperation_inlock(opCtx, ErrorCodes::Interrupted);
             return true;
         }
     }
@@ -257,7 +247,8 @@ bool ServiceContextMongoD::killOperation(unsigned int opId) {
     return false;
 }
 
-void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn) {
+void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn,
+                                                 ErrorCodes::Error killCode) {
     for (LockedClientsCursor cursor(this); Client* client = cursor.next();) {
         if (!client->isFromUserConnection()) {
             // Don't kill system operations.
@@ -266,16 +257,11 @@ void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn) {
         stdx::lock_guard<Client> lk(*client);
 
         OperationContext* toKill = client->getOperationContext();
-        if (!toKill) {
-            continue;
-        }
-        if (toKill->getOpID() == txn->getOpID()) {
-            // Don't kill ourself.
-            continue;
+        // Don't kill ourself.
+        if (toKill && toKill->getOpID() != txn->getOpID()) {
+            _killOperation_inlock(toKill, killCode);
         }
-
-        _killOperation_inlock(toKill);
     }
 }
diff --git a/src/mongo/db/service_context_d.h b/src/mongo/db/service_context_d.h
index 0c560ff17f4..dc834197e9e 100644
--- a/src/mongo/db/service_context_d.h
+++ b/src/mongo/db/service_context_d.h
@@ -67,7 +67,7 @@ public:
 
     bool killOperation(unsigned int opId) override;
 
-    void killAllUserOperations(const OperationContext* txn) override;
+    void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) override;
 
     void registerKillOpListener(KillOpListenerInterface* listener) override;
 
@@ -79,22 +79,11 @@ private:
     std::unique_ptr<OperationContext> _newOpCtx(Client* client) override;
 
    /**
-     * Kills the active operation on "client" if that operation is associated with operation id
-     * "opId".
-     *
-     * Returns true if an operation was killed.
-     *
-     * Must only be called by a thread owning both this service context's mutex and the
-     * client's.
-     */
-    bool _killOperationsAssociatedWithClientAndOpId_inlock(Client* client, unsigned int opId);
-
-    /**
     * Kills the given operation.
    *
    * Caller must own the service context's _mutex.
     */
-    void _killOperation_inlock(OperationContext* opCtx);
+    void _killOperation_inlock(OperationContext* opCtx, ErrorCodes::Error killCode);
 
     bool _globalKill;
diff --git a/src/mongo/db/service_context_noop.cpp b/src/mongo/db/service_context_noop.cpp
index 65184906442..fd94b35db89 100644
--- a/src/mongo/db/service_context_noop.cpp
+++ b/src/mongo/db/service_context_noop.cpp
@@ -79,7 +79,8 @@ bool ServiceContextNoop::killOperation(unsigned int opId) {
     return false;
 }
 
-void ServiceContextNoop::killAllUserOperations(const OperationContext* txn) {}
+void ServiceContextNoop::killAllUserOperations(const OperationContext* txn,
+                                               ErrorCodes::Error killCode) {}
 
 void ServiceContextNoop::registerKillOpListener(KillOpListenerInterface* listener) {}
diff --git a/src/mongo/db/service_context_noop.h b/src/mongo/db/service_context_noop.h
index 0e74b61b7a2..62e6a169896 100644
--- a/src/mongo/db/service_context_noop.h
+++ b/src/mongo/db/service_context_noop.h
@@ -49,7 +49,7 @@ public:
 
     bool killOperation(unsigned int opId) override;
 
-    void killAllUserOperations(const OperationContext* txn) override;
+    void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) override;
 
     void setKillAllOperations() override;
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
index 5943cc6e255..5686476e1a3 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
@@ -389,8 +389,6 @@ StatusWith<OpTimePair<DatabaseType>> CatalogManagerReplicaSet::_fetchDatabaseMet
 
 StatusWith<OpTimePair<CollectionType>> CatalogManagerReplicaSet::getCollection(
     OperationContext* txn, const std::string& collNs) {
-    auto configShard = grid.shardRegistry()->getShard(txn, "config");
-
     auto statusFind = _exhaustiveFindOnConfig(txn,
                                               kConfigReadSelector,
                                               NamespaceString(CollectionType::ConfigNS),
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
index b21ff63961f..9bf44eaa09c 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
@@ -93,7 +93,7 @@ TEST_F(InsertRetryTest, RetryOnInterruptedAndNetworkErrorSuccess) {
     onCommand([&](const RemoteCommandRequest& request) {
         ASSERT_EQ(request.target, kTestHosts[0]);
         configTargeter()->setFindHostReturnValue({kTestHosts[1]});
-        return Status(ErrorCodes::Interrupted, "Interruption");
+        return Status(ErrorCodes::InterruptedDueToReplStateChange, "Interruption");
     });
 
     onCommand([&](const RemoteCommandRequest& request) {
@@ -271,7 +271,7 @@ TEST_F(UpdateRetryTest, OperationInterruptedDueToPrimaryStepDown) {
 
     auto writeErrDetail = stdx::make_unique<WriteErrorDetail>();
     writeErrDetail->setIndex(0);
-    writeErrDetail->setErrCode(ErrorCodes::Interrupted);
+    writeErrDetail->setErrCode(ErrorCodes::InterruptedDueToReplStateChange);
     writeErrDetail->setErrMessage("Operation interrupted");
     response.addToErrDetails(writeErrDetail.release());
diff --git a/src/mongo/s/client/shard_registry.cpp b/src/mongo/s/client/shard_registry.cpp
index 55121f42209..e5b91dda83a 100644
--- a/src/mongo/s/client/shard_registry.cpp
+++ b/src/mongo/s/client/shard_registry.cpp
@@ -126,6 +126,7 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kNotMasterErrors{ErrorCodes::N
 
 const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::NotMaster,
     ErrorCodes::NotMasterNoSlaveOk,
+    ErrorCodes::NotMasterOrSecondary,
     // If write concern failed to be satisfied on the remote server, this most probably means that
     // some of the secondary nodes were unreachable or otherwise unresponsive, so the call is safe
     // to be retried if idempotency can be guaranteed.
@@ -133,10 +134,7 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::WriteConcernFailed,
     ErrorCodes::HostUnreachable,
     ErrorCodes::HostNotFound,
     ErrorCodes::NetworkTimeout,
-    // This set includes interrupted because replica set step down kills all server operations
-    // before it closes connections so it may happen that the caller actually receives the
-    // interruption.
-    ErrorCodes::Interrupted};
+    ErrorCodes::InterruptedDueToReplStateChange};
 
 ShardRegistry::ShardRegistry(std::unique_ptr<RemoteCommandTargeterFactory> targeterFactory,
                              std::unique_ptr<executor::TaskExecutorPool> executorPool,
@@ -782,7 +780,8 @@ StatusWith<ShardRegistry::CommandResponse> ShardRegistry::_runCommandWithMetadat
 void ShardRegistry::updateReplSetMonitor(const std::shared_ptr<RemoteCommandTargeter>& targeter,
                                          const HostAndPort& remoteHost,
                                          const Status& remoteCommandStatus) {
-    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code())) {
+    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code()) ||
+        (remoteCommandStatus == ErrorCodes::InterruptedDueToReplStateChange)) {
         targeter->markHostNotMaster(remoteHost);
     } else if (ErrorCodes::isNetworkError(remoteCommandStatus.code())) {
         targeter->markHostUnreachable(remoteHost);
diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp
index 8b23528a04a..c82de6a3bbe 100644
--- a/src/mongo/s/query/async_results_merger.cpp
+++ b/src/mongo/s/query/async_results_merger.cpp
@@ -51,15 +51,6 @@ namespace {
 // Maximum number of retries for network and replication notMaster errors (per host).
 const int kMaxNumFailedHostRetryAttempts = 3;
 
-/**
- * Returns whether a particular error code returned from the initial cursor establishment should
- * be retried.
- */
-bool isPerShardRetriableError(ErrorCodes::Error err) {
-    return (ShardRegistry::kAllRetriableErrors.count(err) ||
-            err == ErrorCodes::NotMasterOrSecondary);
-}
-
 }  // namespace
 
 AsyncResultsMerger::AsyncResultsMerger(executor::TaskExecutor* executor,
@@ -438,8 +429,7 @@ void AsyncResultsMerger::handleBatchResponse(
     if (!cursorResponseStatus.isOK()) {
         // Notify the shard registry of the failure.
         if (remote.shardId) {
-            // TODO: Pass down an OperationContext* to use here.
-            auto shard = grid.shardRegistry()->getShard(nullptr, *remote.shardId);
+            auto shard = grid.shardRegistry()->getShardNoReload(*remote.shardId);
             if (!shard) {
                 remote.status = Status(cursorResponseStatus.getStatus().code(),
                                        str::stream() << "Could not find shard " << *remote.shardId
@@ -453,7 +443,10 @@ void AsyncResultsMerger::handleBatchResponse(
 
         // If the error is retriable, schedule another request.
         if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
-            isPerShardRetriableError(cursorResponseStatus.getStatus().code())) {
+            ShardRegistry::kAllRetriableErrors.count(cursorResponseStatus.getStatus().code())) {
+            LOG(1) << "Initial cursor establishment failed with retriable error and will be retried"
+                   << causedBy(cursorResponseStatus.getStatus());
+
             ++remote.retryCount;
 
             // Since we potentially updated the targeter that the last host it chose might be
@@ -641,13 +634,13 @@ Status AsyncResultsMerger::RemoteCursorData::resolveShardIdToHostAndPort(
     invariant(shardId);
     invariant(!cursorId);
 
-    // TODO: Pass down an OperationContext* to use here.
-    const auto shard = grid.shardRegistry()->getShard(nullptr, *shardId);
+    const auto shard = grid.shardRegistry()->getShardNoReload(*shardId);
     if (!shard) {
         return Status(ErrorCodes::ShardNotFound,
                       str::stream() << "Could not find shard " << *shardId);
    }
 
+    // TODO: Pass down an OperationContext* to use here.
     auto findHostStatus = shard->getTargeter()->findHost(
         readPref, RemoteCommandTargeter::selectFindHostMaxWaitTime(nullptr));
     if (!findHostStatus.isOK()) {
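The heart of this commit is the change in operation_context.h/.cpp above: the boolean _killPending flag becomes an atomic kill code, so a replica set stepdown can kill user operations with InterruptedDueToReplStateChange while killOp keeps using the generic Interrupted, and routers can treat the stepdown code as retriable. Below is a minimal, self-contained sketch (not MongoDB source) of that first-code-wins pattern; the simplified ErrorCode enum and the main() driver are illustrative assumptions, while markKilled, getKillStatus, and isKillPending mirror the signatures added in the patch.

// Sketch of the kill-code pattern introduced by this commit (illustrative, not MongoDB source).
#include <atomic>
#include <cassert>
#include <iostream>

// Stand-in for ErrorCodes::Error; the real codes are generated from error_codes.err.
enum ErrorCode { OK = 0, Interrupted = 11601, InterruptedDueToReplStateChange = 11602 };

class OperationContext {
public:
    // Mirrors markKilled(): only the first non-OK code is preserved (compare-and-swap from OK).
    void markKilled(ErrorCode killCode = Interrupted) {
        assert(killCode != OK);
        ErrorCode expected = OK;
        _killCode.compare_exchange_strong(expected, killCode);
    }

    // Mirrors getKillStatus(): OK until the operation has been killed, then the stored code.
    ErrorCode getKillStatus() const {
        return _killCode.load(std::memory_order_relaxed);
    }

    // Mirrors the new inline isKillPending() shortcut.
    bool isKillPending() const {
        return getKillStatus() != OK;
    }

private:
    std::atomic<ErrorCode> _killCode{OK};
};

int main() {
    OperationContext opCtx;

    // A stepdown-style kill supplies a specific code...
    opCtx.markKilled(InterruptedDueToReplStateChange);
    // ...and a later, generic kill does not overwrite it.
    opCtx.markKilled(Interrupted);

    std::cout << "kill status: " << opCtx.getKillStatus() << '\n';  // prints 11602
    return opCtx.isKillPending() ? 0 : 1;
}

In the real patch, checkForInterruptNoAssert() then returns Status(killStatus, "operation was interrupted"), which is how the stepdown-specific code reaches callers such as the shard registry's retry logic.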