author    | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2015-12-30 17:01:04 -0500
committer | Kaloian Manassiev <kaloian.manassiev@mongodb.com> | 2015-12-30 17:09:27 -0500
commit    | 715e9e1cdc618dad480a7a1a73458daf6ea9ce0f (patch)
tree      | 95ee80f3e51d3218647bc6fb013dec7f3f735297
parent    | 5d2d6e209acd862324612c7f9c41d65940f8dcba (diff)
download  | mongo-715e9e1cdc618dad480a7a1a73458daf6ea9ce0f.tar.gz
Revert "SERVER-22027 Sharding should not retry killed operations"
This reverts commit 5d2d6e209acd862324612c7f9c41d65940f8dcba.
-rw-r--r-- | jstests/sharding/auth_add_shard.js                                            | 86
-rw-r--r-- | jstests/sharding/features3.js                                                 | 32
-rw-r--r-- | jstests/sharding/mongos_rs_shard_failure_tolerance.js                         |  9
-rw-r--r-- | src/mongo/base/error_codes.err                                                |  6
-rw-r--r-- | src/mongo/db/operation_context.cpp                                            |  9
-rw-r--r-- | src/mongo/db/operation_context.h                                              | 40
-rw-r--r-- | src/mongo/db/operation_context_impl.cpp                                       |  6
-rw-r--r-- | src/mongo/db/repl/replication_coordinator_external_state_impl.cpp             |  4
-rw-r--r-- | src/mongo/db/service_context.h                                                |  2
-rw-r--r-- | src/mongo/db/service_context_d.cpp                                            | 38
-rw-r--r-- | src/mongo/db/service_context_d.h                                              | 15
-rw-r--r-- | src/mongo/db/service_context_noop.cpp                                         |  3
-rw-r--r-- | src/mongo/db/service_context_noop.h                                           |  2
-rw-r--r-- | src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp                   |  2
-rw-r--r-- | src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp |  4
-rw-r--r-- | src/mongo/s/client/shard_registry.cpp                                         |  9
-rw-r--r-- | src/mongo/s/query/async_results_merger.cpp                                    | 21
17 files changed, 153 insertions, 135 deletions
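
At the core of this revert is the OperationContext kill API: the commit being reverted had markKilled() record a specific ErrorCodes::Error in an atomic word (exposed through getKillStatus()), whereas the reverted state goes back to a plain kill-pending flag that always surfaces as ErrorCodes::Interrupted. The stand-alone sketch below contrasts the two shapes as they appear in the diff; it is an illustration only, not the MongoDB classes themselves — std::atomic stands in for the server's AtomicWord/AtomicInt32 types, and the Error enum is cut down to a few hypothetical values.

```cpp
#include <atomic>
#include <cassert>

namespace sketch {

// Hypothetical reduced error enum; the real server defines many more codes.
enum class Error { OK, Interrupted, InterruptedDueToReplStateChange };

// Shape removed by this revert: the killer records a specific error code,
// and only the first code to arrive is kept (compare-and-swap from OK).
class OpContextWithKillCode {
public:
    void markKilled(Error killCode = Error::Interrupted) {
        assert(killCode != Error::OK);
        Error expected = Error::OK;
        _killCode.compare_exchange_strong(expected, killCode);  // first caller wins
    }
    Error getKillStatus() const {
        return _killCode.load();
    }
    bool isKillPending() const {
        return getKillStatus() != Error::OK;
    }

private:
    std::atomic<Error> _killCode{Error::OK};
};

// Shape restored by this revert: a bare "kill pending" flag, so every
// interrupted operation is reported as a generic Interrupted error.
class OpContextWithKillFlag {
public:
    void markKilled() {
        _killPending.store(1);
    }
    bool isKillPending() const {
        return _killPending.load() != 0;
    }

private:
    std::atomic<int> _killPending{0};
};

}  // namespace sketch

int main() {
    sketch::OpContextWithKillCode withCode;
    withCode.markKilled(sketch::Error::InterruptedDueToReplStateChange);
    assert(withCode.getKillStatus() == sketch::Error::InterruptedDueToReplStateChange);

    sketch::OpContextWithKillFlag withFlag;
    withFlag.markKilled();
    assert(withFlag.isKillPending());  // the reason is lost; callers only see "killed"
    return 0;
}
```

The rest of the diff follows from that loss of specificity: with only a boolean flag there is no InterruptedDueToReplStateChange code to key off, so the error-code table, the ShardRegistry retry set, and the related tests all switch back to plain Interrupted.
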
diff --git a/jstests/sharding/auth_add_shard.js b/jstests/sharding/auth_add_shard.js
index ef2e5dfa760..a20f34034cd 100644
--- a/jstests/sharding/auth_add_shard.js
+++ b/jstests/sharding/auth_add_shard.js
@@ -1,8 +1,8 @@
-// SERVER-5124
-// The puporse of this test is to test authentication when adding/removing a shard. The test sets
-// up a sharded system, then adds/removes a shard.
-(function() {
-'use strict';
+
+/* SERVER-5124
+ * The puporse of this test is to test authentication when adding/removing a shard
+ * The test sets up a sharded system, then adds/remove a shard.
+ */
 
 // login method to login into the database
 function login(userObj) {
@@ -11,89 +11,89 @@ function login(userObj) {
 }
 
 // admin user object
-var adminUser = { db: "admin", username: "foo", password: "bar" };
+adminUser = { db : "admin", username : "foo", password : "bar" };
 
 //set up a 2 shard cluster with keyfile
-var st = new ShardingTest({ name: "auth_add_shard1", shards: 1,
-                            mongos: 1, keyFile: "jstests/libs/key1" });
+var st = new ShardingTest( { name : "auth_add_shard1", shards : 1,
+                             mongos : 1, keyFile : "jstests/libs/key1" } )
 
-var mongos = st.s0;
-var admin = mongos.getDB("admin");
+var mongos = st.s0
+var admin = mongos.getDB("admin")
 
 print("1 shard system setup");
 
 //add the admin user
 print("adding user");
-mongos.getDB(adminUser.db).createUser({ user: adminUser.username,
-                                        pwd: adminUser.password,
-                                        roles: jsTest.adminUserRoles});
+mongos.getDB(adminUser.db).createUser({user: adminUser.username, pwd: adminUser.password, roles: jsTest.adminUserRoles});
 
 //login as admin user
 login(adminUser);
 
-assert.eq(1, st.config.shards.count() , "initial server count wrong");
+st.stopBalancer();
+assert.eq( 1, st.config.shards.count() , "initial server count wrong" );
 
 //start a mongod with NO keyfile
 var conn = MongoRunner.runMongod({});
-print(conn);
+print (conn);
 
 // --------------- Test 1 --------------------
-// Add shard to the existing cluster (should fail because it was added without a keyfile)
-printjson(assert.commandFailed(admin.runCommand({ addShard: conn.host })));
-
+// Add shard to the existing cluster
+var result = admin.runCommand( {addShard : conn.host} );
+printjson(result);
+// make sure the shard wasn't added
+assert.eq(result.ok, 0, "added shard without keyfile");
 
 // stop mongod
-MongoRunner.stopMongod(conn);
+MongoRunner.stopMongod( conn );
 
 //--------------- Test 2 --------------------
 //start mongod again, this time with keyfile
-var conn = MongoRunner.runMongod({keyFile: "jstests/libs/key1"});
+var conn = MongoRunner.runMongod( {keyFile : "jstests/libs/key1"} );
 //try adding the new shard
-printjson(assert.commandWorked(admin.runCommand({ addShard: conn.host })));
+var result = admin.runCommand( {addShard : conn.host} );
+printjson(result);
+//make sure the shard was added successfully
+assert.eq(result.ok, 1, "failed to add shard with keyfile");
 
 //Add some data
 var db = mongos.getDB("foo");
-var collA = mongos.getCollection("foo.bar");
+var collA = mongos.getCollection("foo.bar")
 
 // enable sharding on a collection
-assert.commandWorked(admin.runCommand({ enableSharding: "" + collA.getDB() }));
-st.ensurePrimaryShard("foo", "shard0000");
+printjson( admin.runCommand( { enableSharding : "" + collA.getDB() } ) )
+printjson( admin.runCommand( { movePrimary : "foo", to : "shard0000" } ) );
 
-assert.commandWorked(admin.runCommand({ shardCollection: "" + collA, key: { _id: 1 } }));
+admin.runCommand( { shardCollection : "" + collA, key : { _id : 1 } } )
 
 // add data to the sharded collection
-for (var i = 0; i < 4; i++) {
-    db.bar.save({ _id: i });
-    assert.commandWorked(admin.runCommand({ split: "" + collA, middle: { _id: i } }));
+for (i=0; i<4; i++) {
+    db.bar.save({_id:i});
+    printjson(admin.runCommand( { split : "" + collA, middle : { _id : i } }) )
 }
-
 // move a chunk
-assert.commandWorked(admin.runCommand({ moveChunk: "foo.bar", find: { _id: 1 }, to: "shard0001" }));
+printjson( admin.runCommand( { moveChunk : "foo.bar", find : { _id : 1 }, to : "shard0001" }) )
 
 //verify the chunk was moved
-admin.runCommand({ flushRouterConfig: 1 });
-
-var config = mongos.getDB("config");
-config.printShardingStatus(true);
+admin.runCommand( { flushRouterConfig : 1 } )
+var config = mongos.getDB("config")
+config.printShardingStatus(true)
 
 // start balancer before removing the shard
 st.startBalancer();
 
 //--------------- Test 3 --------------------
 // now drain the shard
-printjson(assert.commandWorked(admin.runCommand({removeShard: conn.host})));
+var result = admin.runCommand( {removeShard : conn.host} );
+printjson(result);
+assert.eq(result.ok, 1, "failed to start draining shard");
 
 // give it some time to drain
 assert.soon(function() {
-    var result = admin.runCommand({removeShard: conn.host});
+    var result = admin.runCommand( {removeShard : conn.host} );
     printjson(result);
-
-    return result.ok && result.state == "completed";
+    return result.ok && result.state == "completed"
 }, "failed to drain shard completely", 5 * 60 * 1000)
 
-assert.eq(1, st.config.shards.count() , "removed server still appears in count");
-
-MongoRunner.stopMongod(conn);
+assert.eq( 1, st.config.shards.count() , "removed server still appears in count" );
+MongoRunner.stopMongod( conn );
 
 st.stop();
-
-})();
diff --git a/jstests/sharding/features3.js b/jstests/sharding/features3.js
index 5e256676c73..84c857e644c 100644
--- a/jstests/sharding/features3.js
+++ b/jstests/sharding/features3.js
@@ -29,9 +29,8 @@ s.startBalancer();
 
 // insert 10k small documents into the sharded collection
 var bulk = dbForTest.foo.initializeUnorderedBulkOp();
-for (var i = 0; i < numDocs; i++) {
+for (i = 0; i < numDocs; i++)
     bulk.insert({ _id: i });
-}
 assert.writeOK(bulk.execute());
 
 var x = dbForTest.foo.stats();
@@ -54,9 +53,12 @@ assert(!x.sharded, "XXX3: " + tojson(x));
 
 // fork shell and start querying the data
 var start = new Date();
 
+// TODO: Still potential problem when our sampling of current ops misses when $where is active -
+// solution is to increase sleep time
+var whereKillSleepTime = 10000;
 var parallelCommand = "db.foo.find(function() { " +
-                      "    sleep(1000); " +
+                      "    sleep( " + whereKillSleepTime + " ); " +
                       "    return false; " +
                       "}).itcount(); ";
 
@@ -66,22 +68,17 @@ var awaitShell = startParallelShell(parallelCommand, s.s.port);
 print("done forking shell at: " + Date());
 
 // Get all current $where operations
-function getInProgWhereOps() {
+function getMine(printInprog) {
     var inprog = dbForTest.currentOp().inprog;
+    if (printInprog)
+        printjson(inprog);
 
     // Find all the where queries
     var myProcs = [];
-    inprog.forEach(function(op) {
-        if (op.query && op.query.filter && op.query.filter.$where) {
-            myProcs.push(op);
+    for (var x = 0; x < inprog.length; x++) {
+        if (inprog[x].query && inprog[x].query.filter && inprog[x].query.filter.$where) {
+            myProcs.push(inprog[x]);
         }
-    });
-
-    if (myProcs.length == 0) {
-        print('No $where operations found: ' + tojson(inprog));
-    }
-    else {
-        print('Found ' + myProcs.length + ' $where operations: ' + tojson(myProcs));
     }
 
     return myProcs;
@@ -89,11 +86,16 @@ function getInProgWhereOps() {
 
 var curOpState = 0;  // 0 = not found, 1 = killed
 var killTime = null;
+var i = 0;
 var mine;
 
 assert.soon(function() {
     // Get all the current operations
-    mine = getInProgWhereOps();
+    mine = getMine(true);  // SERVER-8794: print all operations
+
+    // get curren tops, but only print out operations before we see a $where op has started
+    // mine = getMine(curOpState == 0 && i > 20);
+    i++;
 
     // Wait for the queries to start (one per shard, so 2 total)
     if (curOpState == 0 && mine.length == 2) {
diff --git a/jstests/sharding/mongos_rs_shard_failure_tolerance.js b/jstests/sharding/mongos_rs_shard_failure_tolerance.js
index b5117439925..6cd99a1bbc3 100644
--- a/jstests/sharding/mongos_rs_shard_failure_tolerance.js
+++ b/jstests/sharding/mongos_rs_shard_failure_tolerance.js
@@ -10,10 +10,11 @@
 // sequence), idle (connection is connected but not used before a shard change), and new
 // (connection connected after shard change).
 //
-(function() {
-'use strict';
 
-var st = new ShardingTest({ shards: 3, mongos: 1, other: { rs: true, rsOptions: { nodes: 2 } } });
+var options = {rs : true, rsOptions : { nodes : 2 }};
+
+var st = new ShardingTest({shards : 3, mongos : 1, other : options});
+st.stopBalancer();
 
 var mongos = st.s0;
 var admin = mongos.getDB( "admin" );
@@ -411,6 +412,6 @@ assert.writeError(mongosConnNew.getCollection( collUnsharded.toString() ).insert
 
 gc(); // Clean up new connections
 
+jsTest.log("DONE!");
 st.stop();
-
-})();
diff --git a/src/mongo/base/error_codes.err b/src/mongo/base/error_codes.err
index 43fdefa1c50..faed0dbb14a 100644
--- a/src/mongo/base/error_codes.err
+++ b/src/mongo/base/error_codes.err
@@ -160,7 +160,6 @@ error_code("CannotGrowDocumentInCappedNamespace", 10003)
 error_code("DuplicateKey", 11000)
 error_code("InterruptedAtShutdown", 11600)
 error_code("Interrupted", 11601)
-error_code("InterruptedDueToReplStateChange", 11602)
 error_code("OutOfDiskSpace", 14031 )
 error_code("KeyTooLong", 17280);
 error_code("BackgroundOperationInProgressForDatabase", 12586);
@@ -174,10 +173,7 @@ error_code("PrepareConfigsFailed", 13104);
 
 # SERVER-21428 explains the extra changes needed if error_class components change.
 error_class("NetworkError", ["HostUnreachable", "HostNotFound", "NetworkTimeout"])
-error_class("Interruption", ["Interrupted",
-                             "InterruptedAtShutdown",
-                             "InterruptedDueToReplStateChange",
-                             "ExceededTimeLimit"])
+error_class("Interruption", ["Interrupted", "InterruptedAtShutdown", "ExceededTimeLimit"])
 error_class("NotMasterError", ["NotMaster", "NotMasterNoSlaveOk"])
 error_class("StaleShardingError",
             ["RecvStaleConfig", "SendStaleConfig", "StaleShardVersion", "StaleEpoch"])
diff --git a/src/mongo/db/operation_context.cpp b/src/mongo/db/operation_context.cpp
index 1434ee5ddc7..1b79dadcb50 100644
--- a/src/mongo/db/operation_context.cpp
+++ b/src/mongo/db/operation_context.cpp
@@ -46,13 +46,12 @@ Client* OperationContext::getClient() const {
     return _client;
 }
 
-void OperationContext::markKilled(ErrorCodes::Error killCode) {
-    invariant(killCode != ErrorCodes::OK);
-    _killCode.compareAndSwap(ErrorCodes::OK, killCode);
+void OperationContext::markKilled() {
+    _killPending.store(1);
 }
 
-ErrorCodes::Error OperationContext::getKillStatus() const {
-    return _killCode.loadRelaxed();
+bool OperationContext::isKillPending() const {
+    return _killPending.loadRelaxed();
 }
 
 }  // namespace mongo
diff --git a/src/mongo/db/operation_context.h b/src/mongo/db/operation_context.h
index 3a7e3165dda..a146a8f7f14 100644
--- a/src/mongo/db/operation_context.h
+++ b/src/mongo/db/operation_context.h
@@ -77,6 +77,7 @@ public:
      */
     virtual RecoveryUnit* recoveryUnit() const = 0;
 
+
     /**
     * Returns the RecoveryUnit (same return value as recoveryUnit()) but the caller takes
     * ownership of the returned RecoveryUnit, and the OperationContext instance relinquishes
@@ -182,33 +183,23 @@ public:
     virtual bool writesAreReplicated() const = 0;
 
    /**
-     * Marks this operation as killed so that subsequent calls to checkForInterrupt and
-     * checkForInterruptNoAssert by the thread executing the operation will start returning the
-     * specified error code.
+     * Marks this operation as killed.
      *
-     * If multiple threads kill the same operation with different codes, only the first code will
-     * be preserved.
+     * Subsequent calls to checkForInterrupt and checkForInterruptNoAssert by the thread
+     * executing the operation will indicate that the operation has been killed.
      *
-     * May be called by any thread that has locked the Client owning this operation context.
+     * May be called by any thread that has locked the Client owning this operation context,
+     * or by the thread executing on behalf of this operation context.
      */
-    void markKilled(ErrorCodes::Error killCode = ErrorCodes::Interrupted);
+    void markKilled();
 
     /**
-     * Returns the code passed to markKilled if this operation context has been killed previously
-     * or ErrorCodes::OK otherwise.
+     * Returns true if markKilled has been called on this operation context.
      *
-     * May be called by any thread that has locked the Client owning this operation context, or
-     * without lock by the thread executing on behalf of this operation context.
-     */
-    ErrorCodes::Error getKillStatus() const;
-
-    /**
-     * Shortcut method, which checks whether getKillStatus returns a non-OK value. Has the same
-     * concurrency rules as getKillStatus.
+     * May be called by any thread that has locked the Client owning this operation context,
+     * or by the thread executing on behalf of this operation context.
     */
-    bool isKillPending() const {
-        return getKillStatus() != ErrorCodes::OK;
-    }
+    bool isKillPending() const;
 
 protected:
     OperationContext(Client* client, unsigned int opId, Locker* locker);
@@ -220,14 +211,11 @@ private:
     Client* const _client;
     const unsigned int _opId;
 
-    // Not owned.
+    // The lifetime of locker is managed by subclasses of OperationContext, so it is not
+    // safe to access _locker in the destructor of OperationContext.
    Locker* const _locker;
 
-    // Follows the values of ErrorCodes::Error. The default value is 0 (OK), which means the
-    // operation is not killed. If killed, it will contain a specific code. This value changes only
-    // once from OK to some kill code.
-    AtomicWord<ErrorCodes::Error> _killCode{ErrorCodes::OK};
-
+    AtomicInt32 _killPending{0};
     WriteConcernOptions _writeConcern;
 };
diff --git a/src/mongo/db/operation_context_impl.cpp b/src/mongo/db/operation_context_impl.cpp
index b7733958bc2..f2c0166876f 100644
--- a/src/mongo/db/operation_context_impl.cpp
+++ b/src/mongo/db/operation_context_impl.cpp
@@ -140,7 +140,6 @@ uint64_t OperationContextImpl::getRemainingMaxTimeMicros() const {
 MONGO_FP_DECLARE(checkForInterruptFail);
 
 namespace {
-
 // Helper function for checkForInterrupt fail point. Decides whether the operation currently
 // being run by the given Client meet the (probabilistic) conditions for interruption as
 // specified in the fail point info.
@@ -195,9 +194,8 @@ Status OperationContextImpl::checkForInterruptNoAssert() {
         }
     }
 
-    const auto killStatus = getKillStatus();
-    if (killStatus != ErrorCodes::OK) {
-        return Status(killStatus, "operation was interrupted");
+    if (isKillPending()) {
+        return Status(ErrorCodes::Interrupted, "operation was interrupted");
     }
 
     return Status::OK();
diff --git a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
index 52b0fee436c..c42b80a605d 100644
--- a/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
+++ b/src/mongo/db/repl/replication_coordinator_external_state_impl.cpp
@@ -350,8 +350,8 @@ void ReplicationCoordinatorExternalStateImpl::closeConnections() {
 }
 
 void ReplicationCoordinatorExternalStateImpl::killAllUserOperations(OperationContext* txn) {
-    ServiceContext* environment = txn->getServiceContext();
-    environment->killAllUserOperations(txn, ErrorCodes::InterruptedDueToReplStateChange);
+    ServiceContext* environment = getGlobalServiceContext();
+    environment->killAllUserOperations(txn);
 }
 
 void ReplicationCoordinatorExternalStateImpl::clearShardingState() {
diff --git a/src/mongo/db/service_context.h b/src/mongo/db/service_context.h
index 4e63046726f..6f9aa36a22f 100644
--- a/src/mongo/db/service_context.h
+++ b/src/mongo/db/service_context.h
@@ -284,7 +284,7 @@ public:
      * Kills all operations that have a Client that is associated with an incoming user
      * connection, except for the one associated with txn.
      */
-    virtual void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) = 0;
+    virtual void killAllUserOperations(const OperationContext* txn) = 0;
 
     /**
     * Registers a listener to be notified each time an op is killed.
diff --git a/src/mongo/db/service_context_d.cpp b/src/mongo/db/service_context_d.cpp
index 470dcb927a5..3e5fd2d5be7 100644
--- a/src/mongo/db/service_context_d.cpp
+++ b/src/mongo/db/service_context_d.cpp
@@ -220,9 +220,21 @@ bool ServiceContextMongoD::getKillAllOperations() {
     return _globalKill;
 }
 
-void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx,
-                                                 ErrorCodes::Error killCode) {
-    opCtx->markKilled(killCode);
+bool ServiceContextMongoD::_killOperationsAssociatedWithClientAndOpId_inlock(Client* client,
+                                                                             unsigned int opId) {
+    OperationContext* opCtx = client->getOperationContext();
+    if (!opCtx) {
+        return false;
+    }
+    if (opCtx->getOpID() != opId) {
+        return false;
+    }
+    _killOperation_inlock(opCtx);
+    return true;
+}
+
+void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx) {
+    opCtx->markKilled();
 
     for (const auto listener : _killOpListeners) {
         try {
@@ -236,10 +248,8 @@ void ServiceContextMongoD::_killOperation_inlock(OperationContext* opCtx,
 bool ServiceContextMongoD::killOperation(unsigned int opId) {
     for (LockedClientsCursor cursor(this); Client* client = cursor.next();) {
         stdx::lock_guard<Client> lk(*client);
-
-        OperationContext* opCtx = client->getOperationContext();
-        if (opCtx && opCtx->getOpID() == opId) {
-            _killOperation_inlock(opCtx, ErrorCodes::Interrupted);
+        bool found = _killOperationsAssociatedWithClientAndOpId_inlock(client, opId);
+        if (found) {
             return true;
         }
     }
@@ -247,8 +257,7 @@ bool ServiceContextMongoD::killOperation(unsigned int opId) {
     return false;
 }
 
-void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn,
-                                                 ErrorCodes::Error killCode) {
+void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn) {
     for (LockedClientsCursor cursor(this); Client* client = cursor.next();) {
         if (!client->isFromUserConnection()) {
             // Don't kill system operations.
@@ -257,11 +266,16 @@ void ServiceContextMongoD::killAllUserOperations(const OperationContext* txn,
 
         stdx::lock_guard<Client> lk(*client);
 
         OperationContext* toKill = client->getOperationContext();
+        if (!toKill) {
+            continue;
+        }
 
-        // Don't kill ourself.
-        if (toKill && toKill->getOpID() != txn->getOpID()) {
-            _killOperation_inlock(toKill, killCode);
+        if (toKill->getOpID() == txn->getOpID()) {
+            // Don't kill ourself.
+            continue;
         }
+
+        _killOperation_inlock(toKill);
     }
 }
diff --git a/src/mongo/db/service_context_d.h b/src/mongo/db/service_context_d.h
index dc834197e9e..0c560ff17f4 100644
--- a/src/mongo/db/service_context_d.h
+++ b/src/mongo/db/service_context_d.h
@@ -67,7 +67,7 @@ public:
 
     bool killOperation(unsigned int opId) override;
 
-    void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) override;
+    void killAllUserOperations(const OperationContext* txn) override;
 
     void registerKillOpListener(KillOpListenerInterface* listener) override;
 
@@ -79,11 +79,22 @@ private:
     std::unique_ptr<OperationContext> _newOpCtx(Client* client) override;
 
     /**
+     * Kills the active operation on "client" if that operation is associated with operation id
+     * "opId".
+     *
+     * Returns true if an operation was killed.
+     *
+     * Must only be called by a thread owning both this service context's mutex and the
+     * client's.
+     */
+    bool _killOperationsAssociatedWithClientAndOpId_inlock(Client* client, unsigned int opId);
+
+    /**
     * Kills the given operation.
     *
     * Caller must own the service context's _mutex.
     */
-    void _killOperation_inlock(OperationContext* opCtx, ErrorCodes::Error killCode);
+    void _killOperation_inlock(OperationContext* opCtx);
 
     bool _globalKill;
diff --git a/src/mongo/db/service_context_noop.cpp b/src/mongo/db/service_context_noop.cpp
index fd94b35db89..65184906442 100644
--- a/src/mongo/db/service_context_noop.cpp
+++ b/src/mongo/db/service_context_noop.cpp
@@ -79,8 +79,7 @@ bool ServiceContextNoop::killOperation(unsigned int opId) {
     return false;
 }
 
-void ServiceContextNoop::killAllUserOperations(const OperationContext* txn,
-                                               ErrorCodes::Error killCode) {}
+void ServiceContextNoop::killAllUserOperations(const OperationContext* txn) {}
 
 void ServiceContextNoop::registerKillOpListener(KillOpListenerInterface* listener) {}
diff --git a/src/mongo/db/service_context_noop.h b/src/mongo/db/service_context_noop.h
index 62e6a169896..0e74b61b7a2 100644
--- a/src/mongo/db/service_context_noop.h
+++ b/src/mongo/db/service_context_noop.h
@@ -49,7 +49,7 @@ public:
 
     bool killOperation(unsigned int opId) override;
 
-    void killAllUserOperations(const OperationContext* txn, ErrorCodes::Error killCode) override;
+    void killAllUserOperations(const OperationContext* txn) override;
 
     void setKillAllOperations() override;
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
index 5686476e1a3..5943cc6e255 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set.cpp
@@ -389,6 +389,8 @@ StatusWith<OpTimePair<DatabaseType>> CatalogManagerReplicaSet::_fetchDatabaseMet
 
 StatusWith<OpTimePair<CollectionType>> CatalogManagerReplicaSet::getCollection(
     OperationContext* txn, const std::string& collNs) {
+    auto configShard = grid.shardRegistry()->getShard(txn, "config");
+
     auto statusFind = _exhaustiveFindOnConfig(txn,
                                               kConfigReadSelector,
                                               NamespaceString(CollectionType::ConfigNS),
diff --git a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
index 9bf44eaa09c..b21ff63961f 100644
--- a/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
+++ b/src/mongo/s/catalog/replset/catalog_manager_replica_set_write_retry_test.cpp
@@ -93,7 +93,7 @@ TEST_F(InsertRetryTest, RetryOnInterruptedAndNetworkErrorSuccess) {
     onCommand([&](const RemoteCommandRequest& request) {
         ASSERT_EQ(request.target, kTestHosts[0]);
         configTargeter()->setFindHostReturnValue({kTestHosts[1]});
-        return Status(ErrorCodes::InterruptedDueToReplStateChange, "Interruption");
+        return Status(ErrorCodes::Interrupted, "Interruption");
     });
 
     onCommand([&](const RemoteCommandRequest& request) {
@@ -271,7 +271,7 @@ TEST_F(UpdateRetryTest, OperationInterruptedDueToPrimaryStepDown) {
 
         auto writeErrDetail = stdx::make_unique<WriteErrorDetail>();
         writeErrDetail->setIndex(0);
-        writeErrDetail->setErrCode(ErrorCodes::InterruptedDueToReplStateChange);
+        writeErrDetail->setErrCode(ErrorCodes::Interrupted);
         writeErrDetail->setErrMessage("Operation interrupted");
         response.addToErrDetails(writeErrDetail.release());
diff --git a/src/mongo/s/client/shard_registry.cpp b/src/mongo/s/client/shard_registry.cpp
index e5b91dda83a..55121f42209 100644
--- a/src/mongo/s/client/shard_registry.cpp
+++ b/src/mongo/s/client/shard_registry.cpp
@@ -126,7 +126,6 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kNotMasterErrors{ErrorCodes::N
 const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::NotMaster,
     ErrorCodes::NotMasterNoSlaveOk,
-    ErrorCodes::NotMasterOrSecondary,
     // If write concern failed to be satisfied on the remote server, this most probably means that
     // some of the secondary nodes were unreachable or otherwise unresponsive, so the call is safe
    // to be retried if idempotency can be guaranteed.
@@ -134,7 +133,10 @@ const ShardRegistry::ErrorCodesSet ShardRegistry::kAllRetriableErrors{
     ErrorCodes::HostUnreachable,
     ErrorCodes::HostNotFound,
     ErrorCodes::NetworkTimeout,
-    ErrorCodes::InterruptedDueToReplStateChange};
+    // This set includes interrupted because replica set step down kills all server operations
+    // before it closes connections so it may happen that the caller actually receives the
+    // interruption.
+    ErrorCodes::Interrupted};
 
 ShardRegistry::ShardRegistry(std::unique_ptr<RemoteCommandTargeterFactory> targeterFactory,
                              std::unique_ptr<executor::TaskExecutorPool> executorPool,
@@ -780,8 +782,7 @@ StatusWith<ShardRegistry::CommandResponse> ShardRegistry::_runCommandWithMetadat
 void ShardRegistry::updateReplSetMonitor(const std::shared_ptr<RemoteCommandTargeter>& targeter,
                                          const HostAndPort& remoteHost,
                                          const Status& remoteCommandStatus) {
-    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code()) ||
-        (remoteCommandStatus == ErrorCodes::InterruptedDueToReplStateChange)) {
+    if (ErrorCodes::isNotMasterError(remoteCommandStatus.code())) {
         targeter->markHostNotMaster(remoteHost);
     } else if (ErrorCodes::isNetworkError(remoteCommandStatus.code())) {
         targeter->markHostUnreachable(remoteHost);
diff --git a/src/mongo/s/query/async_results_merger.cpp b/src/mongo/s/query/async_results_merger.cpp
index c82de6a3bbe..8b23528a04a 100644
--- a/src/mongo/s/query/async_results_merger.cpp
+++ b/src/mongo/s/query/async_results_merger.cpp
@@ -51,6 +51,15 @@ namespace {
 // Maximum number of retries for network and replication notMaster errors (per host).
 const int kMaxNumFailedHostRetryAttempts = 3;
 
+/**
+ * Returns whether a particular error code returned from the initial cursor establishment should
+ * be retried.
+ */
+bool isPerShardRetriableError(ErrorCodes::Error err) {
+    return (ShardRegistry::kAllRetriableErrors.count(err) ||
+            err == ErrorCodes::NotMasterOrSecondary);
+}
+
 }  // namespace
 
 AsyncResultsMerger::AsyncResultsMerger(executor::TaskExecutor* executor,
@@ -429,7 +438,8 @@ void AsyncResultsMerger::handleBatchResponse(
     if (!cursorResponseStatus.isOK()) {
         // Notify the shard registry of the failure.
         if (remote.shardId) {
-            auto shard = grid.shardRegistry()->getShardNoReload(*remote.shardId);
+            // TODO: Pass down an OperationContext* to use here.
+            auto shard = grid.shardRegistry()->getShard(nullptr, *remote.shardId);
             if (!shard) {
                 remote.status = Status(cursorResponseStatus.getStatus().code(),
                                        str::stream() << "Could not find shard " << *remote.shardId
@@ -443,10 +453,7 @@ void AsyncResultsMerger::handleBatchResponse(
 
     // If the error is retriable, schedule another request.
     if (!remote.cursorId && remote.retryCount < kMaxNumFailedHostRetryAttempts &&
-        ShardRegistry::kAllRetriableErrors.count(cursorResponseStatus.getStatus().code())) {
-        LOG(1) << "Initial cursor establishment failed with retriable error and will be retried"
-               << causedBy(cursorResponseStatus.getStatus());
-
+        isPerShardRetriableError(cursorResponseStatus.getStatus().code())) {
         ++remote.retryCount;
 
         // Since we potentially updated the targeter that the last host it chose might be
@@ -634,13 +641,13 @@ Status AsyncResultsMerger::RemoteCursorData::resolveShardIdToHostAndPort(
     invariant(shardId);
     invariant(!cursorId);
 
-    const auto shard = grid.shardRegistry()->getShardNoReload(*shardId);
+    // TODO: Pass down an OperationContext* to use here.
+    const auto shard = grid.shardRegistry()->getShard(nullptr, *shardId);
     if (!shard) {
        return Status(ErrorCodes::ShardNotFound,
                      str::stream() << "Could not find shard " << *shardId);
     }
 
-    // TODO: Pass down an OperationContext* to use here.
     auto findHostStatus = shard->getTargeter()->findHost(
         readPref, RemoteCommandTargeter::selectFindHostMaxWaitTime(nullptr));
     if (!findHostStatus.isOK()) {