-rw-r--r-- | jstests/sharding/max_time_ms_sharded.js            | 66
-rw-r--r-- | src/mongo/db/s/balancer/migration_manager.cpp      |  6
-rw-r--r-- | src/mongo/db/s/balancer/migration_manager_test.cpp | 75
-rw-r--r-- | src/mongo/s/client/SConscript                      |  1
-rw-r--r-- | src/mongo/s/client/shard.cpp                       |  8
-rw-r--r-- | src/mongo/s/client/shard.h                         |  8
-rw-r--r-- | src/mongo/s/client/shard_remote.cpp                |  1
7 files changed, 122 insertions, 43 deletions
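
The core of this change is the new Shard::shouldErrorBePropagated predicate (added in shard.cpp below), which the balancer uses to decide whether an error returned by a remote moveChunk may be surfaced to its own caller. The following compile-and-run sketch mirrors the shape of that predicate. Everything in it is illustrative: the Error enum and the membership of kAllRetriableErrors are placeholders, not the real mongo::ErrorCodes values or the actual RemoteCommandRetryScheduler::kAllRetriableErrors list.

    #include <algorithm>
    #include <array>
    #include <cassert>

    // Placeholder error codes. The real predicate takes mongo::ErrorCodes::Error.
    enum class Error {
        HostUnreachable,
        NetworkTimeout,
        NotMaster,
        NotMasterOrSecondary,
        ExceededTimeLimit,
        ChunkTooBig,
        OperationFailed,
    };

    // Illustrative stand-in for RemoteCommandRetryScheduler::kAllRetriableErrors:
    // codes that trigger a retry and/or a replica set monitor update.
    constexpr std::array<Error, 4> kAllRetriableErrors = {
        Error::HostUnreachable, Error::NetworkTimeout, Error::NotMaster,
        Error::NotMasterOrSecondary};

    // Same shape as the new Shard::shouldErrorBePropagated below: a remote error
    // may be propagated to our own caller only if it is neither retriable nor
    // ExceededTimeLimit, so the caller cannot mistake it for a locally raised error.
    bool shouldErrorBePropagated(Error code) {
        return std::find(kAllRetriableErrors.begin(), kAllRetriableErrors.end(), code) ==
            kAllRetriableErrors.end() &&
            code != Error::ExceededTimeLimit;
    }

    int main() {
        assert(!shouldErrorBePropagated(Error::NotMasterOrSecondary));  // retriable: suppress
        assert(!shouldErrorBePropagated(Error::ExceededTimeLimit));     // RSM-updating: suppress
        assert(shouldErrorBePropagated(Error::ChunkTooBig));            // genuine failure: propagate
    }
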
diff --git a/jstests/sharding/max_time_ms_sharded.js b/jstests/sharding/max_time_ms_sharded.js
index 329a49f599c..b551826ddfe 100644
--- a/jstests/sharding/max_time_ms_sharded.js
+++ b/jstests/sharding/max_time_ms_sharded.js
@@ -13,7 +13,6 @@
     var shards = [st.shard0, st.shard1];
     var coll = mongos.getCollection("foo.bar");
     var admin = mongos.getDB("admin");
-    var exceededTimeLimit = 50;  // ErrorCodes::ExceededTimeLimit
     var cursor;
     var res;
 
@@ -117,12 +116,11 @@
     // Positive test for "validate".
     configureMaxTimeAlwaysTimeOut("alwaysOn");
-    res = coll.runCommand("validate", {maxTimeMS: 60 * 1000});
-    assert.commandFailed(
-        res, "expected validate to fail in mongod due to maxTimeAlwaysTimeOut fail point");
-    assert.eq(res["code"],
-              exceededTimeLimit,
-              "expected code " + exceededTimeLimit + " from validate, instead got: " + tojson(res));
+    assert.commandFailedWithCode(
+        coll.runCommand("validate", {maxTimeMS: 60 * 1000}),
+        ErrorCodes.ExceededTimeLimit,
+        "expected validate to fail with code " + ErrorCodes.ExceededTimeLimit +
+            " due to the maxTimeAlwaysTimeOut fail point");
 
     // Negative test for "validate".
     configureMaxTimeAlwaysTimeOut("off");
@@ -131,12 +129,11 @@
     // Positive test for "count".
     configureMaxTimeAlwaysTimeOut("alwaysOn");
-    res = coll.runCommand("count", {maxTimeMS: 60 * 1000});
-    assert.commandFailed(res,
-                         "expected count to fail in mongod due to maxTimeAlwaysTimeOut fail point");
-    assert.eq(res["code"],
-              exceededTimeLimit,
-              "expected code " + exceededTimeLimit + " from count , instead got: " + tojson(res));
+    assert.commandFailedWithCode(
+        coll.runCommand("count", {maxTimeMS: 60 * 1000}),
+        ErrorCodes.ExceededTimeLimit,
+        "expected count to fail with code " + ErrorCodes.ExceededTimeLimit +
+            " due to the maxTimeAlwaysTimeOut fail point");
 
     // Negative test for "count".
     configureMaxTimeAlwaysTimeOut("off");
@@ -145,13 +142,11 @@
     // Positive test for "collStats".
     configureMaxTimeAlwaysTimeOut("alwaysOn");
-    res = coll.runCommand("collStats", {maxTimeMS: 60 * 1000});
-    assert.commandFailed(
-        res, "expected collStats to fail in mongod due to maxTimeAlwaysTimeOut fail point");
-    assert.eq(
-        res["code"],
-        exceededTimeLimit,
-        "expected code " + exceededTimeLimit + " from collStats, instead got: " + tojson(res));
+    assert.commandFailedWithCode(
+        coll.runCommand("collStats", {maxTimeMS: 60 * 1000}),
+        ErrorCodes.ExceededTimeLimit,
+        "expected collStats to fail with code " + ErrorCodes.ExceededTimeLimit +
+            " due to the maxTimeAlwaysTimeOut fail point");
 
     // Negative test for "collStats".
     configureMaxTimeAlwaysTimeOut("off");
@@ -170,12 +165,11 @@
         out: {inline: 1},
         maxTimeMS: 60 * 1000
     });
-    assert.commandFailed(
-        res, "expected mapReduce to fail in mongod due to maxTimeAlwaysTimeOut fail point");
-    assert.eq(
-        res["code"],
-        exceededTimeLimit,
-        "expected code " + exceededTimeLimit + " from mapReduce, instead got: " + tojson(res));
+    assert.commandFailedWithCode(
+        res,
+        ErrorCodes.ExceededTimeLimit,
+        "expected mapReduce to fail with code " + ErrorCodes.ExceededTimeLimit +
+            " due to maxTimeAlwaysTimeOut fail point, but instead got: " + tojson(res));
 
     // Negative test for "mapReduce".
     configureMaxTimeAlwaysTimeOut("off");
@@ -193,13 +187,11 @@
     // Positive test for "aggregate".
     configureMaxTimeAlwaysTimeOut("alwaysOn");
-    res = coll.runCommand("aggregate", {pipeline: [], maxTimeMS: 60 * 1000});
-    assert.commandFailed(
-        res, "expected aggregate to fail in mongod due to maxTimeAlwaysTimeOut fail point");
-    assert.eq(
-        res["code"],
-        exceededTimeLimit,
-        "expected code " + exceededTimeLimit + " from aggregate , instead got: " + tojson(res));
+    assert.commandFailedWithCode(
+        coll.runCommand("aggregate", {pipeline: [], maxTimeMS: 60 * 1000}),
+        ErrorCodes.ExceededTimeLimit,
+        "expected aggregate to fail with code " + ErrorCodes.ExceededTimeLimit +
+            " due to the maxTimeAlwaysTimeOut fail point");
 
     // Negative test for "aggregate".
     configureMaxTimeAlwaysTimeOut("off");
@@ -215,11 +207,9 @@
         maxTimeMS: 1000 * 60 * 60 * 24
     });
     assert.commandFailed(
-        res, "expected moveChunk to fail in mongod due to maxTimeAlwaysTimeOut fail point");
-    assert.eq(
-        res["code"],
-        exceededTimeLimit,
-        "expected code " + exceededTimeLimit + " from moveChunk, instead got: " + tojson(res));
+        res,
+        "expected moveChunk to fail due to maxTimeAlwaysTimeOut fail point, but instead got: " +
+            tojson(res));
 
     // Negative test for "moveChunk".
     configureMaxTimeAlwaysTimeOut("off");
diff --git a/src/mongo/db/s/balancer/migration_manager.cpp b/src/mongo/db/s/balancer/migration_manager.cpp
index 82ace936e49..7ac13446e5f 100644
--- a/src/mongo/db/s/balancer/migration_manager.cpp
+++ b/src/mongo/db/s/balancer/migration_manager.cpp
@@ -44,7 +44,6 @@
 #include "mongo/rpc/get_status_from_command_result.h"
 #include "mongo/s/catalog/sharding_catalog_client.h"
 #include "mongo/s/client/shard_registry.h"
-#include "mongo/s/client/shard_registry.h"
 #include "mongo/s/grid.h"
 #include "mongo/s/move_chunk_request.h"
 #include "mongo/s/sharding_raii.h"
@@ -765,6 +764,11 @@ Status MigrationManager::_processRemoteCommandResponse(
         commandStatus = remoteCommandResponse.status;
     } else {
         commandStatus = extractMigrationStatusFromCommandResponse(remoteCommandResponse.data);
+        if (!Shard::shouldErrorBePropagated(commandStatus.code())) {
+            commandStatus = {ErrorCodes::OperationFailed,
+                             stream() << "moveChunk command failed on source shard."
+                                      << causedBy(commandStatus)};
+        }
     }
 
     // Any failure to remove the migration document should be because the config server is
diff --git a/src/mongo/db/s/balancer/migration_manager_test.cpp b/src/mongo/db/s/balancer/migration_manager_test.cpp
index 9d65e740fe2..9e6b8aa1eac 100644
--- a/src/mongo/db/s/balancer/migration_manager_test.cpp
+++ b/src/mongo/db/s/balancer/migration_manager_test.cpp
@@ -745,8 +745,9 @@ TEST_F(MigrationManagerTest, InterruptMigration) {
         // up a dummy host for kShardHost0.
         shardTargeterMock(txn.get(), kShardId0)->setFindHostReturnValue(kShardHost0);
 
-        ASSERT_NOT_OK(_migrationManager->executeManualMigration(
-            txn.get(), {kShardId1, chunk}, 0, kDefaultSecondaryThrottle, false));
+        ASSERT_EQ(ErrorCodes::BalancerInterrupted,
+                  _migrationManager->executeManualMigration(
+                      txn.get(), {kShardId1, chunk}, 0, kDefaultSecondaryThrottle, false));
     });
 
     // Wait till the move chunk request gets sent and pretend that it is stuck by never responding
@@ -768,8 +769,9 @@ TEST_F(MigrationManagerTest, InterruptMigration) {
     future.timed_get(kFutureTimeout);
 
     // Ensure that no new migrations can be scheduled
-    ASSERT_NOT_OK(_migrationManager->executeManualMigration(
-        operationContext(), {kShardId1, chunk}, 0, kDefaultSecondaryThrottle, false));
+    ASSERT_EQ(ErrorCodes::BalancerInterrupted,
+              _migrationManager->executeManualMigration(
+                  operationContext(), {kShardId1, chunk}, 0, kDefaultSecondaryThrottle, false));
 
     // Ensure that the migration manager is no longer handling any migrations.
     _migrationManager->drainActiveMigrations();
@@ -949,5 +951,70 @@ TEST_F(MigrationManagerTest, FailMigrationRecovery) {
     // distributed locks are unlocked.
 }
 
+// Check that retriable / replset monitor-altering errors returned from remote moveChunk commands
+// sent to source shards are not returned to the caller (mongos), but are instead converted into
+// OperationFailed errors.
+TEST_F(MigrationManagerTest, RemoteCallErrorConversionToOperationFailed) {
+    // Set up two shards in the metadata.
+    ASSERT_OK(catalogClient()->insertConfigDocument(
+        operationContext(), ShardType::ConfigNS, kShard0, kMajorityWriteConcern));
+    ASSERT_OK(catalogClient()->insertConfigDocument(
+        operationContext(), ShardType::ConfigNS, kShard2, kMajorityWriteConcern));
+
+    // Set up the database and collection as sharded in the metadata.
+    std::string dbName = "foo";
+    std::string collName = "foo.bar";
+    ChunkVersion version(1, 0, OID::gen());
+
+    setUpDatabase(dbName, kShardId0);
+    setUpCollection(collName, version);
+
+    // Set up two chunks in the metadata.
+    ChunkType chunk1 =
+        setUpChunk(collName, kKeyPattern.globalMin(), BSON(kPattern << 49), kShardId0, version);
+    version.incMinor();
+    ChunkType chunk2 =
+        setUpChunk(collName, BSON(kPattern << 49), kKeyPattern.globalMax(), kShardId2, version);
+
+    auto future = launchAsync([&] {
+        Client::initThreadIfNotAlready("Test");
+        auto txn = cc().makeOperationContext();
+
+        // Scheduling the moveChunk commands requires finding a host to which to send the command.
+        // Set up dummy hosts for the source shards.
+        shardTargeterMock(txn.get(), kShardId0)->setFindHostReturnValue(kShardHost0);
+        shardTargeterMock(txn.get(), kShardId2)->setFindHostReturnValue(kShardHost2);
+
+        MigrationStatuses migrationStatuses = _migrationManager->executeMigrationsForAutoBalance(
+            txn.get(),
+            {{kShardId1, chunk1}, {kShardId3, chunk2}},
+            0,
+            kDefaultSecondaryThrottle,
+            false);
+
+        ASSERT_EQ(ErrorCodes::OperationFailed, migrationStatuses.at(chunk1.getName()));
+        ASSERT_EQ(ErrorCodes::OperationFailed, migrationStatuses.at(chunk2.getName()));
+    });
+
+    // Expect a moveChunk command that will fail with a retriable error.
+    expectMoveChunkCommand(
+        chunk1,
+        kShardId1,
+        false,
+        Status(ErrorCodes::NotMasterOrSecondary,
+               "RemoteCallErrorConversionToOperationFailedCheck generated error."));
+
+    // Expect a moveChunk command that will fail with a replset monitor updating error.
+    expectMoveChunkCommand(
+        chunk2,
+        kShardId3,
+        false,
+        Status(ErrorCodes::ExceededTimeLimit,
+               "RemoteCallErrorConversionToOperationFailedCheck generated error."));
+
+    // Run the MigrationManager code.
+    future.timed_get(kFutureTimeout);
+}
+
 }  // namespace
 }  // namespace mongo
diff --git a/src/mongo/s/client/SConscript b/src/mongo/s/client/SConscript
index 6fe11753411..573d7a5c70b 100644
--- a/src/mongo/s/client/SConscript
+++ b/src/mongo/s/client/SConscript
@@ -77,6 +77,7 @@ env.Library(
     ],
     LIBDEPS=[
         '$BUILD_DIR/mongo/base',
+        '$BUILD_DIR/mongo/client/remote_command_retry_scheduler',
         '$BUILD_DIR/mongo/s/shard_id',
         '$BUILD_DIR/mongo/s/write_ops/batch_write_types',
     ]
diff --git a/src/mongo/s/client/shard.cpp b/src/mongo/s/client/shard.cpp
index bc1e791e3db..be61c500604 100644
--- a/src/mongo/s/client/shard.cpp
+++ b/src/mongo/s/client/shard.cpp
@@ -30,6 +30,7 @@
 
 #include "mongo/platform/basic.h"
 
+#include "mongo/client/remote_command_retry_scheduler.h"
 #include "mongo/db/operation_context.h"
 #include "mongo/s/client/shard.h"
 #include "mongo/s/client/shard_registry.h"
@@ -92,6 +93,13 @@ Status Shard::CommandResponse::processBatchWriteResponse(
 
 const Milliseconds Shard::kDefaultConfigCommandTimeout = Seconds{30};
 
+bool Shard::shouldErrorBePropagated(ErrorCodes::Error code) {
+    return std::find(RemoteCommandRetryScheduler::kAllRetriableErrors.begin(),
+                     RemoteCommandRetryScheduler::kAllRetriableErrors.end(),
+                     code) == RemoteCommandRetryScheduler::kAllRetriableErrors.end() &&
+        code != ErrorCodes::ExceededTimeLimit;
+}
+
 Shard::Shard(const ShardId& id) : _id(id) {}
 
 const ShardId Shard::getId() const {
diff --git a/src/mongo/s/client/shard.h b/src/mongo/s/client/shard.h
index 723a66d8ca9..26413b61c6d 100644
--- a/src/mongo/s/client/shard.h
+++ b/src/mongo/s/client/shard.h
@@ -223,6 +223,14 @@ public:
     // explicitly overridden
     static const Milliseconds kDefaultConfigCommandTimeout;
 
+    /**
+     * Returns false if the error is a retriable error and/or one that triggers a replica set
+     * monitor update. Such errors, if they come from a remote call, should not be propagated
+     * any further, because the receiving server would interpret them as originating on this
+     * server rather than on the server this one called.
+     */
+    static bool shouldErrorBePropagated(ErrorCodes::Error code);
+
 protected:
     struct HostWithResponse {
         HostWithResponse(boost::optional<HostAndPort> _host,
diff --git a/src/mongo/s/client/shard_remote.cpp b/src/mongo/s/client/shard_remote.cpp
index 64d4f58ff1f..66392dd4481 100644
--- a/src/mongo/s/client/shard_remote.cpp
+++ b/src/mongo/s/client/shard_remote.cpp
@@ -118,6 +118,7 @@ const ConnectionString ShardRemote::getConnString() const {
     return _targeter->connectionString();
 }
 
+// Any error code changes should possibly also be made to Shard::shouldErrorBePropagated!
 void ShardRemote::updateReplSetMonitor(const HostAndPort& remoteHost,
                                        const Status& remoteCommandStatus) {
     if (remoteCommandStatus.isOK())
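
For context on how the predicate is consumed: in MigrationManager::_processRemoteCommandResponse above, any status that must not be propagated is rewritten as OperationFailed, with the original error preserved in the reason text (via causedBy in the real code). Below is a self-contained sketch of that conversion step; the Status struct, the Error enum, and the non-propagatable list are again placeholder stand-ins for the real MongoDB types, repeated here so the example compiles on its own.

    #include <algorithm>
    #include <array>
    #include <cassert>
    #include <string>

    // Minimal stand-ins for mongo::Status / mongo::ErrorCodes (illustrative only).
    enum class Error { NotMasterOrSecondary, ExceededTimeLimit, ChunkTooBig, OperationFailed };

    struct Status {
        Error code;
        std::string reason;
    };

    // Stand-in for "retriable or ExceededTimeLimit" from the previous sketch.
    constexpr std::array<Error, 2> kNonPropagatable = {Error::NotMasterOrSecondary,
                                                       Error::ExceededTimeLimit};

    bool shouldErrorBePropagated(Error code) {
        return std::find(kNonPropagatable.begin(), kNonPropagatable.end(), code) ==
            kNonPropagatable.end();
    }

    // Sketch of the new branch in MigrationManager::_processRemoteCommandResponse:
    // a retriable / RSM-updating status from the remote moveChunk is rewritten as
    // OperationFailed, with the original status kept in the reason string.
    Status processMoveChunkStatus(Status commandStatus) {
        if (!shouldErrorBePropagated(commandStatus.code)) {
            return {Error::OperationFailed,
                    "moveChunk command failed on source shard. :: caused by :: " +
                        commandStatus.reason};
        }
        return commandStatus;
    }

    int main() {
        // Mirrors the new test: both NotMasterOrSecondary and ExceededTimeLimit
        // surface to the caller as OperationFailed...
        assert(processMoveChunkStatus({Error::NotMasterOrSecondary, "stale"}).code ==
               Error::OperationFailed);
        assert(processMoveChunkStatus({Error::ExceededTimeLimit, "timed out"}).code ==
               Error::OperationFailed);
        // ...while an ordinary migration failure passes through unchanged.
        assert(processMoveChunkStatus({Error::ChunkTooBig, "too big"}).code == Error::ChunkTooBig);
    }

This is the rationale behind the new RemoteCallErrorConversionToOperationFailed test above: if the original codes were forwarded verbatim, the caller (mongos) would misread them as describing its own connection to the config server rather than the config server's connection to the source shard.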