-rw-r--r-- jstests/sharding/max_time_ms_sharded.js            | 66
-rw-r--r-- src/mongo/db/s/balancer/migration_manager.cpp      |  6
-rw-r--r-- src/mongo/db/s/balancer/migration_manager_test.cpp | 75
-rw-r--r-- src/mongo/s/client/SConscript                      |  1
-rw-r--r-- src/mongo/s/client/shard.cpp                       |  8
-rw-r--r-- src/mongo/s/client/shard.h                         |  8
-rw-r--r-- src/mongo/s/client/shard_remote.cpp                |  1
7 files changed, 122 insertions(+), 43 deletions(-)
diff --git a/jstests/sharding/max_time_ms_sharded.js b/jstests/sharding/max_time_ms_sharded.js
index 329a49f599c..b551826ddfe 100644
--- a/jstests/sharding/max_time_ms_sharded.js
+++ b/jstests/sharding/max_time_ms_sharded.js
@@ -13,7 +13,6 @@
var shards = [st.shard0, st.shard1];
var coll = mongos.getCollection("foo.bar");
var admin = mongos.getDB("admin");
- var exceededTimeLimit = 50; // ErrorCodes::ExceededTimeLimit
var cursor;
var res;
@@ -117,12 +116,11 @@
// Positive test for "validate".
configureMaxTimeAlwaysTimeOut("alwaysOn");
- res = coll.runCommand("validate", {maxTimeMS: 60 * 1000});
- assert.commandFailed(
- res, "expected validate to fail in mongod due to maxTimeAlwaysTimeOut fail point");
- assert.eq(res["code"],
- exceededTimeLimit,
- "expected code " + exceededTimeLimit + " from validate, instead got: " + tojson(res));
+ assert.commandFailedWithCode(
+ coll.runCommand("validate", {maxTimeMS: 60 * 1000}),
+ ErrorCodes.ExceededTimeLimit,
+ "expected validate to fail with code " + ErrorCodes.ExceededTimeLimit +
+ " due to maxTimeAlwaysTimeOut fail point");
// Negative test for "validate".
configureMaxTimeAlwaysTimeOut("off");
@@ -131,12 +129,11 @@
// Positive test for "count".
configureMaxTimeAlwaysTimeOut("alwaysOn");
- res = coll.runCommand("count", {maxTimeMS: 60 * 1000});
- assert.commandFailed(res,
- "expected count to fail in mongod due to maxTimeAlwaysTimeOut fail point");
- assert.eq(res["code"],
- exceededTimeLimit,
- "expected code " + exceededTimeLimit + " from count , instead got: " + tojson(res));
+ assert.commandFailedWithCode(
+ coll.runCommand("count", {maxTimeMS: 60 * 1000}),
+ ErrorCodes.ExceededTimeLimit,
+ "expected count to fail with code " + ErrorCodes.ExceededTimeLimit +
+ " due to maxTimeAlwaysTimeOut fail point");
// Negative test for "count".
configureMaxTimeAlwaysTimeOut("off");
@@ -145,13 +142,11 @@
// Positive test for "collStats".
configureMaxTimeAlwaysTimeOut("alwaysOn");
- res = coll.runCommand("collStats", {maxTimeMS: 60 * 1000});
- assert.commandFailed(
- res, "expected collStats to fail in mongod due to maxTimeAlwaysTimeOut fail point");
- assert.eq(
- res["code"],
- exceededTimeLimit,
- "expected code " + exceededTimeLimit + " from collStats, instead got: " + tojson(res));
+ assert.commandFailedWithCode(
+ coll.runCommand("collStats", {maxTimeMS: 60 * 1000}),
+ ErrorCodes.ExceededTimeLimit,
+ "expected collStats to fail with code " + ErrorCodes.ExceededTimeLimit +
+ " due to maxTimeAlwaysTimeOut fail point");
// Negative test for "collStats".
configureMaxTimeAlwaysTimeOut("off");
@@ -170,12 +165,11 @@
out: {inline: 1},
maxTimeMS: 60 * 1000
});
- assert.commandFailed(
- res, "expected mapReduce to fail in mongod due to maxTimeAlwaysTimeOut fail point");
- assert.eq(
- res["code"],
- exceededTimeLimit,
- "expected code " + exceededTimeLimit + " from mapReduce, instead got: " + tojson(res));
+ assert.commandFailedWithCode(
+ res,
+ ErrorCodes.ExceededTimeLimit,
+ "expected mapReduce to fail with code " + ErrorCodes.ExceededTimeLimit +
+ " due to maxTimeAlwaysTimeOut fail point, but instead got: " + tojson(res));
// Negative test for "mapReduce".
configureMaxTimeAlwaysTimeOut("off");
@@ -193,13 +187,11 @@
// Positive test for "aggregate".
configureMaxTimeAlwaysTimeOut("alwaysOn");
- res = coll.runCommand("aggregate", {pipeline: [], maxTimeMS: 60 * 1000});
- assert.commandFailed(
- res, "expected aggregate to fail in mongod due to maxTimeAlwaysTimeOut fail point");
- assert.eq(
- res["code"],
- exceededTimeLimit,
- "expected code " + exceededTimeLimit + " from aggregate , instead got: " + tojson(res));
+ assert.commandFailedWithCode(
+ coll.runCommand("aggregate", {pipeline: [], maxTimeMS: 60 * 1000}),
+ ErrorCodes.ExceededTimeLimit,
+ "expected aggregate to fail with code " + ErrorCodes.ExceededTimeLimit +
+ " due to maxTimeAlwaysTimeOut fail point");
// Negative test for "aggregate".
configureMaxTimeAlwaysTimeOut("off");
@@ -215,11 +207,9 @@
maxTimeMS: 1000 * 60 * 60 * 24
});
assert.commandFailed(
- res, "expected moveChunk to fail in mongod due to maxTimeAlwaysTimeOut fail point");
- assert.eq(
- res["code"],
- exceededTimeLimit,
- "expected code " + exceededTimeLimit + " from moveChunk, instead got: " + tojson(res));
+ res,
+ "expected moveChunk to fail due to maxTimeAlwaysTimeOut fail point, but instead got: " +
+ tojson(res));
// Negative test for "moveChunk".
configureMaxTimeAlwaysTimeOut("off");
diff --git a/src/mongo/db/s/balancer/migration_manager.cpp b/src/mongo/db/s/balancer/migration_manager.cpp
index 82ace936e49..7ac13446e5f 100644
--- a/src/mongo/db/s/balancer/migration_manager.cpp
+++ b/src/mongo/db/s/balancer/migration_manager.cpp
@@ -44,7 +44,6 @@
#include "mongo/rpc/get_status_from_command_result.h"
#include "mongo/s/catalog/sharding_catalog_client.h"
#include "mongo/s/client/shard_registry.h"
-#include "mongo/s/client/shard_registry.h"
#include "mongo/s/grid.h"
#include "mongo/s/move_chunk_request.h"
#include "mongo/s/sharding_raii.h"
@@ -765,6 +764,11 @@ Status MigrationManager::_processRemoteCommandResponse(
commandStatus = remoteCommandResponse.status;
} else {
commandStatus = extractMigrationStatusFromCommandResponse(remoteCommandResponse.data);
+ if (!Shard::shouldErrorBePropagated(commandStatus.code())) {
+ commandStatus = {ErrorCodes::OperationFailed,
+ stream() << "moveChunk command failed on source shard."
+ << causedBy(commandStatus)};
+ }
}
// Any failure to remove the migration document should be because the config server is
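
[Editor's note] The hunk above is the core behavioral change of this commit. Below is a minimal, self-contained sketch of the rule it implements; the types, error list, and helper names are illustrative stand-ins, not the MongoDB sources.

#include <algorithm>
#include <array>
#include <cassert>
#include <string>

// Stand-ins for mongo::ErrorCodes and mongo::Status.
enum class Code { OperationFailed, ExceededTimeLimit, NotMasterOrSecondary, HostUnreachable, InvalidOptions };

struct Status {
    Code code;
    std::string reason;
};

// Stand-in for RemoteCommandRetryScheduler::kAllRetriableErrors.
const std::array<Code, 2> kRetriable = {Code::NotMasterOrSecondary, Code::HostUnreachable};

// Mirrors Shard::shouldErrorBePropagated (added later in this diff): propagate
// only errors that are neither retriable nor replset-monitor-updating.
bool shouldErrorBePropagated(Code code) {
    return std::find(kRetriable.begin(), kRetriable.end(), code) == kRetriable.end() &&
        code != Code::ExceededTimeLimit;
}

// Mirrors the hunk above: a non-propagatable error from the remote moveChunk
// is masked as OperationFailed, with the original status kept as the cause.
Status processRemoteMoveChunkStatus(Status remote) {
    if (!shouldErrorBePropagated(remote.code)) {
        return {Code::OperationFailed,
                "moveChunk command failed on source shard. :: caused by :: " + remote.reason};
    }
    return remote;
}

int main() {
    // Retriable and replset-monitor-updating errors are converted...
    assert(processRemoteMoveChunkStatus({Code::NotMasterOrSecondary, "stepdown"}).code ==
           Code::OperationFailed);
    assert(processRemoteMoveChunkStatus({Code::ExceededTimeLimit, "timed out"}).code ==
           Code::OperationFailed);
    // ...while ordinary command failures flow back to the caller unchanged.
    assert(processRemoteMoveChunkStatus({Code::InvalidOptions, "bad option"}).code ==
           Code::InvalidOptions);
    return 0;
}
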
diff --git a/src/mongo/db/s/balancer/migration_manager_test.cpp b/src/mongo/db/s/balancer/migration_manager_test.cpp
index 9d65e740fe2..9e6b8aa1eac 100644
--- a/src/mongo/db/s/balancer/migration_manager_test.cpp
+++ b/src/mongo/db/s/balancer/migration_manager_test.cpp
@@ -745,8 +745,9 @@ TEST_F(MigrationManagerTest, InterruptMigration) {
// up a dummy host for kShardHost0.
shardTargeterMock(txn.get(), kShardId0)->setFindHostReturnValue(kShardHost0);
- ASSERT_NOT_OK(_migrationManager->executeManualMigration(
- txn.get(), {kShardId1, chunk}, 0, kDefaultSecondaryThrottle, false));
+ ASSERT_EQ(ErrorCodes::BalancerInterrupted,
+ _migrationManager->executeManualMigration(
+ txn.get(), {kShardId1, chunk}, 0, kDefaultSecondaryThrottle, false));
});
// Wait till the move chunk request gets sent and pretend that it is stuck by never responding
@@ -768,8 +769,9 @@ TEST_F(MigrationManagerTest, InterruptMigration) {
future.timed_get(kFutureTimeout);
// Ensure that no new migrations can be scheduled
- ASSERT_NOT_OK(_migrationManager->executeManualMigration(
- operationContext(), {kShardId1, chunk}, 0, kDefaultSecondaryThrottle, false));
+ ASSERT_EQ(ErrorCodes::BalancerInterrupted,
+ _migrationManager->executeManualMigration(
+ operationContext(), {kShardId1, chunk}, 0, kDefaultSecondaryThrottle, false));
// Ensure that the migration manager is no longer handling any migrations.
_migrationManager->drainActiveMigrations();
@@ -949,5 +951,70 @@ TEST_F(MigrationManagerTest, FailMigrationRecovery) {
// distributed locks are unlocked.
}
+// Checks that retriable errors and errors that trigger a replset monitor update, when returned
+// from remote moveChunk commands sent to source shards, are not passed back to the caller
+// (mongos), but are instead converted into OperationFailed errors.
+TEST_F(MigrationManagerTest, RemoteCallErrorConversionToOperationFailed) {
+ // Set up two shards in the metadata.
+ ASSERT_OK(catalogClient()->insertConfigDocument(
+ operationContext(), ShardType::ConfigNS, kShard0, kMajorityWriteConcern));
+ ASSERT_OK(catalogClient()->insertConfigDocument(
+ operationContext(), ShardType::ConfigNS, kShard2, kMajorityWriteConcern));
+
+ // Set up the database and collection as sharded in the metadata.
+ std::string dbName = "foo";
+ std::string collName = "foo.bar";
+ ChunkVersion version(1, 0, OID::gen());
+
+ setUpDatabase(dbName, kShardId0);
+ setUpCollection(collName, version);
+
+ // Set up two chunks in the metadata.
+ ChunkType chunk1 =
+ setUpChunk(collName, kKeyPattern.globalMin(), BSON(kPattern << 49), kShardId0, version);
+ version.incMinor();
+ ChunkType chunk2 =
+ setUpChunk(collName, BSON(kPattern << 49), kKeyPattern.globalMax(), kShardId2, version);
+
+ auto future = launchAsync([&] {
+ Client::initThreadIfNotAlready("Test");
+ auto txn = cc().makeOperationContext();
+
+ // Scheduling the moveChunk commands requires finding a host to which to send the command.
+ // Set up dummy hosts for the source shards.
+ shardTargeterMock(txn.get(), kShardId0)->setFindHostReturnValue(kShardHost0);
+ shardTargeterMock(txn.get(), kShardId2)->setFindHostReturnValue(kShardHost2);
+
+ MigrationStatuses migrationStatuses = _migrationManager->executeMigrationsForAutoBalance(
+ txn.get(),
+ {{kShardId1, chunk1}, {kShardId3, chunk2}},
+ 0,
+ kDefaultSecondaryThrottle,
+ false);
+
+ ASSERT_EQ(ErrorCodes::OperationFailed, migrationStatuses.at(chunk1.getName()));
+ ASSERT_EQ(ErrorCodes::OperationFailed, migrationStatuses.at(chunk2.getName()));
+ });
+
+ // Expect a moveChunk command that will fail with a retriable error.
+ expectMoveChunkCommand(
+ chunk1,
+ kShardId1,
+ false,
+ Status(ErrorCodes::NotMasterOrSecondary,
+ "RemoteCallErrorConversionToOperationFailedCheck generated error."));
+
+ // Expect a moveChunk command that will fail with a replset monitor updating error.
+ expectMoveChunkCommand(
+ chunk2,
+ kShardId3,
+ false,
+ Status(ErrorCodes::ExceededTimeLimit,
+ "RemoteCallErrorConversionToOperationFailedCheck generated error."));
+
+ // Run the MigrationManager code.
+ future.timed_get(kFutureTimeout);
+}
+
} // namespace
} // namespace mongo
diff --git a/src/mongo/s/client/SConscript b/src/mongo/s/client/SConscript
index 6fe11753411..573d7a5c70b 100644
--- a/src/mongo/s/client/SConscript
+++ b/src/mongo/s/client/SConscript
@@ -77,6 +77,7 @@ env.Library(
],
LIBDEPS=[
'$BUILD_DIR/mongo/base',
+ '$BUILD_DIR/mongo/client/remote_command_retry_scheduler',
'$BUILD_DIR/mongo/s/shard_id',
'$BUILD_DIR/mongo/s/write_ops/batch_write_types',
]
diff --git a/src/mongo/s/client/shard.cpp b/src/mongo/s/client/shard.cpp
index bc1e791e3db..be61c500604 100644
--- a/src/mongo/s/client/shard.cpp
+++ b/src/mongo/s/client/shard.cpp
@@ -30,6 +30,8 @@
#include "mongo/platform/basic.h"
+#include <algorithm>
+#include "mongo/client/remote_command_retry_scheduler.h"
#include "mongo/db/operation_context.h"
#include "mongo/s/client/shard.h"
#include "mongo/s/client/shard_registry.h"
@@ -92,6 +93,13 @@ Status Shard::CommandResponse::processBatchWriteResponse(
const Milliseconds Shard::kDefaultConfigCommandTimeout = Seconds{30};
+bool Shard::shouldErrorBePropagated(ErrorCodes::Error code) {
+ return std::find(RemoteCommandRetryScheduler::kAllRetriableErrors.begin(),
+ RemoteCommandRetryScheduler::kAllRetriableErrors.end(),
+ code) == RemoteCommandRetryScheduler::kAllRetriableErrors.end() &&
+ code != ErrorCodes::ExceededTimeLimit;
+}
+
Shard::Shard(const ShardId& id) : _id(id) {}
const ShardId Shard::getId() const {
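
[Editor's note] Side note on the implementation just above: the std::find test reads inverted (found-equals-end means "not retriable"). An equivalent formulation with std::none_of, sketched against the same in-tree names and untested, may read more directly:

#include <algorithm>

bool Shard::shouldErrorBePropagated(ErrorCodes::Error code) {
    const auto& retriable = RemoteCommandRetryScheduler::kAllRetriableErrors;
    // Propagate only if the code is not retriable and does not trigger a
    // replica set monitor update (ExceededTimeLimit).
    return std::none_of(retriable.begin(),
                        retriable.end(),
                        [code](ErrorCodes::Error e) { return e == code; }) &&
        code != ErrorCodes::ExceededTimeLimit;
}
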
diff --git a/src/mongo/s/client/shard.h b/src/mongo/s/client/shard.h
index 723a66d8ca9..26413b61c6d 100644
--- a/src/mongo/s/client/shard.h
+++ b/src/mongo/s/client/shard.h
@@ -223,6 +223,14 @@ public:
// explicitly overridden
static const Milliseconds kDefaultConfigCommandTimeout;
+ /**
+ * Returns false if the error is a retriable error or an error that triggers a replset monitor
+ * update. Such errors, if received from a remote call, should not be propagated back to another
+ * server, because that server would interpret them as originating on this server rather than on
+ * the one this server called.
+ */
+ static bool shouldErrorBePropagated(ErrorCodes::Error code);
+
protected:
struct HostWithResponse {
HostWithResponse(boost::optional<HostAndPort> _host,
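
[Editor's note] To make the rationale in the comment above concrete, an illustrative trace (not from the sources):

// mongos  --moveChunk-->  config server (balancer)  --moveChunk-->  source shard
//                                                   <-- NotMasterOrSecondary --
//
// If the balancer forwarded NotMasterOrSecondary verbatim, mongos would apply
// its retry and replica-set-monitor handling to the config server, which is
// healthy, rather than to the source shard that actually stepped down.
// Rewrapping as OperationFailed keeps those semantics attached to the right hop.
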
diff --git a/src/mongo/s/client/shard_remote.cpp b/src/mongo/s/client/shard_remote.cpp
index 64d4f58ff1f..66392dd4481 100644
--- a/src/mongo/s/client/shard_remote.cpp
+++ b/src/mongo/s/client/shard_remote.cpp
@@ -118,6 +118,7 @@ const ConnectionString ShardRemote::getConnString() const {
return _targeter->connectionString();
}
+// Error code changes made here should also be reflected in Shard::shouldErrorBePropagated!
void ShardRemote::updateReplSetMonitor(const HostAndPort& remoteHost,
const Status& remoteCommandStatus) {
if (remoteCommandStatus.isOK())