author     Dianna Hohensee <dianna.hohensee@10gen.com>   2017-11-20 13:24:03 -0500
committer  Dianna Hohensee <dianna.hohensee@10gen.com>   2017-11-28 10:33:46 -0500
commit     2d2a0ebee73a4a1652ae1f1373b2aa47856ddb55 (patch)
tree       8900278c15713cf8e0a6add221c1cf51e108a848
parent     42fe228abc27664f630cbafe932ffaf2b12eccb8 (diff)
SERVER-30226 Best effort make the recipient shard refresh its metadata after migration commit
-rw-r--r--  buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml    3
-rw-r--r--  jstests/aggregation/shard_targeting.js                                                 7
-rw-r--r--  jstests/sharding/cleanup_orphaned_cmd_prereload.js                                    40
-rw-r--r--  jstests/sharding/pending_chunk.js                                                      7
-rw-r--r--  src/mongo/db/s/migration_source_manager.cpp                                           46
5 files changed, 81 insertions(+), 22 deletions(-)
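
The updated tests below opt out of the new behavior through the doNotRefreshRecipientAfterCommit failpoint added in this patch. A minimal sketch of toggling that failpoint from a jstest, assuming 'st' is a ShardingTest as in the tests touched here:

    // Suppress the post-commit recipient refresh on shard0's primary mongod.
    assert.commandWorked(st.shard0.getDB('admin').runCommand(
        {configureFailPoint: 'doNotRefreshRecipientAfterCommit', mode: 'alwaysOn'}));

    // ... run migrations and inspect the recipient's metadata ...

    // Restore the default (refresh-after-commit) behavior.
    assert.commandWorked(st.shard0.getDB('admin').runCommand(
        {configureFailPoint: 'doNotRefreshRecipientAfterCommit', mode: 'off'}));
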
diff --git a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
index 75586a2f738..5bedb8b9c95 100644
--- a/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
+++ b/buildscripts/resmokeconfig/suites/sharding_last_stable_mongos_and_mixed_shards.yml
@@ -102,6 +102,9 @@ selector:
- jstests/sharding/read_pref_cmd.js
# SERVER-30444: Add a maxChunkSize field to the serverStatus command response
- jstests/sharding/server_status.js
+ # New failpoint in v3.6 shard mongod.
+ - jstests/sharding/cleanup_orphaned_cmd_prereload.js
+ - jstests/sharding/pending_chunk.js
executor:
config:
diff --git a/jstests/aggregation/shard_targeting.js b/jstests/aggregation/shard_targeting.js
index 409eb636b61..c78bb93a29a 100644
--- a/jstests/aggregation/shard_targeting.js
+++ b/jstests/aggregation/shard_targeting.js
@@ -38,6 +38,13 @@
const shard0DB = primaryShardDB = st.shard0.getDB(jsTestName());
const shard1DB = st.shard1.getDB(jsTestName());
+ // Turn off best-effort recipient metadata refresh post-migration commit on both shards because
+ // it creates non-determinism for the profiler.
+ assert.commandWorked(st.shard0.getDB('admin').runCommand(
+ {configureFailPoint: 'doNotRefreshRecipientAfterCommit', mode: 'alwaysOn'}));
+ assert.commandWorked(st.shard1.getDB('admin').runCommand(
+ {configureFailPoint: 'doNotRefreshRecipientAfterCommit', mode: 'alwaysOn'}));
+
assert.commandWorked(mongosDB.dropDatabase());
// Enable sharding on the test DB and ensure its primary is shard0000.
diff --git a/jstests/sharding/cleanup_orphaned_cmd_prereload.js b/jstests/sharding/cleanup_orphaned_cmd_prereload.js
index a5077faa7eb..c24179d0ff2 100644
--- a/jstests/sharding/cleanup_orphaned_cmd_prereload.js
+++ b/jstests/sharding/cleanup_orphaned_cmd_prereload.js
@@ -8,25 +8,26 @@ var mongos = st.s0;
var admin = mongos.getDB("admin");
var coll = mongos.getCollection("foo.bar");
-assert(admin.runCommand({enableSharding: coll.getDB() + ""}).ok);
+assert.commandWorked(admin.runCommand({enableSharding: coll.getDB() + ""}));
printjson(admin.runCommand({movePrimary: coll.getDB() + "", to: st.shard0.shardName}));
-assert(admin.runCommand({shardCollection: coll + "", key: {_id: 1}}).ok);
+assert.commandWorked(admin.runCommand({shardCollection: coll + "", key: {_id: 1}}));
+
+// Turn off best-effort recipient metadata refresh post-migration commit on both shards because it
+// would clean up the pending chunks on migration recipients.
+assert.commandWorked(st.shard0.getDB('admin').runCommand(
+ {configureFailPoint: 'doNotRefreshRecipientAfterCommit', mode: 'alwaysOn'}));
+assert.commandWorked(st.shard1.getDB('admin').runCommand(
+ {configureFailPoint: 'doNotRefreshRecipientAfterCommit', mode: 'alwaysOn'}));
jsTest.log("Moving some chunks to shard1...");
-assert(admin.runCommand({split: coll + "", middle: {_id: 0}}).ok);
-assert(admin.runCommand({split: coll + "", middle: {_id: 1}}).ok);
+assert.commandWorked(admin.runCommand({split: coll + "", middle: {_id: 0}}));
+assert.commandWorked(admin.runCommand({split: coll + "", middle: {_id: 1}}));
-assert(
- admin
- .runCommand(
- {moveChunk: coll + "", find: {_id: 0}, to: st.shard1.shardName, _waitForDelete: true})
- .ok);
-assert(
- admin
- .runCommand(
- {moveChunk: coll + "", find: {_id: 1}, to: st.shard1.shardName, _waitForDelete: true})
- .ok);
+assert.commandWorked(admin.runCommand(
+ {moveChunk: coll + "", find: {_id: 0}, to: st.shard1.shardName, _waitForDelete: true}));
+assert.commandWorked(admin.runCommand(
+ {moveChunk: coll + "", find: {_id: 1}, to: st.shard1.shardName, _waitForDelete: true}));
var metadata =
st.shard1.getDB("admin").runCommand({getShardVersion: coll + "", fullMetadata: true}).metadata;
@@ -44,8 +45,8 @@ assert(!st.shard1.getDB("admin")
jsTest.log("Moving some chunks back to shard0 after empty...");
-admin.runCommand(
- {moveChunk: coll + "", find: {_id: -1}, to: st.shard1.shardName, _waitForDelete: true});
+assert.commandWorked(admin.runCommand(
+ {moveChunk: coll + "", find: {_id: -1}, to: st.shard1.shardName, _waitForDelete: true}));
var metadata =
st.shard0.getDB("admin").runCommand({getShardVersion: coll + "", fullMetadata: true}).metadata;
@@ -56,11 +57,8 @@ assert.eq(metadata.shardVersion.t, 0);
assert.neq(metadata.collVersion.t, 0);
assert.eq(metadata.pending.length, 0);
-assert(
- admin
- .runCommand(
- {moveChunk: coll + "", find: {_id: 1}, to: st.shard0.shardName, _waitForDelete: true})
- .ok);
+assert.commandWorked(admin.runCommand(
+ {moveChunk: coll + "", find: {_id: 1}, to: st.shard0.shardName, _waitForDelete: true}));
var metadata =
st.shard0.getDB("admin").runCommand({getShardVersion: coll + "", fullMetadata: true}).metadata;
diff --git a/jstests/sharding/pending_chunk.js b/jstests/sharding/pending_chunk.js
index 14e7c3ebf61..fb8730b6ab4 100644
--- a/jstests/sharding/pending_chunk.js
+++ b/jstests/sharding/pending_chunk.js
@@ -17,6 +17,13 @@
printjson(admin.runCommand({movePrimary: dbName, to: st.shard0.shardName}));
assert.commandWorked(admin.runCommand({shardCollection: ns, key: {_id: 1}}));
+ // Turn off best-effort recipient metadata refresh post-migration commit on both shards because
+ // it would clean up the pending chunks on migration recipients.
+ assert.commandWorked(st.shard0.getDB('admin').runCommand(
+ {configureFailPoint: 'doNotRefreshRecipientAfterCommit', mode: 'alwaysOn'}));
+ assert.commandWorked(st.shard1.getDB('admin').runCommand(
+ {configureFailPoint: 'doNotRefreshRecipientAfterCommit', mode: 'alwaysOn'}));
+
jsTest.log('Moving some chunks to shard1...');
assert.commandWorked(admin.runCommand({split: ns, middle: {_id: 0}}));
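
With the failpoint enabled, a migrated-in range stays visible in the recipient's cached metadata as 'pending', which is what these tests assert on. A rough sketch of inspecting that state, following the getShardVersion pattern used above (assuming 'st' and 'coll' as in cleanup_orphaned_cmd_prereload.js):

    // Read the recipient shard's cached collection metadata, including chunks it
    // has accepted via migration but does not yet officially own.
    var metadata = st.shard1.getDB('admin')
                       .runCommand({getShardVersion: coll + '', fullMetadata: true})
                       .metadata;
    printjson(metadata.pending);  // Ranges still marked as pending on the recipient.
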
diff --git a/src/mongo/db/s/migration_source_manager.cpp b/src/mongo/db/s/migration_source_manager.cpp
index 5ff7cdf49ad..10d134784bb 100644
--- a/src/mongo/db/s/migration_source_manager.cpp
+++ b/src/mongo/db/s/migration_source_manager.cpp
@@ -43,6 +43,8 @@
#include "mongo/db/s/shard_metadata_util.h"
#include "mongo/db/s/sharding_state.h"
#include "mongo/db/s/sharding_state_recovery.h"
+#include "mongo/executor/task_executor.h"
+#include "mongo/executor/task_executor_pool.h"
#include "mongo/s/catalog/sharding_catalog_client.h"
#include "mongo/s/catalog/type_chunk.h"
#include "mongo/s/catalog/type_shard_collection.h"
@@ -50,6 +52,7 @@
#include "mongo/s/client/shard_registry.h"
#include "mongo/s/grid.h"
#include "mongo/s/request_types/commit_chunk_migration_request_type.h"
+#include "mongo/s/set_shard_version_request.h"
#include "mongo/s/shard_key_pattern.h"
#include "mongo/s/stale_exception.h"
#include "mongo/stdx/memory.h"
@@ -75,11 +78,43 @@ const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority,
WriteConcernOptions::SyncMode::UNSET,
Seconds(15));
+/**
+ * Best-effort attempt to ensure the recipient shard has refreshed its routing table to
+ * 'newCollVersion'. Fires and forgets an asynchronous remote setShardVersion command.
+ */
+void refreshRecipientRoutingTable(OperationContext* opCtx,
+ const NamespaceString& nss,
+ ShardId toShard,
+ const HostAndPort& toShardHost,
+ const ChunkVersion& newCollVersion) {
+ SetShardVersionRequest ssv = SetShardVersionRequest::makeForVersioningNoPersist(
+ Grid::get(opCtx)->shardRegistry()->getConfigServerConnectionString(),
+ toShard,
+ ConnectionString(toShardHost),
+ nss,
+ newCollVersion,
+ false);
+
+ const executor::RemoteCommandRequest request(
+ toShardHost,
+ NamespaceString::kAdminDb.toString(),
+ ssv.toBSON(),
+ ReadPreferenceSetting{ReadPreference::PrimaryOnly}.toContainingBSON(),
+ opCtx,
+ executor::RemoteCommandRequest::kNoTimeout);
+
+ executor::TaskExecutor* const executor =
+ Grid::get(opCtx)->getExecutorPool()->getFixedExecutor();
+ executor->scheduleRemoteCommand(
+ request, [](const executor::TaskExecutor::RemoteCommandCallbackArgs& args) {});
+}
+
} // namespace
-MONGO_FP_DECLARE(migrationCommitNetworkError);
+MONGO_FP_DECLARE(doNotRefreshRecipientAfterCommit);
MONGO_FP_DECLARE(failMigrationCommit);
MONGO_FP_DECLARE(hangBeforeLeavingCriticalSection);
+MONGO_FP_DECLARE(migrationCommitNetworkError);
MigrationSourceManager::MigrationSourceManager(OperationContext* opCtx,
MoveChunkRequest request,
@@ -523,6 +558,15 @@ Status MigrationSourceManager::commitChunkMetadataOnConfig(OperationContext* opC
return CollectionShardingState::get(opCtx, getNss())->cleanUpRange(range, whenToClean);
}();
+ if (!MONGO_FAIL_POINT(doNotRefreshRecipientAfterCommit)) {
+ // Best-effort make the recipient refresh its routing table to the new collection version.
+ refreshRecipientRoutingTable(opCtx,
+ getNss(),
+ _args.getToShardId(),
+ _recipientHost,
+ refreshedMetadata->getCollVersion());
+ }
+
if (_args.getWaitForDelete()) {
log() << "Waiting for cleanup of " << getNss().ns() << " range "
<< redact(range.toString());