From fc8aa79ddb1e4910ad227dddbe0ab3603e663fa8 Mon Sep 17 00:00:00 2001 From: Cheahuychou Mao Date: Fri, 18 Dec 2020 20:19:13 +0000 Subject: SERVER-53444 Make tests that run removeShard in assert.soon to wait for the state to become completed not error on ShardNotFound (cherry picked from commit 03637b5614c1a29983cdac9a1f9ab2d3f7060f15) --- jstests/sharding/balancing_sessions_collection.js | 9 +++++++++ jstests/sharding/listshards.js | 9 +++++++++ ...hard_removal_triggers_catalog_cache_invalidation.js | 18 ++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/jstests/sharding/balancing_sessions_collection.js b/jstests/sharding/balancing_sessions_collection.js index 416eaa6186d..8313b1ca80d 100644 --- a/jstests/sharding/balancing_sessions_collection.js +++ b/jstests/sharding/balancing_sessions_collection.js @@ -51,6 +51,15 @@ function removeShardFromCluster(shardName) { assert.commandWorked(st.s.adminCommand({removeShard: shardName})); assert.soon(function() { const res = st.s.adminCommand({removeShard: shardName}); + if (!res.ok && res.code === ErrorCodes.ShardNotFound) { + // If the config server primary steps down right after removing the config.shards doc + // for the shard but before responding with "state": "completed", the mongos would retry + // the _configsvrRemoveShard command against the new config server primary, which would + // not find the removed shard in its ShardRegistry if it has done a ShardRegistry reload + // after the config.shards doc for the shard was removed. This would cause the command + // to fail with ShardNotFound. + return true; + } assert.commandWorked(res); return ("completed" == res.state); }, "failed to remove shard " + shardName, kBalancerTimeoutMS); diff --git a/jstests/sharding/listshards.js b/jstests/sharding/listshards.js index e008ffb6689..0960746fe7c 100644 --- a/jstests/sharding/listshards.js +++ b/jstests/sharding/listshards.js @@ -51,6 +51,15 @@ assert(checkShardName('repl', shardsArray), // remove 'repl' shard assert.soon(function() { var res = shardTest.admin.runCommand({removeShard: 'repl'}); + if (!res.ok && res.code === ErrorCodes.ShardNotFound) { + // If the config server primary steps down right after removing the config.shards doc + // for the shard but before responding with "state": "completed", the mongos would retry + // the _configsvrRemoveShard command against the new config server primary, which would + // not find the removed shard in its ShardRegistry if it has done a ShardRegistry reload + // after the config.shards doc for the shard was removed. This would cause the command + // to fail with ShardNotFound. + return true; + } assert.commandWorked(res, 'removeShard command failed'); return res.state === 'completed'; }, 'failed to remove the replica set shard'); diff --git a/jstests/sharding/shard_removal_triggers_catalog_cache_invalidation.js b/jstests/sharding/shard_removal_triggers_catalog_cache_invalidation.js index 5eaeba8cf0a..aa51130c007 100644 --- a/jstests/sharding/shard_removal_triggers_catalog_cache_invalidation.js +++ b/jstests/sharding/shard_removal_triggers_catalog_cache_invalidation.js @@ -57,12 +57,30 @@ st.startBalancer(); // Remove shard1. assert.soon(() => { const removeRes = assert.commandWorked(st.s0.adminCommand({removeShard: st.shard1.shardName})); + if (!removeRes.ok && removeRes.code === ErrorCodes.ShardNotFound) { + // If the config server primary steps down right after removing the config.shards doc + // for the shard but before responding with "state": "completed", the mongos would retry + // the _configsvrRemoveShard command against the new config server primary, which would + // not find the removed shard in its ShardRegistry if it has done a ShardRegistry reload + // after the config.shards doc for the shard was removed. This would cause the command + // to fail with ShardNotFound. + return true; + } return 'completed' === removeRes.state; }); // Remove shard0. assert.soon(() => { const removeRes = assert.commandWorked(st.s0.adminCommand({removeShard: st.shard0.shardName})); + if (!removeRes.ok && removeRes.code === ErrorCodes.ShardNotFound) { + // If the config server primary steps down right after removing the config.shards doc + // for the shard but before responding with "state": "completed", the mongos would retry + // the _configsvrRemoveShard command against the new config server primary, which would + // not find the removed shard in its ShardRegistry if it has done a ShardRegistry reload + // after the config.shards doc for the shard was removed. This would cause the command + // to fail with ShardNotFound. + return true; + } return 'completed' === removeRes.state; }); -- cgit v1.2.1