diff options
author | Jordi Serra Torrens <jordi.serra-torrens@mongodb.com> | 2023-02-07 11:40:31 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2023-02-07 13:08:12 +0000 |
commit | 9fb9d210409c00e69ec40179330fee0b28f62aec (patch) | |
tree | fda0e2d43204884d5e948329b6c831d496c95250 | |
parent | 5d7876b19184677959be260754691d5bfeefc7c7 (diff) | |
download | mongo-9fb9d210409c00e69ec40179330fee0b28f62aec.tar.gz |
SERVER-43099 Reenable random chunk migration failpoint for concurrency with_balancer suites
17 files changed, 217 insertions, 33 deletions
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency_and_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency_and_balancer.yml index 21d3a3e68c6..532d73e87a7 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency_and_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_causal_consistency_and_balancer.yml @@ -116,6 +116,7 @@ executor: global_vars: TestData: runningWithCausalConsistency: true + runningWithBalancer: true hooks: - class: CheckReplDBHashInBackground - class: CheckReplDBHash diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_kill_primary_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_kill_primary_with_balancer.yml index 9270478e0d2..34e315a92fc 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_kill_primary_with_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_kill_primary_with_balancer.yml @@ -190,6 +190,7 @@ executor: runningWithConfigStepdowns: true runningWithShardStepdowns: true useActionPermittedFile: true + runningWithBalancer: true hooks: - class: ContinuousStepdown config_stepdown: true diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_local_read_write_multi_stmt_txn_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_local_read_write_multi_stmt_txn_with_balancer.yml index 99651926077..90e6c034c70 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_local_read_write_multi_stmt_txn_with_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_local_read_write_multi_stmt_txn_with_balancer.yml @@ -163,6 +163,7 @@ executor: runningWithCausalConsistency: false runningWithSessions: true traceExceptions: false + runningWithBalancer: true hooks: - class: CheckReplDBHashInBackground - class: CheckReplDBHash diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml index e465ad1f603..b8046559efb 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer.yml @@ -158,6 +158,7 @@ executor: runInsideTransaction: true runningWithSessions: true traceExceptions: false + runningWithBalancer: true hooks: - class: CheckReplDBHashInBackground - class: CheckReplDBHash diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer.yml index 230b55fc226..5bedc0c4521 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer.yml @@ -108,7 +108,11 @@ executor: - CheckReplDBHash - ValidateCollections tests: true - config: {} + config: + shell_options: + global_vars: + TestData: + runningWithBalancer: true hooks: - class: CheckReplDBHashInBackground - class: CheckReplDBHash diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_terminate_primary_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_terminate_primary_with_balancer.yml index c1126912a9c..5d65b19f0e7 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_terminate_primary_with_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_terminate_primary_with_balancer.yml @@ -190,6 +190,7 @@ executor: runningWithConfigStepdowns: true runningWithShardStepdowns: true useActionPermittedFile: true + runningWithBalancer: true hooks: - class: ContinuousStepdown config_stepdown: true diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml index 9a17908f53a..b768f44a966 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml @@ -183,6 +183,7 @@ executor: runningWithConfigStepdowns: true runningWithShardStepdowns: true useActionPermittedFile: true + runningWithBalancer: true hooks: - class: ContinuousStepdown config_stepdown: true diff --git a/jstests/concurrency/fsm_libs/cluster.js b/jstests/concurrency/fsm_libs/cluster.js index 98c62c6648e..fe38d2e6078 100644 --- a/jstests/concurrency/fsm_libs/cluster.js +++ b/jstests/concurrency/fsm_libs/cluster.js @@ -231,23 +231,21 @@ var Cluster = function(options) { replSets.push(rs); } - // SERVER-43099 Reenable random chunk migration failpoint for concurrency with_balancer - // suites - // if (options.sharded.enableBalancer === true) { - // st._configServers.forEach((conn) => { - // const configDb = conn.getDB('admin'); - - // configDb.adminCommand({ - // configureFailPoint: 'balancerShouldReturnRandomMigrations', - // mode: 'alwaysOn' - // }); - // configDb.adminCommand({ - // configureFailPoint: 'overrideBalanceRoundInterval', - // mode: 'alwaysOn', - // data: {intervalMs: 100} - // }); - // }); - // } + if (options.sharded.enableBalancer === true) { + st._configServers.forEach((conn) => { + const configDb = conn.getDB('admin'); + + configDb.adminCommand({ + configureFailPoint: 'balancerShouldReturnRandomMigrations', + mode: 'alwaysOn' + }); + configDb.adminCommand({ + configureFailPoint: 'overrideBalanceRoundInterval', + mode: 'alwaysOn', + data: {intervalMs: 100} + }); + }); + } } else if (options.replication.enabled) { rst = new ReplSetTest(db.getMongo().host); diff --git a/jstests/concurrency/fsm_workload_helpers/balancer.js b/jstests/concurrency/fsm_workload_helpers/balancer.js new file mode 100644 index 00000000000..bb7e023e594 --- /dev/null +++ b/jstests/concurrency/fsm_workload_helpers/balancer.js @@ -0,0 +1,53 @@ +'use strict'; + +/** + * Provides helpers for configuring the balancer. + * + * Intended for use by workloads testing sharding (i.e., workloads starting with 'sharded_'). + */ + +var BalancerHelper = (function() { + // Disables balancing for a given collection. + function disableBalancerForCollection(db, ns) { + assertAlways.commandWorked( + db.getSiblingDB('config').collections.update({_id: ns}, {$set: {"noBalance": true}})); + } + + // Enables balancing for a given collection. + function enableBalancerForCollection(db, ns) { + assertAlways.commandWorked( + db.getSiblingDB('config').collections.update({_id: ns}, {$unset: {"noBalance": 1}})); + } + + // Joins the ongoing balancer round (if enabled at all). + function joinBalancerRound(db, timeout) { + timeout = timeout || 60000; + + var initialStatus = db.adminCommand({balancerStatus: 1}); + var currentStatus; + assert.soon(function() { + currentStatus = db.adminCommand({balancerStatus: 1}); + if (currentStatus.mode === 'off') { + // Balancer is disabled. + return true; + } + if (!friendlyEqual(currentStatus.term, initialStatus.term)) { + // A new primary of the csrs has been elected + initialStatus = currentStatus; + return false; + } + assert.gte( + currentStatus.numBalancerRounds, + initialStatus.numBalancerRounds, + 'Number of balancer rounds moved back in time unexpectedly. Current status: ' + + tojson(currentStatus) + ', initial status: ' + tojson(initialStatus)); + return currentStatus.numBalancerRounds > initialStatus.numBalancerRounds; + }, 'Latest balancer status: ' + tojson(currentStatus), timeout); + } + + return { + disableBalancerForCollection: disableBalancerForCollection, + enableBalancerForCollection: enableBalancerForCollection, + joinBalancerRound: joinBalancerRound, + }; +})(); diff --git a/jstests/concurrency/fsm_workloads/cleanupOrphanedWhileMigrating.js b/jstests/concurrency/fsm_workloads/cleanupOrphanedWhileMigrating.js index ec673ba935d..c6c287082f8 100644 --- a/jstests/concurrency/fsm_workloads/cleanupOrphanedWhileMigrating.js +++ b/jstests/concurrency/fsm_workloads/cleanupOrphanedWhileMigrating.js @@ -8,6 +8,7 @@ load('jstests/concurrency/fsm_libs/extend_workload.js'); load('jstests/concurrency/fsm_workloads/sharded_base_partitioned.js'); +load('jstests/concurrency/fsm_workload_helpers/balancer.js'); var $config = extendWorkload($config, function($config, $super) { $config.threadCount = 5; @@ -33,11 +34,17 @@ var $config = extendWorkload($config, function($config, $super) { const shard = connCache.shards[shardNames[randomIndex]]; const shardPrimary = ChunkHelper.getPrimary(shard); + // Disable balancing so that waiting for orphan cleanup can converge quickly. + BalancerHelper.disableBalancerForCollection(db, ns); + // Ensure the cleanup of all chunk orphans of the primary shard assert.soonNoExcept(() => { assert.commandWorked(shardPrimary.adminCommand({cleanupOrphaned: ns})); return true; }, undefined, 10 * 1000, 100); + + // Reenable balancing. + BalancerHelper.enableBalancerForCollection(db, ns); }; // Verify that counts are stable. @@ -62,6 +69,10 @@ var $config = extendWorkload($config, function($config, $super) { $config.setup = function setup(db, collName, cluster) { const ns = db[collName].getFullName(); + // Disallow balancing 'ns' during $setup so it does not interfere with the splits. + BalancerHelper.disableBalancerForCollection(db, ns); + BalancerHelper.joinBalancerRound(db); + for (let chunkIndex = 0; chunkIndex < numChunks; chunkIndex++) { let bulk = db[collName].initializeUnorderedBulkOp(); @@ -75,14 +86,12 @@ var $config = extendWorkload($config, function($config, $super) { assertAlways.commandWorked(bulk.execute()); if (chunkIndex > 0) { - // Need to retry split command to avoid conflicting with moveChunks issued by the - // balancer. - assert.soonNoExcept(() => { - assert.commandWorked(db.adminCommand({split: ns, middle: {skey: splitKey}})); - return true; - }, undefined, 10 * 1000, 100); + assert.commandWorked(db.adminCommand({split: ns, middle: {skey: splitKey}})); } } + + // Allow balancing 'ns' again. + BalancerHelper.enableBalancerForCollection(db, ns); }; $config.transitions = { diff --git a/jstests/concurrency/fsm_workloads/collection_defragmentation.js b/jstests/concurrency/fsm_workloads/collection_defragmentation.js index ce39d730255..befdda902c4 100644 --- a/jstests/concurrency/fsm_workloads/collection_defragmentation.js +++ b/jstests/concurrency/fsm_workloads/collection_defragmentation.js @@ -193,8 +193,26 @@ var $config = (function() { }; let defaultChunkDefragmentationThrottlingMS; + let defaultBalancerShouldReturnRandomMigrations; function setup(db, collName, cluster) { + cluster.executeOnConfigNodes((db) => { + defaultBalancerShouldReturnRandomMigrations = + assert + .commandWorked(db.adminCommand({ + getParameter: 1, + 'failpoint.balancerShouldReturnRandomMigrations': 1 + }))['failpoint.balancerShouldReturnRandomMigrations'] + .mode; + + // If the failpoint is enabled on this suite, disable it because this test relies on the + // balancer taking correct decisions. + if (defaultBalancerShouldReturnRandomMigrations === 1) { + assert.commandWorked(db.adminCommand( + {configureFailPoint: 'balancerShouldReturnRandomMigrations', mode: 'off'})); + } + }); + const mongos = cluster.getDB('config').getMongo(); // Create all fragmented collections for (let i = 0; i < dbCount; i++) { @@ -228,7 +246,13 @@ var $config = (function() { function teardown(db, collName, cluster) { const mongos = cluster.getDB('config').getMongo(); + let defaultOverrideBalanceRoundInterval; cluster.executeOnConfigNodes((db) => { + defaultOverrideBalanceRoundInterval = assert.commandWorked(db.adminCommand({ + getParameter: 1, + 'failpoint.overrideBalanceRoundInterval': 1 + }))['failpoint.overrideBalanceRoundInterval']; + assert.commandWorked(db.adminCommand({ configureFailPoint: 'overrideBalanceRoundInterval', mode: 'alwaysOn', @@ -267,14 +291,33 @@ var $config = (function() { } cluster.executeOnConfigNodes((db) => { - assert.commandWorked( - db.adminCommand({configureFailPoint: 'overrideBalanceRoundInterval', mode: 'off'})); + // Reset the failpoint to its original value. + if (defaultBalancerShouldReturnRandomMigrations === 1) { + defaultBalancerShouldReturnRandomMigrations = + assert + .commandWorked(db.adminCommand({ + configureFailPoint: 'balancerShouldReturnRandomMigrations', + mode: 'alwaysOn' + })) + .was; + } + + if (defaultOverrideBalanceRoundInterval.mode === 0) { + assert.commandWorked(db.adminCommand( + {configureFailPoint: 'overrideBalanceRoundInterval', mode: 'off'})); + } else if (defaultOverrideBalanceRoundInterval.mode === 1) { + assert.commandWorked(db.adminCommand({ + configureFailPoint: 'overrideBalanceRoundInterval', + mode: 'alwaysOn', + data: {intervalMs: defaultOverrideBalanceRoundInterval.data.intervalMs} + })); + } }); } return { threadCount: 5, - iterations: 10, + iterations: 1, states: states, transitions: transitions, setup: setup, diff --git a/jstests/concurrency/fsm_workloads/collection_uuid.js b/jstests/concurrency/fsm_workloads/collection_uuid.js index e6a334711f4..0151d65d1d4 100644 --- a/jstests/concurrency/fsm_workloads/collection_uuid.js +++ b/jstests/concurrency/fsm_workloads/collection_uuid.js @@ -184,7 +184,8 @@ var $config = (function() { const namespace = db.getName() + "." + collName; // Find - const findCmd = {find: namespace, collectionUUID: this.collUUID}; + // Use 'singleBatch: true' to avoid leaving open cursors. + const findCmd = {find: namespace, collectionUUID: this.collUUID, singleBatch: true}; testCommand(db, namespace, "find", findCmd, this); // Update diff --git a/jstests/concurrency/fsm_workloads/indexed_insert_ttl.js b/jstests/concurrency/fsm_workloads/indexed_insert_ttl.js index b0d999076e7..12a7529e110 100644 --- a/jstests/concurrency/fsm_workloads/indexed_insert_ttl.js +++ b/jstests/concurrency/fsm_workloads/indexed_insert_ttl.js @@ -9,6 +9,9 @@ * doc inserted by each thread is no longer in the collection. * @tags: [uses_ttl] */ + +load('jstests/concurrency/fsm_workload_helpers/balancer.js'); + var $config = (function() { var states = { init: function init(db, collName) { @@ -33,6 +36,14 @@ var $config = (function() { } function teardown(db, collName, cluster) { + if (TestData.runningWithBalancer) { + // Disallow balancing 'ns' so that it does not cause the TTLMonitor to fail rounds due + // to ongoing migration critical sections. TTLMonitor will retry on the next round, but + // it might not converge in time for the following assertion to pass. + BalancerHelper.disableBalancerForCollection(db, db[collName].getFullName()); + BalancerHelper.joinBalancerRound(db); + } + // By default, the TTL monitor thread runs every 60 seconds. var defaultTTLSecs = 60; diff --git a/jstests/concurrency/fsm_workloads/insert_ttl_timeseries.js b/jstests/concurrency/fsm_workloads/insert_ttl_timeseries.js index eadbb937cc9..891df4042c3 100644 --- a/jstests/concurrency/fsm_workloads/insert_ttl_timeseries.js +++ b/jstests/concurrency/fsm_workloads/insert_ttl_timeseries.js @@ -13,6 +13,8 @@ * ] */ +load('jstests/concurrency/fsm_workload_helpers/balancer.js'); + var $config = (function() { const initData = { getCollectionName: function(collName) { @@ -156,6 +158,14 @@ var $config = (function() { } function teardown(db, collName, cluster) { + if (TestData.runningWithBalancer) { + // Disallow balancing 'ns' so that it does not cause the TTLMonitor to fail rounds due + // to ongoing migration critical sections. TTLMonitor will retry on the next round, but + // it might not converge in time for the following assertion to pass. + BalancerHelper.disableBalancerForCollection(db, db[collName].getFullName()); + BalancerHelper.joinBalancerRound(db); + } + // Default TTL monitor period const ttlMonitorSleepSecs = 60; diff --git a/jstests/concurrency/fsm_workloads/insert_with_data_size_aware_balancing.js b/jstests/concurrency/fsm_workloads/insert_with_data_size_aware_balancing.js index 230d14b7e10..c97a2f9530b 100644 --- a/jstests/concurrency/fsm_workloads/insert_with_data_size_aware_balancing.js +++ b/jstests/concurrency/fsm_workloads/insert_with_data_size_aware_balancing.js @@ -53,10 +53,29 @@ var $config = (function() { }, }; + let defaultBalancerShouldReturnRandomMigrations; + /* * Create sharded collections with random maxChunkSizeMB (betwen 1MB and 10MB) */ let setup = function(db, collName, cluster) { + cluster.executeOnConfigNodes((db) => { + defaultBalancerShouldReturnRandomMigrations = + assert + .commandWorked(db.adminCommand({ + getParameter: 1, + 'failpoint.balancerShouldReturnRandomMigrations': 1 + }))['failpoint.balancerShouldReturnRandomMigrations'] + .mode; + + // If the failpoint is enabled on this suite, disable it because this test relies on the + // balancer taking correct decisions. + if (defaultBalancerShouldReturnRandomMigrations === 1) { + assert.commandWorked(db.adminCommand( + {configureFailPoint: 'balancerShouldReturnRandomMigrations', mode: 'off'})); + } + }); + const mongos = cluster.getDB('config').getMongo(); const shardNames = Object.keys(cluster.getSerializedCluster().shards); const numShards = shardNames.length; @@ -113,6 +132,19 @@ var $config = (function() { assert(testedAtLeastOneCollection); } + + cluster.executeOnConfigNodes((db) => { + // Reset the failpoint to its original value. + if (defaultBalancerShouldReturnRandomMigrations === 1) { + defaultBalancerShouldReturnRandomMigrations = + assert + .commandWorked(db.adminCommand({ + configureFailPoint: 'balancerShouldReturnRandomMigrations', + mode: 'alwaysOn' + })) + .was; + } + }); }; let transitions = {insert: {insert: 1.0}}; diff --git a/jstests/concurrency/fsm_workloads/internal_transactions_sharded.js b/jstests/concurrency/fsm_workloads/internal_transactions_sharded.js index 0e06f84b798..f55fa716950 100644 --- a/jstests/concurrency/fsm_workloads/internal_transactions_sharded.js +++ b/jstests/concurrency/fsm_workloads/internal_transactions_sharded.js @@ -17,6 +17,7 @@ load('jstests/concurrency/fsm_libs/extend_workload.js'); load('jstests/concurrency/fsm_workloads/random_moveChunk_base.js'); load('jstests/concurrency/fsm_workloads/internal_transactions_unsharded.js'); +load('jstests/concurrency/fsm_workload_helpers/balancer.js'); load('jstests/libs/fail_point_util.js'); var $config = extendWorkload($config, function($config, $super) { @@ -64,6 +65,10 @@ var $config = extendWorkload($config, function($config, $super) { $config.setup = function setup(db, collName, cluster) { const ns = db.getName() + "." + collName; + // Disallow balancing 'ns' during $setup so it does not interfere with the splits. + BalancerHelper.disableBalancerForCollection(db, ns); + BalancerHelper.joinBalancerRound(db); + // Move the initial chunk to shard0. const shards = Object.keys(cluster.getSerializedCluster().shards); ChunkHelper.moveChunk( @@ -82,6 +87,7 @@ var $config = extendWorkload($config, function($config, $super) { assert.commandWorked(db.adminCommand( {split: ns, middle: {[this.defaultShardKeyField]: partition.lower}})); } + assert.commandWorked( db.adminCommand({split: ns, middle: {[this.defaultShardKeyField]: partition.mid}})); @@ -108,6 +114,9 @@ var $config = extendWorkload($config, function($config, $super) { } } + // Allow balancing 'ns' again. + BalancerHelper.enableBalancerForCollection(db, ns); + this.overrideInternalTransactionsReapThreshold(cluster); if (this.lowerTransactionLifetimeLimitSeconds) { this.overrideTransactionLifetimeLimit(cluster); diff --git a/jstests/concurrency/fsm_workloads/rename_sharded_collection.js b/jstests/concurrency/fsm_workloads/rename_sharded_collection.js index 6a95ad8235a..7b919761b83 100644 --- a/jstests/concurrency/fsm_workloads/rename_sharded_collection.js +++ b/jstests/concurrency/fsm_workloads/rename_sharded_collection.js @@ -17,9 +17,14 @@ * # This test just performs rename operations that can't be executed in transactions * does_not_support_transactions, * # Can be removed once PM-1965-Milestone-1 is completed. + * + * # TODO SERVER-73385 reenable when fixed. + * assumes_balancer_off, * ] */ +load('jstests/concurrency/fsm_workload_helpers/balancer.js'); + const numChunks = 20; const documentsPerChunk = 5; const dbNames = ['db0', 'db1']; @@ -34,17 +39,17 @@ function initAndFillShardedCollection(db, collName, shardNames) { const ns = coll.getFullName(); db.adminCommand({shardCollection: ns, key: {x: 1}}); + // Disallow balancing 'ns' during $setup so it does not interfere with the splits. + BalancerHelper.disableBalancerForCollection(db, ns); + BalancerHelper.joinBalancerRound(db); + var nextShardKeyValue = 0; for (var i = 0; i < numChunks; i++) { for (var j = 0; j < documentsPerChunk; j++) { coll.insert({x: nextShardKeyValue++}); } - // Need to retry split command to avoid conflicting with moveChunks issued by the balancer. - assert.soonNoExcept(() => { - assert.commandWorked(db.adminCommand({split: ns, middle: {x: nextShardKeyValue}})); - return true; - }); + assert.commandWorked(db.adminCommand({split: ns, middle: {x: nextShardKeyValue}})); const lastInsertedShardKeyValue = nextShardKeyValue - 1; @@ -57,6 +62,9 @@ function initAndFillShardedCollection(db, collName, shardNames) { }); assert.commandWorkedOrFailedWithCode(res, ErrorCodes.ConflictingOperationInProgress); } + + // Allow balancing 'ns' again. + BalancerHelper.enableBalancerForCollection(db, ns); } /* |