diff options
author | Charlie Swanson <charlie.swanson@mongodb.com> | 2020-02-19 14:05:09 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-04-27 14:58:40 +0000 |
commit | f67a13326a90013aeb28c09f82d4ea2bea49c494 (patch) | |
tree | 1797ee436813e4814db23f0de78d314498dc0b0c | |
parent | cc18cf86a9af09110974f2dc1ae5c78505b71aea (diff) | |
download | mongo-f67a13326a90013aeb28c09f82d4ea2bea49c494.tar.gz |
SERVER-45541 Test interrupting $unionWith.
(cherry picked from commit 54488c22e2ce672a8bdbb2dac68941b958e69b5c)
9 files changed, 140 insertions, 11 deletions
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_kill_primary_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_kill_primary_with_balancer.yml index 778501e17ce..ab7878a2753 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_kill_primary_with_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_kill_primary_with_balancer.yml @@ -116,6 +116,7 @@ selector: # Uses getmores. - jstests/concurrency/fsm_workloads/agg_base.js + - jstests/concurrency/fsm_workloads/agg_unionWith_interrupt_cleanup.js - jstests/concurrency/fsm_workloads/create_capped_collection.js - jstests/concurrency/fsm_workloads/create_capped_collection_maxdocs.js - jstests/concurrency/fsm_workloads/create_index_background.js diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_kill_primary.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_kill_primary.yml index bc95236fb7c..311380b8e89 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_kill_primary.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_kill_primary.yml @@ -185,6 +185,7 @@ selector: # Uses getMore. If a kill node happens between the time of creation of cursor (usually by calling # find or aggregate) and calling getMore(), server will throw CursortNotFound exception. # We currently do not retry the transaction on this exception. 
+ - jstests/concurrency/fsm_workloads/agg_unionWith_interrupt_cleanup.js - jstests/concurrency/fsm_workloads/agg_union_with_chunk_migrations.js - jstests/concurrency/fsm_workloads/create_capped_collection.js - jstests/concurrency/fsm_workloads/create_capped_collection_maxdocs.js diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_terminate_primary.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_terminate_primary.yml index 5c229b91bf6..ded60306b90 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_terminate_primary.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_terminate_primary.yml @@ -185,6 +185,7 @@ selector: # Uses getMore. If a terminate happens between the time of creation of cursor (usually by calling # find or aggregate) and calling getMore(), server will throw CursortNotFound exception. # We currently do not retry the transaction on this exception. + - jstests/concurrency/fsm_workloads/agg_unionWith_interrupt_cleanup.js - jstests/concurrency/fsm_workloads/agg_union_with_chunk_migrations.js - jstests/concurrency/fsm_workloads/create_capped_collection.js - jstests/concurrency/fsm_workloads/create_capped_collection_maxdocs.js diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_stepdowns.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_stepdowns.yml index fb2ffb35b1c..c4648343ebb 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_stepdowns.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_stepdowns.yml @@ -166,6 +166,7 @@ selector: ## # Uses getMore in the same state function as a command not supported in a transaction. 
+ - jstests/concurrency/fsm_workloads/agg_unionWith_interrupt_cleanup.js - jstests/concurrency/fsm_workloads/list_indexes.js - jstests/concurrency/fsm_workloads/agg_union_with_chunk_migrations.js diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_terminate_primary_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_terminate_primary_with_balancer.yml index efb809269ee..0267727c9d3 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_terminate_primary_with_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_terminate_primary_with_balancer.yml @@ -116,6 +116,7 @@ selector: # Uses getmores. - jstests/concurrency/fsm_workloads/agg_base.js + - jstests/concurrency/fsm_workloads/agg_unionWith_interrupt_cleanup.js - jstests/concurrency/fsm_workloads/create_capped_collection.js - jstests/concurrency/fsm_workloads/create_capped_collection_maxdocs.js - jstests/concurrency/fsm_workloads/create_index_background.js diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml index cf1a0e9eb75..9c373d43505 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml @@ -105,6 +105,7 @@ selector: # Uses getmores. 
- jstests/concurrency/fsm_workloads/agg_base.js + - jstests/concurrency/fsm_workloads/agg_unionWith_interrupt_cleanup.js - jstests/concurrency/fsm_workloads/create_index_background.js - jstests/concurrency/fsm_workloads/create_index_background_partial_filter.js - jstests/concurrency/fsm_workloads/create_index_background_wildcard.js diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml index 80cc94184fd..9f35ff0d7ee 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml @@ -111,6 +111,7 @@ selector: # Uses getmores. - jstests/concurrency/fsm_workloads/agg_base.js + - jstests/concurrency/fsm_workloads/agg_unionWith_interrupt_cleanup.js - jstests/concurrency/fsm_workloads/create_index_background.js - jstests/concurrency/fsm_workloads/create_index_background_partial_filter.js - jstests/concurrency/fsm_workloads/create_index_background_wildcard.js diff --git a/jstests/concurrency/fsm_workloads/agg_out_interrupt_cleanup.js b/jstests/concurrency/fsm_workloads/agg_out_interrupt_cleanup.js index bc6c9b48515..d6bdebb5d57 100644 --- a/jstests/concurrency/fsm_workloads/agg_out_interrupt_cleanup.js +++ b/jstests/concurrency/fsm_workloads/agg_out_interrupt_cleanup.js @@ -1,6 +1,12 @@ /** * Tests $out stage of aggregate command concurrently with killOp. Ensures that all the temporary - * collections created during aggreate command are deleted. + * collections created during aggregate command are deleted. If extending this workload, consider + * overriding the following: + * - $config.states.aggregate: The function to execute the aggregation. + * - $config.states.killOp: The function to find the aggregation and kill it. Consider reusing + * $config.data.killOpsMatchingFilter to do the deed. 
+ * - $config.teardown: If you want any assertion to make sure nothing got leaked or left behind by + * the interrupted aggregation. * * @tags: [uses_curop_agg_stage] */ @@ -15,11 +21,23 @@ var $config = extendWorkload($config, function($config, $super) { {aggregate: collName, pipeline: [{$out: "interrupt_temp_out"}], cursor: {}}); }; + // This test sets up aggregations just to tear them down. There's no benefit to using large + // documents here, and doing so can increase memory pressure on the test host, so we lower it + // down to 1KB. + $config.data.docSize = 1024; + $config.data.killOpsMatchingFilter = function killOpsMatchingFilter(db, filter) { + const currentOpOutput = + db.getSiblingDB('admin').aggregate([{$currentOp: {}}, {$match: filter}]).toArray(); + for (let op of currentOpOutput) { + assert(op.hasOwnProperty('opid')); + assertAlways.commandWorked(db.getSiblingDB('admin').killOp(op.opid)); + } + }; $config.states.killOp = function killOp(db, collName) { // The aggregate command could be running different commands internally (renameCollection, // insertDocument, etc.) depending on which stage of execution it is in. So, get all the // operations that are running against the input, output or temp collections. - const activeCurOpsFilter = { + this.killOpsMatchingFilter(db, { op: "command", active: true, $or: [ @@ -31,15 +49,7 @@ var $config = extendWorkload($config, function($config, $super) { $exists: false } // Exclude 'drop' command from the filter to make sure that we don't kill the the // drop command which is responsible for dropping the temporary collection. 
- }; - - const currentOpOutput = db.getSiblingDB('admin') - .aggregate([{$currentOp: {}}, {$match: activeCurOpsFilter}]) - .toArray(); - for (let op of currentOpOutput) { - assert(op.hasOwnProperty('opid')); - assertAlways.commandWorked(db.getSiblingDB('admin').killOp(op.opid)); - } + }); }; $config.teardown = function teardown(db, collName, cluster) { diff --git a/jstests/concurrency/fsm_workloads/agg_unionWith_interrupt_cleanup.js b/jstests/concurrency/fsm_workloads/agg_unionWith_interrupt_cleanup.js new file mode 100644 index 00000000000..912e26e80c0 --- /dev/null +++ b/jstests/concurrency/fsm_workloads/agg_unionWith_interrupt_cleanup.js @@ -0,0 +1,112 @@ +/** + * Tests $unionWith stage of aggregate command concurrently with killOp. Ensures that all cursors + * opened on behalf of the $unionWith are killed when interrupted. + * + * @tags: [ + * uses_curop_agg_stage, + * requires_fcv_44, # Uses $unionWith + * ] + */ +'use strict'; +load('jstests/concurrency/fsm_libs/extend_workload.js'); // for extendWorkload +load('jstests/concurrency/fsm_workloads/agg_out_interrupt_cleanup.js'); // for $config + +var $config = extendWorkload($config, function($config, $super) { + $config.data.commentStr = "agg_unionWith_interrupt_cleanup"; + + $config.states.aggregate = function aggregate(db, collName) { + // Here we consistently union with the same namespace to benefit from the sharded collection + // setup that may have been done in sharded passthroughs. + // TODO SERVER-46251 use multiple namespaces. + let response = db[collName].runCommand({ + aggregate: collName, + pipeline: [{$unionWith: {coll: collName, pipeline: [{$unionWith: collName}]}}], + comment: this.commentStr, + // Use a small batch size to ensure these operations open up a cursor and use multiple + // getMores. We want to give coverage to interrupting the getMores as well. + cursor: {batchSize: this.numDocs / 4} + }); + // Keep iterating the cursor until we exhaust it or we are interrupted. 
+ while (response.ok && response.cursor.id != 0) { + response = db[collName].runCommand({getMore: response.cursor.id, collection: collName}); + } + if (!response.ok) { + // If the interrupt happens just as the cursor is being checked back in, the cursor will + // be killed without failing the operation. When this happens, the next getMore will + // fail with CursorNotFound. + assertWhenOwnColl.contains( + response.code, [ErrorCodes.Interrupted, ErrorCodes.CursorNotFound], response); + } + }; + + $config.states.killOp = function killOp(db, collName) { + // The aggregate command could be running different sub-aggregates internally depending on + // which stage of execution it is in. So we rely on the comment to detect which operations + // are eligible to be interrupted, and interrupt those. + this.killOpsMatchingFilter(db, { + $and: [ + {active: true}, + { + $or: [ + {"command.comment": this.commentStr}, + {"cursor.originatingCommand.comment": this.commentStr}, + ] + } + ] + }); + }; + + $config.teardown = function teardown(db, collName, cluster) { + // Ensure that no operations, cursors, or sub-operations are left active. After + // SERVER-46255, we normally expect all operations to be cleaned up safely, but there are + // race conditions or possible network blips where the kill won't arrive as expected. We + // don't want to block the interrupt thread or the operation itself to wait around to make + // sure everything dies correctly, so we just rely on cursor timeouts or session reaps to + // cover these rare cases. Here we make sure everything is cleaned up so we avoid hogging + // resources for future tests. 
+ this.killOpsMatchingFilter(db, { + $and: [ + {active: true}, + { + $or: [ + {"command.comment": this.commentStr}, + {"cursor.originatingCommand.comment": this.commentStr}, + ] + } + ] + }); + const curOpCursor = db.getSiblingDB("admin").aggregate([ + {$currentOp: {idleCursors: true}}, + {$match: {"cursor.originatingCommand.comment": this.commentStr}}, + {$project: {shard: 1, host: 1, "cursor.cursorId": 1}}, + ]); + while (curOpCursor.hasNext()) { + let result = curOpCursor.next(); + assertAlways.commandWorked( + new Mongo(`${result.shard}/${result.host}`).getDB(db.getName()).runCommand({ + killCursors: collName, + cursors: [result.cursor.cursorId] + })); + } + const remainingOps = + db.getSiblingDB("admin") + .aggregate([ + {$currentOp: {idleCursors: true}}, + // Look for any trace of state that wasn't cleaned up. + { + $match: { + $or: [ + // The originating aggregation or a sub-aggregation still active. + {"command.comment": this.commentStr}, + // An idle cursor left around. + {"cursor.originatingCommand.comment": this.commentStr} + ] + } + } + ]) + .toArray(); + assertAlways.eq(remainingOps.length, 0, remainingOps); + }; + + return $config; +}); |