author     Jonathan Abrahams <jonathan@mongodb.com>   2018-05-01 14:49:15 -0400
committer  Robert Guo <robert.guo@10gen.com>          2018-06-04 17:31:44 -0400
commit     bf72dbc9922412b01c0e4d2f485338aa7ae9b76c (patch)
tree       da561311ca9ed7b1af4add111311ae210fa367a0
parent     f65f3f4d283b7789181e28d98b24972189b6c6cc (diff)
download   mongo-bf72dbc9922412b01c0e4d2f485338aa7ae9b76c.tar.gz
SERVER-34555 Add stepdown to FSM resmoke integration
(cherry picked from commit 2b10b06044dfaaf5b9c37f4379521f14e9bdb0e5)
-rw-r--r--  buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml              | 223
-rw-r--r--  buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml | 223
-rw-r--r--  buildscripts/resmokelib/testing/hooks/stepdown.py                                      | 162
-rw-r--r--  buildscripts/resmokelib/testing/testcases/fsm_workload_test.py                        |  17
-rw-r--r--  buildscripts/resmokelib/utils/__init__.py                                             |   9
-rw-r--r--  etc/evergreen.yml                                                                     |   2
-rw-r--r--  jstests/concurrency/fsm_libs/cluster.js                                               |  23
-rw-r--r--  jstests/concurrency/fsm_libs/resmoke_runner.js                                        |  67
-rw-r--r--  jstests/concurrency/fsm_libs/runner.js                                                |  14
9 files changed, 689 insertions, 51 deletions
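
Before the per-file hunks, a short orientation: the patch coordinates the Python ContinuousStepdown hook (stepdown.py) and the JS FSM runner (resmoke_runner.js) through two marker files built from the fixture's dbpath prefix. The sketch below is a simplified illustration of that handshake, not code from the patch; the /tmp paths and the is_stopped / step_down_all_primaries callables are placeholders.

import os
import time

# Illustrative stand-ins; the real names are
# <dbpath_prefix>/concurrency_sharded_stepdown_stepdown_permitted and
# <dbpath_prefix>/concurrency_sharded_stepdown_stepping_down.
PERMITTED_FILE = "/tmp/concurrency_sharded_stepdown_stepdown_permitted"
STEPPING_DOWN_FILE = "/tmp/concurrency_sharded_stepdown_stepping_down"


def hook_wait_for_permission(is_stopped):
    """Stepdown-thread side: busy-wait until the shell has permitted stepdowns."""
    while not os.path.isfile(PERMITTED_FILE) and not is_stopped():
        time.sleep(0.1)


def hook_step_down_round(step_down_all_primaries):
    """Mark a stepdown round as in progress while primaries are stepped down."""
    open(STEPPING_DOWN_FILE, "w").close()
    try:
        step_down_all_primaries()
    finally:
        os.remove(STEPPING_DOWN_FILE)


def shell_revoke_permission_and_wait(timeout_secs=60):
    """Shell side: revoke permission, then wait for any in-flight round to finish."""
    os.remove(PERMITTED_FILE)
    deadline = time.time() + timeout_secs
    while os.path.exists(STEPPING_DOWN_FILE):
        if time.time() > deadline:
            raise AssertionError("stepdown still in progress")
        time.sleep(0.1)
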
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml
index b312f8d68a0..b169dc169a1 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml
@@ -1,14 +1,229 @@
-test_kind: js_test
+test_kind: fsm_workload_test
selector:
roots:
- - jstests/concurrency/fsm_all_sharded_with_stepdowns.js
+ - jstests/concurrency/fsm_workloads/**/*.js
+ exclude_files:
+ # SERVER-13116 distinct isn't sharding aware
+ - jstests/concurrency/fsm_workloads/distinct.js
+ - jstests/concurrency/fsm_workloads/distinct_noindex.js
+ - jstests/concurrency/fsm_workloads/distinct_projection.js
+
+ # SERVER-17397 Drops of sharded namespaces may not fully succeed
+ - jstests/concurrency/fsm_workloads/create_database.js
+ - jstests/concurrency/fsm_workloads/drop_database.js
+
+ # Disabled due to SERVER-33753, '.count() without a predicate can be wrong on sharded
+ # collections'. This bug is problematic for these workloads because they assert on count()
+ # values:
+ - jstests/concurrency/fsm_workloads/agg_match.js
+
+ # $lookup and $graphLookup are not supported on sharded collections.
+ - jstests/concurrency/fsm_workloads/agg_graph_lookup.js
+ - jstests/concurrency/fsm_workloads/view_catalog_cycle_lookup.js
+
+ # Disabled due to SERVER-20057, 'Concurrent, sharded mapReduces can fail when temporary
+ # namespaces collide across mongos processes'
+ - jstests/concurrency/fsm_workloads/map_reduce_drop.js
+ - jstests/concurrency/fsm_workloads/map_reduce_inline.js
+ - jstests/concurrency/fsm_workloads/map_reduce_merge.js
+ - jstests/concurrency/fsm_workloads/map_reduce_merge_nonatomic.js
+ - jstests/concurrency/fsm_workloads/map_reduce_reduce.js
+ - jstests/concurrency/fsm_workloads/map_reduce_reduce_nonatomic.js
+ - jstests/concurrency/fsm_workloads/map_reduce_replace.js
+ - jstests/concurrency/fsm_workloads/map_reduce_replace_nonexistent.js
+ - jstests/concurrency/fsm_workloads/map_reduce_replace_remove.js
+
+ # Disabled due to MongoDB restrictions and/or workload restrictions
+
+ # These workloads sometimes trigger 'Could not lock auth data update lock'
+ # errors because the AuthorizationManager currently waits for only five
+ # seconds to acquire the lock for authorization documents
+ - jstests/concurrency/fsm_workloads/auth_create_role.js
+ - jstests/concurrency/fsm_workloads/auth_create_user.js
+ - jstests/concurrency/fsm_workloads/auth_drop_role.js
+ - jstests/concurrency/fsm_workloads/auth_drop_user.js
+
+ # uses >100MB of data, which can overwhelm test hosts
+ - jstests/concurrency/fsm_workloads/agg_group_external.js
+ - jstests/concurrency/fsm_workloads/agg_sort_external.js
+
+ # compact can only be run against a standalone mongod
+ - jstests/concurrency/fsm_workloads/compact.js
+ - jstests/concurrency/fsm_workloads/compact_simultaneous_padding_bytes.js
+
+ # convertToCapped can't be run on mongos processes
+ - jstests/concurrency/fsm_workloads/convert_to_capped_collection.js
+ - jstests/concurrency/fsm_workloads/convert_to_capped_collection_index.js
+
+ # findAndModify requires a shard key
+ - jstests/concurrency/fsm_workloads/findAndModify_mixed_queue_unindexed.js
+ - jstests/concurrency/fsm_workloads/findAndModify_remove_queue_unindexed.js
+ - jstests/concurrency/fsm_workloads/findAndModify_update_collscan.js
+ - jstests/concurrency/fsm_workloads/findAndModify_update_queue.js
+ - jstests/concurrency/fsm_workloads/findAndModify_update_queue_unindexed.js
+
+ # remove cannot be {} for findAndModify
+ - jstests/concurrency/fsm_workloads/findAndModify_remove_queue.js
+
+ # can cause OOM kills on test hosts
+ - jstests/concurrency/fsm_workloads/findAndModify_update_grow.js
+
+ # eval doesn't work with sharded collections
+ - jstests/concurrency/fsm_workloads/indexed_insert_eval.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_eval_nolock.js
+ - jstests/concurrency/fsm_workloads/remove_single_document_eval.js
+ - jstests/concurrency/fsm_workloads/remove_single_document_eval_nolock.js
+ - jstests/concurrency/fsm_workloads/update_simple_eval.js
+ - jstests/concurrency/fsm_workloads/update_simple_eval_nolock.js
+
+ # cannot ensureIndex after dropDatabase without sharding first
+ - jstests/concurrency/fsm_workloads/plan_cache_drop_database.js
+
+ # our .remove(query, {justOne: true}) calls lack shard keys
+ - jstests/concurrency/fsm_workloads/remove_single_document.js
+
+ # The rename_* workloads are disabled since renameCollection doesn't work with sharded
+ # collections
+ - jstests/concurrency/fsm_workloads/rename_capped_collection_chain.js
+ - jstests/concurrency/fsm_workloads/rename_capped_collection_dbname_chain.js
+ - jstests/concurrency/fsm_workloads/rename_capped_collection_dbname_droptarget.js
+ - jstests/concurrency/fsm_workloads/rename_capped_collection_droptarget.js
+ - jstests/concurrency/fsm_workloads/rename_collection_chain.js
+ - jstests/concurrency/fsm_workloads/rename_collection_dbname_chain.js
+ - jstests/concurrency/fsm_workloads/rename_collection_dbname_droptarget.js
+ - jstests/concurrency/fsm_workloads/rename_collection_droptarget.js
+
+ # our update queries lack shard keys
+ - jstests/concurrency/fsm_workloads/update_upsert_multi.js
+ - jstests/concurrency/fsm_workloads/update_upsert_multi_noindex.js
+
+ # cannot use upsert command with $where with sharded collections
+ - jstests/concurrency/fsm_workloads/upsert_where.js
+
+ # stagedebug can only be run against a standalone mongod
+ - jstests/concurrency/fsm_workloads/yield_and_hashed.js
+ - jstests/concurrency/fsm_workloads/yield_and_sorted.js
+
+ # ChunkHelper directly talks to the config servers and doesn't support retries for network errors
+ - jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
+ - jstests/concurrency/fsm_workloads/sharded_mergeChunks_partitioned.js
+ - jstests/concurrency/fsm_workloads/sharded_moveChunk_drop_shard_key_index.js
+ - jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
+ - jstests/concurrency/fsm_workloads/sharded_splitChunk_partitioned.js
+
+ # These workloads frequently time out waiting for the distributed lock to drop a sharded
+ # collection.
+ - jstests/concurrency/fsm_workloads/kill_aggregation.js
+ - jstests/concurrency/fsm_workloads/kill_rooted_or.js
+ - jstests/concurrency/fsm_workloads/view_catalog_cycle_with_drop.js
+ - jstests/concurrency/fsm_workloads/view_catalog.js
+
+ # Uses getmores.
+ - jstests/concurrency/fsm_workloads/agg_base.js
+ - jstests/concurrency/fsm_workloads/create_index_background.js
+ - jstests/concurrency/fsm_workloads/globally_managed_cursors.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_ordered_bulk.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_text.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_unordered_bulk.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_upsert.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_where.js
+ - jstests/concurrency/fsm_workloads/list_indexes.js
+ - jstests/concurrency/fsm_workloads/reindex.js
+ - jstests/concurrency/fsm_workloads/reindex_background.js
+ - jstests/concurrency/fsm_workloads/remove_multiple_documents.js
+ - jstests/concurrency/fsm_workloads/remove_where.js
+ - jstests/concurrency/fsm_workloads/touch_base.js
+ - jstests/concurrency/fsm_workloads/touch_data.js
+ - jstests/concurrency/fsm_workloads/touch_index.js
+ - jstests/concurrency/fsm_workloads/touch_no_data_no_index.js
+ - jstests/concurrency/fsm_workloads/update_where.js
+ - jstests/concurrency/fsm_workloads/yield.js
+ - jstests/concurrency/fsm_workloads/yield_fetch.js
+ - jstests/concurrency/fsm_workloads/yield_rooted_or.js
+ - jstests/concurrency/fsm_workloads/yield_sort.js
+ - jstests/concurrency/fsm_workloads/yield_sort_merge.js
+ - jstests/concurrency/fsm_workloads/yield_text.js
+
+ # Uses non retryable writes.
+ - jstests/concurrency/fsm_workloads/remove_and_bulk_insert.js
+ - jstests/concurrency/fsm_workloads/update_and_bulk_insert.js
+ - jstests/concurrency/fsm_workloads/update_check_index.js
+ - jstests/concurrency/fsm_workloads/update_multifield_multiupdate.js
+ - jstests/concurrency/fsm_workloads/update_multifield_multiupdate_noindex.js
+ - jstests/concurrency/fsm_workloads/update_ordered_bulk_inc.js
+ - jstests/concurrency/fsm_workloads/yield_geo_near.js
+ - jstests/concurrency/fsm_workloads/yield_geo_near_dedup.js
+ - jstests/concurrency/fsm_workloads/yield_id_hack.js
+
+ # Uses non retryable commands.
+ - jstests/concurrency/fsm_workloads/agg_out.js
+ - jstests/concurrency/fsm_workloads/agg_sort.js
+ - jstests/concurrency/fsm_workloads/collmod.js
+ - jstests/concurrency/fsm_workloads/collmod_separate_collections.js
+ - jstests/concurrency/fsm_workloads/kill_multicollection_aggregation.js
+ - jstests/concurrency/fsm_workloads/invalidated_cursors.js
+
+ # The auto_retry_on_network_error.js override needs to overwrite the response from drop on
+ # NamespaceNotFound, and since this workload only creates and drops collections there isn't
+ # much value in running it.
+ - jstests/concurrency/fsm_workloads/drop_collection.js
-# Concurrency tests that run against a sharded cluster start one themselves.
executor:
archive:
+ hooks:
+ - CheckReplDBHash
+ - ValidateCollections
tests: true
config:
shell_options:
- nodb: ''
readMode: commands
+ global_vars:
+ TestData:
+ runningWithConfigStepdowns: true
+ runningWithShardStepdowns: true
+ runningWithAutoSplit: false
+ runningWithBalancer: false
+ useStepdownPermittedFile: true
+ useSteppingDownFile: true
+ usingReplicaSetShards: true
+ hooks:
+ - class: ContinuousStepdown
+ config_stepdown: true
+ shard_stepdown: true
+ use_stepdown_permitted_file: true
+ use_stepping_down_file: true
+ wait_for_mongos_retarget: true
+ - class: CheckReplDBHash
+ - class: ValidateCollections
+ - class: CleanupConcurrencyWorkloads
+ fixture:
+ class: ShardedClusterFixture
+ enable_balancer: false
+ enable_autosplit: false
+ mongos_options:
+ set_parameters:
+ enableTestCommands: 1
+ configsvr_options:
+ num_nodes: 3
+ all_nodes_electable: true
+ replset_config_options:
+ settings:
+ catchUpTimeoutMillis: 0
+ electionTimeoutMillis: 5000
+ voting_secondaries: true
+ shard_options:
+ all_nodes_electable: true
+ mongod_options:
+ oplogSize: 1024
+ replset_config_options:
+ settings:
+ catchUpTimeoutMillis: 0
+ electionTimeoutMillis: 5000
+ voting_secondaries: true
+ mongod_options:
+ set_parameters:
+ enableTestCommands: 1
+ num_rs_nodes_per_shard: 3
+ num_shards: 2
+ num_mongos: 2
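
For local runs, the new suite would presumably be invoked the same way the Evergreen task further down does (see the resmoke_args in the etc/evergreen.yml hunk). A minimal wrapper, assuming the working directory is the mongo repository root and buildscripts/resmoke.py is the resmoke entry point:

import subprocess
import sys

# Assumes the current working directory is the repository root.
subprocess.check_call([
    sys.executable, "buildscripts/resmoke.py",
    "--suites=concurrency_sharded_with_stepdowns",
    "--storageEngine=wiredTiger",
])
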
diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml
index 046c806a573..95a57406197 100644
--- a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml
+++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns_and_balancer.yml
@@ -1,14 +1,229 @@
-test_kind: js_test
+test_kind: fsm_workload_test
selector:
roots:
- - jstests/concurrency/fsm_all_sharded_with_stepdowns_and_balancer.js
+ - jstests/concurrency/fsm_workloads/**/*.js
+ exclude_files:
+ # SERVER-13116 distinct isn't sharding aware
+ - jstests/concurrency/fsm_workloads/distinct.js
+ - jstests/concurrency/fsm_workloads/distinct_noindex.js
+ - jstests/concurrency/fsm_workloads/distinct_projection.js
+
+ # SERVER-17397 Drops of sharded namespaces may not fully succeed
+ - jstests/concurrency/fsm_workloads/create_database.js
+ - jstests/concurrency/fsm_workloads/drop_database.js
+
+ # SERVER-14669 Multi-removes that use $where miscount removed documents
+ - jstests/concurrency/fsm_workloads/remove_where.js
+
+ # Disabled due to SERVER-33753, '.count() without a predicate can be wrong on sharded
+ # collections'. This bug is problematic for these workloads because they assert on count()
+ # values:
+ - jstests/concurrency/fsm_workloads/agg_match.js
+
+ # $lookup and $graphLookup are not supported on sharded collections.
+ - jstests/concurrency/fsm_workloads/agg_graph_lookup.js
+ - jstests/concurrency/fsm_workloads/view_catalog_cycle_lookup.js
+
+ # Disabled due to SERVER-20057, 'Concurrent, sharded mapReduces can fail when temporary
+ # namespaces collide across mongos processes'
+ - jstests/concurrency/fsm_workloads/map_reduce_drop.js
+ - jstests/concurrency/fsm_workloads/map_reduce_inline.js
+ - jstests/concurrency/fsm_workloads/map_reduce_merge.js
+ - jstests/concurrency/fsm_workloads/map_reduce_merge_nonatomic.js
+ - jstests/concurrency/fsm_workloads/map_reduce_reduce.js
+ - jstests/concurrency/fsm_workloads/map_reduce_reduce_nonatomic.js
+ - jstests/concurrency/fsm_workloads/map_reduce_replace.js
+ - jstests/concurrency/fsm_workloads/map_reduce_replace_nonexistent.js
+ - jstests/concurrency/fsm_workloads/map_reduce_replace_remove.js
+
+ # Disabled due to SERVER-13364, 'The geoNear command doesn't handle shard versioning, so a
+ # concurrent chunk migration may cause duplicate or missing results'
+ - jstests/concurrency/fsm_workloads/yield_geo_near_dedup.js
+
+ # Disabled due to MongoDB restrictions and/or workload restrictions
+
+ # These workloads sometimes trigger 'Could not lock auth data update lock'
+ # errors because the AuthorizationManager currently waits for only five
+ # seconds to acquire the lock for authorization documents
+ - jstests/concurrency/fsm_workloads/auth_create_role.js
+ - jstests/concurrency/fsm_workloads/auth_create_user.js
+ - jstests/concurrency/fsm_workloads/auth_drop_role.js
+ - jstests/concurrency/fsm_workloads/auth_drop_user.js
+
+ # uses >100MB of data, which can overwhelm test hosts
+ - jstests/concurrency/fsm_workloads/agg_group_external.js
+ - jstests/concurrency/fsm_workloads/agg_sort_external.js
+
+ # compact can only be run against a standalone mongod
+ - jstests/concurrency/fsm_workloads/compact.js
+ - jstests/concurrency/fsm_workloads/compact_simultaneous_padding_bytes.js
+
+ # convertToCapped can't be run on mongos processes
+ - jstests/concurrency/fsm_workloads/convert_to_capped_collection.js
+ - jstests/concurrency/fsm_workloads/convert_to_capped_collection_index.js
+
+ # findAndModify requires a shard key
+ - jstests/concurrency/fsm_workloads/findAndModify_mixed_queue_unindexed.js
+ - jstests/concurrency/fsm_workloads/findAndModify_remove_queue_unindexed.js
+ - jstests/concurrency/fsm_workloads/findAndModify_update_collscan.js
+ - jstests/concurrency/fsm_workloads/findAndModify_update_queue.js
+ - jstests/concurrency/fsm_workloads/findAndModify_update_queue_unindexed.js
+
+ # remove cannot be {} for findAndModify
+ - jstests/concurrency/fsm_workloads/findAndModify_remove_queue.js
+
+ # can cause OOM kills on test hosts
+ - jstests/concurrency/fsm_workloads/findAndModify_update_grow.js
+
+ # eval doesn't work with sharded collections
+ - jstests/concurrency/fsm_workloads/indexed_insert_eval.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_eval_nolock.js
+ - jstests/concurrency/fsm_workloads/remove_single_document_eval.js
+ - jstests/concurrency/fsm_workloads/remove_single_document_eval_nolock.js
+ - jstests/concurrency/fsm_workloads/update_simple_eval.js
+ - jstests/concurrency/fsm_workloads/update_simple_eval_nolock.js
+
+ # cannot ensureIndex after dropDatabase without sharding first
+ - jstests/concurrency/fsm_workloads/plan_cache_drop_database.js
+
+ # our .remove(query, {justOne: true}) calls lack shard keys
+ - jstests/concurrency/fsm_workloads/remove_single_document.js
+
+ # The rename_* workloads are disabled since renameCollection doesn't work with sharded
+ # collections
+ - jstests/concurrency/fsm_workloads/rename_capped_collection_chain.js
+ - jstests/concurrency/fsm_workloads/rename_capped_collection_dbname_chain.js
+ - jstests/concurrency/fsm_workloads/rename_capped_collection_dbname_droptarget.js
+ - jstests/concurrency/fsm_workloads/rename_capped_collection_droptarget.js
+ - jstests/concurrency/fsm_workloads/rename_collection_chain.js
+ - jstests/concurrency/fsm_workloads/rename_collection_dbname_chain.js
+ - jstests/concurrency/fsm_workloads/rename_collection_dbname_droptarget.js
+ - jstests/concurrency/fsm_workloads/rename_collection_droptarget.js
+
+ # our update queries lack shard keys
+ - jstests/concurrency/fsm_workloads/update_upsert_multi.js
+ - jstests/concurrency/fsm_workloads/update_upsert_multi_noindex.js
+
+ # cannot use upsert command with $where with sharded collections
+ - jstests/concurrency/fsm_workloads/upsert_where.js
+
+ # stagedebug can only be run against a standalone mongod
+ - jstests/concurrency/fsm_workloads/yield_and_hashed.js
+ - jstests/concurrency/fsm_workloads/yield_and_sorted.js
+
+ # ChunkHelper directly talks to the config servers and doesn't support retries for network errors
+ - jstests/concurrency/fsm_workloads/sharded_base_partitioned.js
+ - jstests/concurrency/fsm_workloads/sharded_mergeChunks_partitioned.js
+ - jstests/concurrency/fsm_workloads/sharded_moveChunk_drop_shard_key_index.js
+ - jstests/concurrency/fsm_workloads/sharded_moveChunk_partitioned.js
+ - jstests/concurrency/fsm_workloads/sharded_splitChunk_partitioned.js
+
+ # These workloads frequently time out waiting for the distributed lock to drop a sharded
+ # collection.
+ - jstests/concurrency/fsm_workloads/kill_aggregation.js
+ - jstests/concurrency/fsm_workloads/kill_rooted_or.js
+ - jstests/concurrency/fsm_workloads/view_catalog_cycle_with_drop.js
+ - jstests/concurrency/fsm_workloads/view_catalog.js
+
+ # Uses getmores.
+ - jstests/concurrency/fsm_workloads/agg_base.js
+ - jstests/concurrency/fsm_workloads/create_index_background.js
+ - jstests/concurrency/fsm_workloads/globally_managed_cursors.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_ordered_bulk.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_text.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_unordered_bulk.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_upsert.js
+ - jstests/concurrency/fsm_workloads/indexed_insert_where.js
+ - jstests/concurrency/fsm_workloads/list_indexes.js
+ - jstests/concurrency/fsm_workloads/reindex.js
+ - jstests/concurrency/fsm_workloads/reindex_background.js
+ - jstests/concurrency/fsm_workloads/remove_multiple_documents.js
+ - jstests/concurrency/fsm_workloads/touch_base.js
+ - jstests/concurrency/fsm_workloads/touch_data.js
+ - jstests/concurrency/fsm_workloads/touch_index.js
+ - jstests/concurrency/fsm_workloads/touch_no_data_no_index.js
+ - jstests/concurrency/fsm_workloads/update_where.js
+ - jstests/concurrency/fsm_workloads/yield.js
+ - jstests/concurrency/fsm_workloads/yield_fetch.js
+ - jstests/concurrency/fsm_workloads/yield_rooted_or.js
+ - jstests/concurrency/fsm_workloads/yield_sort.js
+ - jstests/concurrency/fsm_workloads/yield_sort_merge.js
+ - jstests/concurrency/fsm_workloads/yield_text.js
+
+ # Uses non retryable writes.
+ - jstests/concurrency/fsm_workloads/remove_and_bulk_insert.js
+ - jstests/concurrency/fsm_workloads/update_and_bulk_insert.js
+ - jstests/concurrency/fsm_workloads/update_check_index.js
+ - jstests/concurrency/fsm_workloads/update_multifield_multiupdate.js
+ - jstests/concurrency/fsm_workloads/update_multifield_multiupdate_noindex.js
+ - jstests/concurrency/fsm_workloads/update_ordered_bulk_inc.js
+ - jstests/concurrency/fsm_workloads/yield_geo_near.js
+ - jstests/concurrency/fsm_workloads/yield_id_hack.js
+
+ # Uses non retryable commands.
+ - jstests/concurrency/fsm_workloads/agg_out.js
+ - jstests/concurrency/fsm_workloads/agg_sort.js
+ - jstests/concurrency/fsm_workloads/collmod.js
+ - jstests/concurrency/fsm_workloads/collmod_separate_collections.js
+ - jstests/concurrency/fsm_workloads/kill_multicollection_aggregation.js
+ - jstests/concurrency/fsm_workloads/invalidated_cursors.js
+
+ # The auto_retry_on_network_error.js override needs to overwrite the response from drop on
+ # NamespaceNotFound, and since this workload only creates and drops collections there isn't
+ # much value in running it.
+ - jstests/concurrency/fsm_workloads/drop_collection.js
-# Concurrency tests that run against a sharded cluster start one themselves.
executor:
archive:
+ hooks:
+ - CheckReplDBHash
+ - ValidateCollections
tests: true
config:
shell_options:
- nodb: ''
readMode: commands
+ global_vars:
+ TestData:
+ runningWithConfigStepdowns: true
+ runningWithShardStepdowns: true
+ useStepdownPermittedFile: true
+ useSteppingDownFile: true
+ usingReplicaSetShards: true
+ hooks:
+ - class: ContinuousStepdown
+ config_stepdown: true
+ shard_stepdown: true
+ use_stepdown_permitted_file: true
+ use_stepping_down_file: true
+ wait_for_mongos_retarget: true
+ - class: CheckReplDBHash
+ - class: CleanupConcurrencyWorkloads
+ fixture:
+ class: ShardedClusterFixture
+ mongos_options:
+ set_parameters:
+ enableTestCommands: 1
+ configsvr_options:
+ num_nodes: 3
+ all_nodes_electable: true
+ replset_config_options:
+ settings:
+ catchUpTimeoutMillis: 0
+ electionTimeoutMillis: 5000
+ voting_secondaries: true
+ shard_options:
+ all_nodes_electable: true
+ mongod_options:
+ oplogSize: 1024
+ replset_config_options:
+ settings:
+ catchUpTimeoutMillis: 0
+ electionTimeoutMillis: 5000
+ voting_secondaries: true
+ mongod_options:
+ set_parameters:
+ enableTestCommands: 1
+ num_rs_nodes_per_shard: 3
+ num_shards: 2
+ num_mongos: 2
diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py
index 1dee53769fb..6645f183087 100644
--- a/buildscripts/resmokelib/testing/hooks/stepdown.py
+++ b/buildscripts/resmokelib/testing/hooks/stepdown.py
@@ -2,14 +2,16 @@
from __future__ import absolute_import
import collections
+import os.path
import random
-import time
import threading
+import time
import bson
import pymongo.errors
from buildscripts.resmokelib import errors
+from buildscripts.resmokelib import utils
from buildscripts.resmokelib.testing.hooks import interface
from buildscripts.resmokelib.testing.fixtures import replicaset
from buildscripts.resmokelib.testing.fixtures import shardedcluster
@@ -23,7 +25,9 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
def __init__( # pylint: disable=too-many-arguments
self, hook_logger, fixture, config_stepdown=True, shard_stepdown=True,
- stepdown_duration_secs=10, stepdown_interval_ms=8000, terminate=False, kill=False):
+ stepdown_duration_secs=10, stepdown_interval_ms=8000, terminate=False, kill=False,
+ use_stepdown_permitted_file=False, use_stepping_down_file=False,
+ wait_for_mongos_retarget=False):
"""Initialize the ContinuousStepdown.
Args:
@@ -35,6 +39,10 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
stepdown_interval_ms: the number of milliseconds between stepdowns.
terminate: shut down the node cleanly as a means of stepping it down.
kill: With a 50% probability, kill the node instead of shutting it down cleanly.
+ use_stepdown_permitted_file: use a file to control whether the stepdown thread may perform a stepdown.
+ use_stepping_down_file: use a file to denote when a stepdown is in progress.
+ wait_for_mongos_retarget: whether to run collStats on all mongoses for each collection
+ in each database, after pausing the stepdown thread.
Note that the "terminate" and "kill" arguments are named after the "SIGTERM" and
"SIGKILL" signals that are used to stop the process. On Windows, there are no signals,
@@ -47,21 +55,41 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
self._shard_stepdown = shard_stepdown
self._stepdown_duration_secs = stepdown_duration_secs
self._stepdown_interval_secs = float(stepdown_interval_ms) / 1000
+ self._wait_for_mongos_retarget = wait_for_mongos_retarget
self._rs_fixtures = []
+ self._mongos_fixtures = []
self._stepdown_thread = None
# kill implies terminate.
self._terminate = terminate or kill
self._kill = kill
+ # The stepdown file names need to match the same construction as found in
+ # jstests/concurrency/fsm_libs/resmoke_runner.js.
+ dbpath_prefix = fixture.get_dbpath_prefix()
+
+ if use_stepdown_permitted_file:
+ self._stepdown_permitted_file = os.path.join(
+ dbpath_prefix, "concurrency_sharded_stepdown_stepdown_permitted")
+ else:
+ self._stepdown_permitted_file = None
+ if use_stepping_down_file:
+ self._stepping_down_file = os.path.join(dbpath_prefix,
+ "concurrency_sharded_stepdown_stepping_down")
+ else:
+ self._stepping_down_file = None
+
def before_suite(self, test_report):
"""Before suite."""
if not self._rs_fixtures:
self._add_fixture(self._fixture)
+ utils.remove_if_exists(self._stepdown_permitted_file)
+ utils.remove_if_exists(self._stepping_down_file)
self._stepdown_thread = _StepdownThread(
- self.logger, self._rs_fixtures, self._stepdown_interval_secs,
- self._stepdown_duration_secs, self._terminate, self._kill)
+ self.logger, self._mongos_fixtures, self._rs_fixtures, self._stepdown_interval_secs,
+ self._stepdown_duration_secs, self._terminate, self._kill,
+ self._stepdown_permitted_file, self._stepping_down_file, self._wait_for_mongos_retarget)
self.logger.info("Starting the stepdown thread.")
self._stepdown_thread.start()
@@ -69,11 +97,16 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
"""After suite."""
self.logger.info("Stopping the stepdown thread.")
self._stepdown_thread.stop()
+ self.logger.info("Stepdown thread stopped.")
def before_test(self, test, test_report):
"""Before test."""
self._check_thread()
self.logger.info("Resuming the stepdown thread.")
+ # Once the stepdown thread has started, any files it creates must be deleted within the
+ # thread, since the Windows file handle is still open.
+ self._stepdown_thread.pause()
+ self._stepdown_thread.clean_stepdown_files()
self._stepdown_thread.resume()
def after_test(self, test, test_report):
@@ -102,21 +135,29 @@ class ContinuousStepdown(interface.Hook): # pylint: disable=too-many-instance-a
self._add_fixture(shard_fixture)
if self._config_stepdown:
self._add_fixture(fixture.configsvr)
+ if self._wait_for_mongos_retarget:
+ for mongos_fixture in fixture.mongos:
+ self._mongos_fixtures.append(mongos_fixture)
class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-attributes
def __init__( # pylint: disable=too-many-arguments
- self, logger, rs_fixtures, stepdown_interval_secs, stepdown_duration_secs, terminate,
- kill):
+ self, logger, mongos_fixtures, rs_fixtures, stepdown_interval_secs,
+ stepdown_duration_secs, terminate, kill, stepdown_permitted_file, stepping_down_file,
+ wait_for_mongos_retarget):
"""Initialize _StepdownThread."""
threading.Thread.__init__(self, name="StepdownThread")
self.daemon = True
self.logger = logger
+ self._mongos_fixtures = mongos_fixtures
self._rs_fixtures = rs_fixtures
self._stepdown_interval_secs = stepdown_interval_secs
self._stepdown_duration_secs = stepdown_duration_secs
self._terminate = terminate
self._kill = kill
+ self._stepdown_permitted_file = stepdown_permitted_file
+ self._stepping_down_file = stepping_down_file
+ self._should_wait_for_mongos_retarget = wait_for_mongos_retarget
self._last_exec = time.time()
# Event set when the thread has been stopped using the 'stop()' method.
@@ -137,20 +178,24 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at
return
while True:
- self._pause_if_needed()
if self._is_stopped():
break
+ self._wait_for_permission_or_resume()
now = time.time()
if now - self._last_exec > self._stepdown_interval_secs:
+ self.logger.info("Starting stepdown of all primaries")
self._step_down_all()
# Wait until each replica set has a primary, so the test can make progress.
self._await_primaries()
self._last_exec = time.time()
+ self.logger.info("Completed stepdown of all primaries in %0d ms",
+ (self._last_exec - now) * 1000)
now = time.time()
- # 'wait_secs' is used to wait 'self._stepdown_interval_secs' from the moment the last
- # stepdown command was sent.
- wait_secs = max(0, self._stepdown_interval_secs - (now - self._last_exec))
- self._wait(wait_secs)
+ if self._is_permitted():
+ # The 'wait_secs' is used to wait 'self._stepdown_interval_secs' from the moment
+ # the last stepdown command was sent.
+ wait_secs = max(0, self._stepdown_interval_secs - (now - self._last_exec))
+ self._wait(wait_secs)
def stop(self):
"""Stop the thread."""
@@ -169,6 +214,8 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at
self._is_idle_evt.wait()
# Wait until all the replica sets have primaries.
self._await_primaries()
+ # Wait for Mongos to retarget the primary for each shard and the config server.
+ self._do_wait_for_mongos_retarget()
def resume(self):
"""Resume the thread."""
@@ -178,9 +225,14 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at
"Current statistics about which nodes have been successfully stepped up: %s",
self._step_up_stats)
- def _pause_if_needed(self):
- # Wait until resume or stop.
- self._is_resumed_evt.wait()
+ def _wait_for_permission_or_resume(self):
+ # Wait until the thread is stopped, the stepdown permitted file exists, or the thread is resumed.
+ if self._stepdown_permitted_file:
+ while not os.path.isfile(self._stepdown_permitted_file) and not self._is_stopped():
+ # Set a short sleep during busy wait time for self._stepdown_permitted_file.
+ self._wait(0.1)
+ else:
+ self._is_resumed_evt.wait()
def _wait(self, timeout):
# Wait until stop or timeout.
@@ -192,10 +244,13 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at
def _step_down_all(self):
self._is_idle_evt.clear()
+ self._stepdown_starting()
try:
- for rs_fixture in self._rs_fixtures:
- self._step_down(rs_fixture)
+ if self._is_permitted():
+ for rs_fixture in self._rs_fixtures:
+ self._step_down(rs_fixture)
finally:
+ self._stepdown_completed()
self._is_idle_evt.set()
def _step_down(self, rs_fixture):
@@ -284,3 +339,78 @@ class _StepdownThread(threading.Thread): # pylint: disable=too-many-instance-at
key = "{}/{}".format(rs_fixture.replset_name,
chosen.get_internal_connection_string() if secondaries else "none")
self._step_up_stats[key] += 1
+
+ def _do_wait_for_mongos_retarget(self): # pylint: disable=too-many-branches
+ """Run collStats on each collection in each database on each mongos.
+
+ This is to ensure mongos can target the primary for each shard with data, including the
+ config servers.
+ """
+ if not self._should_wait_for_mongos_retarget:
+ return
+
+ for mongos_fixture in self._mongos_fixtures:
+ mongos_conn_str = mongos_fixture.get_internal_connection_string()
+ try:
+ client = mongos_fixture.mongo_client()
+ except pymongo.errors.AutoReconnect:
+ pass
+ for db in client.database_names():
+ self.logger.info("Waiting for mongos %s to retarget db: %s", mongos_conn_str, db)
+ start_time = time.time()
+ while True:
+ try:
+ coll_names = client[db].collection_names()
+ break
+ except pymongo.errors.NotMasterError:
+ pass
+ retarget_time = time.time() - start_time
+ if retarget_time >= 60:
+ self.logger.exception(
+ "Timeout waiting for mongos: %s to retarget to db: %s", mongos_conn_str,
+ db)
+ raise # pylint: disable=misplaced-bare-raise
+ time.sleep(0.2)
+ for coll in coll_names:
+ while True:
+ try:
+ client[db].command({"collStats": coll})
+ break
+ except pymongo.errors.NotMasterError:
+ pass
+ retarget_time = time.time() - start_time
+ if retarget_time >= 60:
+ self.logger.exception(
+ "Timeout waiting for mongos: %s to retarget to db: %s",
+ mongos_conn_str, db)
+ raise # pylint: disable=misplaced-bare-raise
+ time.sleep(0.2)
+ retarget_time = time.time() - start_time
+ self.logger.info("Finished waiting for mongos: %s to retarget db: %s, in %d ms",
+ mongos_conn_str, db, retarget_time * 1000)
+
+ def _is_permitted(self):
+ """Permit a stepdown if the permitted file is not specified or it exists.
+
+ The self._permitted_file is created by an external framework, i.e., JS tests.
+ """
+ if self._stepdown_permitted_file:
+ return os.path.isfile(self._stepdown_permitted_file)
+ return self._is_resumed_evt.is_set()
+
+ def _stepdown_starting(self):
+ """Create self._stepping_down_file, if specified."""
+ if self._stepping_down_file:
+ if os.path.isfile(self._stepping_down_file):
+ raise # pylint: disable=misplaced-bare-raise
+ with open(self._stepping_down_file, "w") as fh:
+ fh.write("")
+
+ def _stepdown_completed(self):
+ """Delete self._stepping_down_file, if specified."""
+ utils.remove_if_exists(self._stepping_down_file)
+
+ def clean_stepdown_files(self):
+ """Remove the stepdown files."""
+ utils.remove_if_exists(self._stepdown_permitted_file)
+ utils.remove_if_exists(self._stepping_down_file)
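
The new _do_wait_for_mongos_retarget method repeats the same pattern twice: run an operation against a mongos, swallow NotMasterError, and give up after roughly 60 seconds. A standalone sketch of that retry loop follows; the helper name is invented for illustration and is not part of the patch.

import time

import pymongo.errors


def retry_until_retarget(operation, timeout_secs=60, sleep_secs=0.2):
    """Retry 'operation' until it stops raising NotMasterError or a timeout elapses.

    Hypothetical helper mirroring the loop that _do_wait_for_mongos_retarget runs
    for collection_names() and for the collStats command.
    """
    start_time = time.time()
    while True:
        try:
            return operation()
        except pymongo.errors.NotMasterError:
            if time.time() - start_time >= timeout_secs:
                raise
            time.sleep(sleep_secs)


# Example usage against an existing client:
#   retry_until_retarget(lambda: client[db_name].command({"collStats": coll_name}))
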
diff --git a/buildscripts/resmokelib/testing/testcases/fsm_workload_test.py b/buildscripts/resmokelib/testing/testcases/fsm_workload_test.py
index ea8f91692fb..1598273a448 100644
--- a/buildscripts/resmokelib/testing/testcases/fsm_workload_test.py
+++ b/buildscripts/resmokelib/testing/testcases/fsm_workload_test.py
@@ -5,6 +5,7 @@ from __future__ import absolute_import
import os.path
import threading
+from buildscripts.resmokelib.testing.testcases import interface
from buildscripts.resmokelib.testing.testcases import jsrunnerfile
@@ -24,11 +25,26 @@ class FSMWorkloadTestCase(jsrunnerfile.JSRunnerFileTestCase):
self.same_collection = same_collection
self.same_db = same_db or self.same_collection
self.db_name_prefix = db_name_prefix
+ self.dbpath_prefix = None
jsrunnerfile.JSRunnerFileTestCase.__init__(
self, logger, "FSM workload", fsm_workload,
test_runner_file="jstests/concurrency/fsm_libs/resmoke_runner.js",
shell_executable=shell_executable, shell_options=shell_options)
+ def configure(self, fixture, *args, **kwargs):
+ """Configure the FSMWorkloadTestCase runner."""
+ interface.ProcessTestCase.configure(self, fixture, *args, **kwargs)
+
+ self.dbpath_prefix = self.fixture.get_dbpath_prefix()
+
+ global_vars = self.shell_options.get("global_vars", {}).copy()
+
+ test_data = global_vars.get("TestData", {}).copy()
+ self._populate_test_data(test_data)
+
+ global_vars["TestData"] = test_data
+ self.shell_options["global_vars"] = global_vars
+
@property
def fsm_workload(self):
"""Get the test name."""
@@ -37,6 +53,7 @@ class FSMWorkloadTestCase(jsrunnerfile.JSRunnerFileTestCase):
def _populate_test_data(self, test_data):
test_data["fsmWorkloads"] = self.fsm_workload
+ test_data["resmokeDbPathPrefix"] = self.dbpath_prefix
with FSMWorkloadTestCase._COUNTER_LOCK:
count = FSMWorkloadTestCase._COUNTER
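
The configure() override follows the usual resmoke pattern: copy the existing global_vars and TestData dictionaries, add the new keys (here resmokeDbPathPrefix via _populate_test_data), and assign the merged result back to shell_options so shared defaults are never mutated. A standalone sketch of that merge with plain dicts; the helper name and example values are illustrative, not resmoke code.

def merge_test_data(shell_options, extra_test_data):
    """Merge extra keys into shell_options["global_vars"]["TestData"] without
    mutating the dictionaries that were passed in."""
    shell_options = dict(shell_options)
    global_vars = shell_options.get("global_vars", {}).copy()
    test_data = global_vars.get("TestData", {}).copy()
    test_data.update(extra_test_data)
    global_vars["TestData"] = test_data
    shell_options["global_vars"] = global_vars
    return shell_options


print(merge_test_data({"readMode": "commands"},
                      {"resmokeDbPathPrefix": "/data/db/job0"}))
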
diff --git a/buildscripts/resmokelib/utils/__init__.py b/buildscripts/resmokelib/utils/__init__.py
index 9213302f8be..6b6a76d1f45 100644
--- a/buildscripts/resmokelib/utils/__init__.py
+++ b/buildscripts/resmokelib/utils/__init__.py
@@ -61,6 +61,15 @@ def is_windows():
return sys.platform.startswith("win32") or sys.platform.startswith("cygwin")
+def remove_if_exists(path):
+ """Remove path if it exists."""
+ if path is not None and os.path.exists(path):
+ try:
+ os.remove(path)
+ except OSError:
+ pass
+
+
def is_string_list(lst):
"""Return true if 'lst' is a list of strings, and false otherwise."""
return isinstance(lst, list) and all(isinstance(x, basestring) for x in lst)
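
remove_if_exists() is intentionally forgiving: a None path is a no-op and losing a race to another deleter (the OSError) is swallowed, which is what the stepdown hook relies on when cleaning up marker files the shell may already have removed. Assuming the repository root is on sys.path, usage is simply:

from buildscripts.resmokelib import utils

utils.remove_if_exists(None)                      # no-op
utils.remove_if_exists("/tmp/stepdown.marker")    # removed only if it still exists
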
diff --git a/etc/evergreen.yml b/etc/evergreen.yml
index e92737c2cd2..19ed7e2c8ed 100644
--- a/etc/evergreen.yml
+++ b/etc/evergreen.yml
@@ -4762,7 +4762,6 @@ tasks:
- func: "run tests"
vars:
resmoke_args: --suites=concurrency_sharded_with_stepdowns --storageEngine=wiredTiger
- timeout_secs: 21600 # 6 hour timeout for each test
- <<: *task_template
name: concurrency_sharded_with_stepdowns_and_balancer
@@ -4771,7 +4770,6 @@ tasks:
- func: "run tests"
vars:
resmoke_args: --suites=concurrency_sharded_with_stepdowns_and_balancer --storageEngine=wiredTiger
- timeout_secs: 21600 # 6 hour timeout for each test
- <<: *task_template
name: concurrency_simultaneous
diff --git a/jstests/concurrency/fsm_libs/cluster.js b/jstests/concurrency/fsm_libs/cluster.js
index 8ad92353ea8..73de632ef7f 100644
--- a/jstests/concurrency/fsm_libs/cluster.js
+++ b/jstests/concurrency/fsm_libs/cluster.js
@@ -275,10 +275,7 @@ var Cluster = function(options) {
rawST.startContinuousFailover();
};
- this.stopContinuousFailover = function() {
- rawST.stopContinuousFailover(
- {waitForPrimary: true, waitForMongosRetarget: true});
-
+ this.reestablishConnectionsAfterFailover = function() {
// Call getPrimary() to re-establish the connections in FSMShardingTest
// as it is not a transparent proxy for SharingTest/rawST.
st._configsvr.getPrimary();
@@ -286,6 +283,12 @@ var Cluster = function(options) {
rst.getPrimary();
}
};
+
+ this.stopContinuousFailover = function() {
+ rawST.stopContinuousFailover(
+ {waitForPrimary: true, waitForMongosRetarget: true});
+ this.reestablishConnectionsAfterFailover();
+ };
}
// Save all mongos, mongod, and ReplSet connections (if any).
@@ -394,17 +397,7 @@ var Cluster = function(options) {
if (!fn || typeof(fn) !== 'function' || fn.length !== 1) {
throw new Error('config function must be a function that takes a db as an argument');
}
-
- var configs = [];
- var config = st.c(0);
- var i = 0;
- while (config) {
- configs.push(config);
- ++i;
- config = st.c(i);
- }
-
- configs.forEach(function(conn) {
+ st._configServers.forEach(function(conn) {
fn(conn.getDB('admin'));
});
};
diff --git a/jstests/concurrency/fsm_libs/resmoke_runner.js b/jstests/concurrency/fsm_libs/resmoke_runner.js
index 3187a16bc05..5228d7f16c6 100644
--- a/jstests/concurrency/fsm_libs/resmoke_runner.js
+++ b/jstests/concurrency/fsm_libs/resmoke_runner.js
@@ -104,6 +104,18 @@
cleanup.push(workload);
});
+ // After the $config.setup() function has been called, it is safe for the stepdown
+ // thread to start running. The main thread won't attempt to interact with the cluster
+ // until all of the spawned worker threads have finished.
+ //
+
+ // Indicate that the stepdown thread can run. It is unnecessary for the stepdown thread
+ // to indicate that it is going to start running because it will eventually do so after the
+ // worker threads have started.
+ if (typeof executionOptions.stepdownPermittedFile === 'string') {
+ writeFile(executionOptions.stepdownPermittedFile, '');
+ }
+
// Since the worker threads may be running with causal consistency enabled, we set the
// initial clusterTime and initial operationTime for the sessions they'll create so that
// they are guaranteed to observe the effects of the workload's $config.setup() function
@@ -128,19 +140,41 @@
}
try {
- // Start this set of worker threads.
- threadMgr.spawnAll(cluster, executionOptions);
- // Allow 20% of the threads to fail. This allows the workloads to run on
- // underpowered test hosts.
- threadMgr.checkFailed(0.2);
+ try {
+ // Start this set of worker threads.
+ threadMgr.spawnAll(cluster, executionOptions);
+ // Allow 20% of the threads to fail. This allows the workloads to run on
+ // underpowered test hosts.
+ threadMgr.checkFailed(0.2);
+ } finally {
+ // Threads must be joined before destruction, so do this even in the presence of
+ // exceptions.
+ errors.push(...threadMgr.joinAll().map(
+ e => new WorkloadFailure(
+ e.err, e.stack, e.tid, 'Foreground ' + e.workloads.join(' '))));
+ }
} finally {
- // Threads must be joined before destruction, so do this even in the presence of
- // exceptions.
- errors.push(...threadMgr.joinAll().map(
- e => new WorkloadFailure(
- e.err, e.stack, e.tid, 'Foreground ' + e.workloads.join(' '))));
+ // Until we are guaranteed that the stepdown thread isn't running, it isn't safe for
+ // the $config.teardown() function to be called. We should signal to resmoke.py that
+ // the stepdown thread should stop running and wait for the stepdown thread to
+ // signal that it has stopped.
+ //
+ // Signal to the stepdown thread to stop stepping down the cluster.
+ if (typeof executionOptions.stepdownPermittedFile === 'string' &&
+ typeof executionOptions.steppingDownFile === 'string') {
+ removeFile(executionOptions.stepdownPermittedFile);
+ // Wait for the steppingDownFile to be removed by the stepdown thread.
+ assert.soonNoExcept(function() {
+ if (ls().indexOf(executionOptions.steppingDownFile) === -1) {
+ return true;
+ }
+ }, "stepdown still in progress");
+ }
}
} finally {
+ if (cluster.shouldPerformContinuousStepdowns()) {
+ cluster.reestablishConnectionsAfterFailover();
+ }
// Call each workload's teardown function. After all teardowns have completed check if
// any of them failed.
const cleanupResults = cleanup.map(
@@ -206,6 +240,19 @@
}
const executionOptions = {dbNamePrefix: TestData.dbNamePrefix || ""};
+ const resmokeDbPathPrefix = TestData.resmokeDbPathPrefix || ".";
+
+ // The stepdown file names need to match the same construction as found in
+ // buildscripts/resmokelib/testing/hooks/stepdown.py.
+ if (TestData.useStepdownPermittedFile) {
+ executionOptions.stepdownPermittedFile =
+ resmokeDbPathPrefix + '/concurrency_sharded_stepdown_stepdown_permitted';
+ }
+
+ if (TestData.useSteppingDownFile) {
+ executionOptions.steppingDownFile =
+ resmokeDbPathPrefix + '/concurrency_sharded_stepdown_stepping_down';
+ }
if (Object.keys(sessionOptions).length > 0) {
executionOptions.sessionOptions = sessionOptions;
diff --git a/jstests/concurrency/fsm_libs/runner.js b/jstests/concurrency/fsm_libs/runner.js
index e35ce46b2fa..9a729ba6008 100644
--- a/jstests/concurrency/fsm_libs/runner.js
+++ b/jstests/concurrency/fsm_libs/runner.js
@@ -47,6 +47,8 @@ var runner = (function() {
'dbNamePrefix',
'iterationMultiplier',
'sessionOptions',
+ 'stepdownPermittedFile',
+ 'steppingDownFile',
'threadMultiplier'
];
@@ -105,6 +107,18 @@ var runner = (function() {
1,
'expected iterationMultiplier to be greater than or equal to 1');
+ if (typeof options.stepdownPermittedFile !== 'undefined') {
+ assert.eq('string',
+ typeof options.stepdownPermittedFile,
+ 'expected stepdownPermittedFile to be a string');
+ }
+
+ if (typeof options.steppingDownFile !== 'undefined') {
+ assert.eq('string',
+ typeof options.steppingDownFile,
+ 'expected steppingDownFile to be a string');
+ }
+
options.threadMultiplier = options.threadMultiplier || 1;
assert(Number.isInteger(options.threadMultiplier),
'expected threadMultiplier to be an integer');