diff options
author:    Brett Nawrocki <brett.nawrocki@mongodb.com>  2021-10-15 19:02:39 +0000
committer: Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-11-23 22:53:36 +0000
commit:    417283d903c8234f7c444851a438275fa8a67e1b (patch)
tree:      8308313ffd2111aae50aace0825fca632f423e71
parent:    0abc2f80baa4a9f77955e4c64419bff29799301a (diff)
download:  mongo-417283d903c8234f7c444851a438275fa8a67e1b.tar.gz
SERVER-54623 Wait longer for session collection in ShardedClusterFixture
Test suites using the ShardedClusterFixture could time out waiting for
replication after calling refreshLogicalSessionCacheNow during setup.
This was most commonly seen during periods of high disk utilization.
Instead of failing the test, wait however long it takes.
(cherry picked from commit 4dd7f84be70ee95893ecf863f2d82c79edb2b8df)
3 files changed, 29 insertions, 11 deletions
diff --git a/buildscripts/resmokelib/testing/fixtures/interface.py b/buildscripts/resmokelib/testing/fixtures/interface.py index 321955bcddd..c56353d60c3 100644 --- a/buildscripts/resmokelib/testing/fixtures/interface.py +++ b/buildscripts/resmokelib/testing/fixtures/interface.py @@ -26,6 +26,14 @@ def make_fixture(class_name, *args, **kwargs): class Fixture(object, metaclass=registry.make_registry_metaclass(_FIXTURES)): """Base class for all fixtures.""" + # Error response codes copied from mongo/base/error_codes.yml. + _WRITE_CONCERN_FAILED = 64 + _NODE_NOT_FOUND = 74 + _NEW_REPLICA_SET_CONFIGURATION_INCOMPATIBLE = 103 + _CONFIGURATION_IN_PROGRESS = 109 + _CURRENT_CONFIG_NOT_COMMITTED_YET = 308 + _INTERRUPTED_DUE_TO_REPL_STATE_CHANGE = 11602 + # We explicitly set the 'REGISTERED_NAME' attribute so that PyLint realizes that the attribute # is defined for all subclasses of Fixture. REGISTERED_NAME = "Fixture" diff --git a/buildscripts/resmokelib/testing/fixtures/replicaset.py b/buildscripts/resmokelib/testing/fixtures/replicaset.py index 9c449dbc432..0f0996c5d76 100644 --- a/buildscripts/resmokelib/testing/fixtures/replicaset.py +++ b/buildscripts/resmokelib/testing/fixtures/replicaset.py @@ -19,9 +19,6 @@ from ... import utils class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-instance-attributes """Fixture which provides JSTests with a replica set to run against.""" - # Error response codes copied from mongo/base/error_codes.err. - _NODE_NOT_FOUND = 74 - def __init__( # pylint: disable=too-many-arguments, too-many-locals self, logger, job_num, mongod_executable=None, mongod_options=None, dbpath_prefix=None, preserve_dbpath=False, num_nodes=2, start_initial_sync_node=False, @@ -196,7 +193,7 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst raise errors.ServerFailure(msg) time.sleep(5) # Wait a little bit before trying again. 
- def await_last_op_committed(self): + def await_last_op_committed(self, timeout_secs=None): """Wait for the last majority committed op to be visible.""" primary_client = self.get_primary().mongo_client() self.auth(primary_client, self.auth_options) @@ -221,7 +218,8 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst return len(up_to_date_nodes) == len(self.nodes) - self._await_cmd_all_nodes(check_rcmaj_optime, "waiting for last committed optime") + self._await_cmd_all_nodes(check_rcmaj_optime, "waiting for last committed optime", + timeout_secs) def await_ready(self): """Wait for replica set to be ready.""" @@ -391,7 +389,7 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst return self._await_cmd_all_nodes(is_primary, "waiting for a primary", timeout_secs) - def _await_cmd_all_nodes(self, fn, msg, timeout_secs=30): + def _await_cmd_all_nodes(self, fn, msg, timeout_secs=None): """Run `fn` on all nodes until it returns a truthy value. Return the node for which makes `fn` become truthy. @@ -400,6 +398,8 @@ class ReplicaSetFixture(interface.ReplFixture): # pylint: disable=too-many-inst the MongoDFixture corresponding to that node. 
""" + if timeout_secs is None: + timeout_secs = self.AWAIT_REPL_TIMEOUT_MINS * 60 start = time.time() clients = {} while True: diff --git a/buildscripts/resmokelib/testing/fixtures/shardedcluster.py b/buildscripts/resmokelib/testing/fixtures/shardedcluster.py index f2a6814b171..50f1573381d 100644 --- a/buildscripts/resmokelib/testing/fixtures/shardedcluster.py +++ b/buildscripts/resmokelib/testing/fixtures/shardedcluster.py @@ -82,6 +82,19 @@ class ShardedClusterFixture(interface.Fixture): # pylint: disable=too-many-inst for shard in self.shards: shard.setup() + def refresh_logical_session_cache(self, target): + """Refresh logical session cache with no timeout.""" + primary = (target.mongo_client() + if self.num_rs_nodes_per_shard is None else target.get_primary().mongo_client()) + try: + primary.admin.command({"refreshLogicalSessionCacheNow": 1}) + except pymongo.errors.OperationFailure as err: + if err.code != self._WRITE_CONCERN_FAILED: + raise err + self.logger.info("Ignoring write concern timeout for refreshLogicalSessionCacheNow " + "command and continuing to wait") + target.await_last_op_committed(target.AWAIT_REPL_TIMEOUT_FOREVER_MINS * 60) + def await_ready(self): """Block until the fixture can be used for testing.""" # Wait for the config server @@ -137,13 +150,10 @@ class ShardedClusterFixture(interface.Fixture): # pylint: disable=too-many-inst # Ensure that the sessions collection gets auto-sharded by the config server if self.configsvr is not None: - primary = self.configsvr.get_primary().mongo_client() - primary.admin.command({"refreshLogicalSessionCacheNow": 1}) + self.refresh_logical_session_cache(self.configsvr) for shard in self.shards: - primary = (shard.mongo_client() if self.num_rs_nodes_per_shard is None else - shard.get_primary().mongo_client()) - primary.admin.command({"refreshLogicalSessionCacheNow": 1}) + self.refresh_logical_session_cache(shard) def _auth_to_db(self, client): """Authenticate client for the 'authenticationDatabase'.""" |