"""Replica set fixture for executing JSTests against."""

from __future__ import absolute_import

import os.path
import time

import pymongo
import pymongo.errors
import pymongo.write_concern

from . import interface
from . import standalone
from ... import config
from ... import errors
from ... import utils


class ReplicaSetFixture(interface.ReplFixture):  # pylint: disable=too-many-instance-attributes
    """Fixture which provides JSTests with a replica set to run against."""

    # Error response codes copied from mongo/base/error_codes.err.
    _NODE_NOT_FOUND = 74

    def __init__(  # pylint: disable=too-many-arguments
            self, logger, job_num, mongod_executable=None, mongod_options=None, dbpath_prefix=None,
            preserve_dbpath=False, num_nodes=2, start_initial_sync_node=False,
            write_concern_majority_journal_default=None, auth_options=None,
            replset_config_options=None, voting_secondaries=None, all_nodes_electable=False,
            use_replica_set_connection_string=None):
        """Initialize ReplicaSetFixture."""

        interface.ReplFixture.__init__(self, logger, job_num, dbpath_prefix=dbpath_prefix)

        self.mongod_executable = mongod_executable
        self.mongod_options = utils.default_if_none(mongod_options, {})
        self.preserve_dbpath = preserve_dbpath
        self.num_nodes = num_nodes
        self.start_initial_sync_node = start_initial_sync_node
        self.write_concern_majority_journal_default = write_concern_majority_journal_default
        self.auth_options = auth_options
        self.replset_config_options = utils.default_if_none(replset_config_options, {})
        self.voting_secondaries = voting_secondaries
        self.all_nodes_electable = all_nodes_electable
        self.use_replica_set_connection_string = use_replica_set_connection_string

        # If voting_secondaries has not been set, set a default. By default, secondaries have zero
        # votes unless they are also nodes capable of being elected primary.
        if self.voting_secondaries is None:
            self.voting_secondaries = self.all_nodes_electable

        # By default, we only use a replica set connection string if all nodes are capable of being
        # elected primary.
        if self.use_replica_set_connection_string is None:
            self.use_replica_set_connection_string = self.all_nodes_electable
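
        # Taken together, these defaults mean a plain two-node fixture gets one
        # electable node, one non-voting priority-0 secondary, and a direct
        # (non-replica-set) driver connection string.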

        # Set the default oplogSize to 511MB.
        self.mongod_options.setdefault("oplogSize", 511)

        # The dbpath in mongod_options is used as the dbpath prefix for replica set members and
        # takes precedence over other settings. The ShardedClusterFixture uses this parameter to
        # create replica sets and assign their dbpath structure explicitly.
        if "dbpath" in self.mongod_options:
            self._dbpath_prefix = self.mongod_options.pop("dbpath")
        else:
            self._dbpath_prefix = os.path.join(self._dbpath_prefix, config.FIXTURE_SUBDIR)

        self.nodes = []
        self.replset_name = None
        self.initial_sync_node = None
        self.initial_sync_node_idx = -1

    def setup(self):  # pylint: disable=too-many-branches,too-many-statements
        """Set up the replica set."""
        self.replset_name = self.mongod_options.get("replSet", "rs")

        if not self.nodes:
            for i in range(self.num_nodes):
                node = self._new_mongod(i, self.replset_name)
                self.nodes.append(node)

        for node in self.nodes:
            node.setup()

        if self.start_initial_sync_node:
            if not self.initial_sync_node:
                self.initial_sync_node_idx = len(self.nodes)
                self.initial_sync_node = self._new_mongod(self.initial_sync_node_idx,
                                                          self.replset_name)
            self.initial_sync_node.setup()
            self.initial_sync_node.await_ready()

        # We only need to wait to connect to the first node of the replica set because we
        # initiate it as a single-node replica set first.
        self.nodes[0].await_ready()

        # Initiate the replica set.
        members = []
        for (i, node) in enumerate(self.nodes):
            member_info = {"_id": i, "host": node.get_internal_connection_string()}
            if i > 0:
                if not self.all_nodes_electable:
                    member_info["priority"] = 0
                if i >= 7 or not self.voting_secondaries:
                    # Only 7 nodes in a replica set can vote, so the other members must still be
                    # non-voting when this fixture is configured to have voting secondaries.
                    member_info["votes"] = 0
            members.append(member_info)
        if self.initial_sync_node:
            members.append({
                "_id": self.initial_sync_node_idx,
                "host": self.initial_sync_node.get_internal_connection_string(), "priority": 0,
                "hidden": 1, "votes": 0
            })
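        # With the defaults (only the first node electable, non-voting secondaries),
        # the resulting members list looks like this (hosts and ports illustrative):
        #   [{"_id": 0, "host": "localhost:20000"},
        #    {"_id": 1, "host": "localhost:20001", "priority": 0, "votes": 0}]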

        repl_config = {"_id": self.replset_name, "protocolVersion": 1}
        client = self.nodes[0].mongo_client()

        if self.auth_options is not None:
            auth_db = client[self.auth_options["authenticationDatabase"]]
            auth_db.authenticate(self.auth_options["username"],
                                 password=self.auth_options["password"],
                                 mechanism=self.auth_options["authenticationMechanism"])

        if client.local.system.replset.count():
            # Skip initializing the replset if there is an existing configuration.
            return

        if self.write_concern_majority_journal_default is not None:
            repl_config[
                "writeConcernMajorityJournalDefault"] = self.write_concern_majority_journal_default
        else:
            server_status = client.admin.command({"serverStatus": 1})
            cmd_line_opts = client.admin.command({"getCmdLineOpts": 1})
            if not (server_status["storageEngine"]["persistent"] and cmd_line_opts["parsed"].get(
                    "storage", {}).get("journal", {}).get("enabled", True)):
                repl_config["writeConcernMajorityJournalDefault"] = False

        if self.replset_config_options.get("configsvr", False):
            repl_config["configsvr"] = True
        if self.replset_config_options.get("settings"):
            replset_settings = self.replset_config_options["settings"]
            repl_config["settings"] = replset_settings

        # If secondaries can vote but are not electable, and no election timeout was specified,
        # increase the election timeout to 24 hours to prevent spurious elections.
        if self.voting_secondaries and not self.all_nodes_electable:
            repl_config.setdefault("settings", {})
            if "electionTimeoutMillis" not in repl_config["settings"]:
                repl_config["settings"]["electionTimeoutMillis"] = 24 * 60 * 60 * 1000

        # Start up a single-node replica set, then reconfigure it to the correct size (if the
        # config contains more than one node), so the primary is elected more quickly.
        repl_config["members"] = [members[0]]
        self.logger.info("Issuing replSetInitiate command: %s", repl_config)
        self._configure_repl_set(client, {"replSetInitiate": repl_config})
        self._await_primary()

        if self.nodes[1:]:
            # Wait to connect to each of the secondaries before running the replSetReconfig
            # command.
            for node in self.nodes[1:]:
                node.await_ready()
            repl_config["version"] = 2
            repl_config["members"] = members
            self.logger.info("Issuing replSetReconfig command: %s", repl_config)
            self._configure_repl_set(client, {"replSetReconfig": repl_config})
            self._await_secondaries()

    def _configure_repl_set(self, client, cmd_obj):
        # replSetInitiate and replSetReconfig commands can fail with a NodeNotFound error
        # if a heartbeat times out during the quorum check. We retry three times to reduce
        # the chance of failing this way.
        num_initiate_attempts = 3
        for attempt in range(1, num_initiate_attempts + 1):
            try:
                client.admin.command(cmd_obj)
                break
            except pymongo.errors.OperationFailure as err:
                # Retry only on NodeNotFound errors from the replSetInitiate or
                # replSetReconfig command.
                if err.code != ReplicaSetFixture._NODE_NOT_FOUND:
                    msg = ("Operation failure while configuring the "
                           "replica set fixture: {}").format(err)
                    self.logger.error(msg)
                    raise errors.ServerFailure(msg)

                msg = "replSetInitiate failed attempt {0} of {1} with error: {2}".format(
                    attempt, num_initiate_attempts, err)
                self.logger.error(msg)
                if attempt == num_initiate_attempts:
                    msg = "Exceeded number of retries while configuring the replica set fixture"
                    self.logger.error(msg + ".")
                    raise errors.ServerFailure(msg)
                time.sleep(5)  # Wait a little bit before trying again.

    def await_ready(self):
        """Wait for replica set tpo be ready."""
        self._await_primary()
        self._await_secondaries()
        self._await_stable_checkpoint()

    def _await_primary(self):
        # Wait for the primary to be elected.
        # Since this method is called at startup we expect the first node to be primary even when
        # self.all_nodes_electable is True.
        primary = self.nodes[0]
        client = primary.mongo_client()
        while True:
            self.logger.info("Waiting for primary on port %d to be elected.", primary.port)
            is_master = client.admin.command("isMaster")["ismaster"]
            if is_master:
                break
            time.sleep(0.1)  # Wait a little bit before trying again.
        self.logger.info("Primary on port %d successfully elected.", primary.port)

    def _await_secondaries(self):
        # Wait for the secondaries to become available.
        # Since this method is called at startup we expect the nodes 1 to n to be secondaries even
        # when self.all_nodes_electable is True.
        secondaries = self.nodes[1:]
        if self.initial_sync_node:
            secondaries.append(self.initial_sync_node)

        for secondary in secondaries:
            client = secondary.mongo_client(read_preference=pymongo.ReadPreference.SECONDARY)
            while True:
                self.logger.info("Waiting for secondary on port %d to become available.",
                                 secondary.port)
                is_secondary = client.admin.command("isMaster")["secondary"]
                if is_secondary:
                    break
                time.sleep(0.1)  # Wait a little bit before trying again.
            self.logger.info("Secondary on port %d is now available.", secondary.port)

    def _await_stable_checkpoint(self):
        # Since this method is called at startup we expect the first node to be primary even when
        # self.all_nodes_electable is True.
        primary = self.nodes[0]
        primary_client = primary.mongo_client()
        if self.auth_options is not None:
            auth_db = primary_client[self.auth_options["authenticationDatabase"]]
            auth_db.authenticate(self.auth_options["username"],
                                 password=self.auth_options["password"],
                                 mechanism=self.auth_options["authenticationMechanism"])
        # Algorithm precondition: All nodes must be in primary/secondary state.
        #
        # 1) Perform a majority write. This will guarantee the primary updates its commit point
        #    to the value of this write.
        #
        # 2) Perform a second write. This will guarantee that all nodes will update their commit
        #    point to a time that is >= the previous write. That will trigger a stable checkpoint
        #    on all nodes.
        # TODO(SERVER-33248): Remove this block. We should not need to prod the replica set to
        # advance the commit point if the commit point being lagged is sufficient to choose a
        # sync source.
        admin = primary_client.get_database(
            "admin", write_concern=pymongo.write_concern.WriteConcern(w="majority"))
        admin.command("appendOplogNote", data={"await_stable_checkpoint": 1})
        admin.command("appendOplogNote", data={"await_stable_checkpoint": 2})

        for node in self.nodes:
            self.logger.info("Waiting for node on port %d to have a stable checkpoint.", node.port)
            client = node.mongo_client(read_preference=pymongo.ReadPreference.SECONDARY)
            client_admin = client["admin"]
            if self.auth_options is not None:
                client_auth_db = client[self.auth_options["authenticationDatabase"]]
                client_auth_db.authenticate(self.auth_options["username"],
                                            password=self.auth_options["password"],
                                            mechanism=self.auth_options["authenticationMechanism"])

            while True:
                status = client_admin.command("replSetGetStatus")
                # The `lastStableCheckpointTimestamp` field contains the timestamp of a previous
                # checkpoint taken at a stable timestamp. At startup recovery, this field
                # contains the timestamp reflected in the data. After startup recovery, it may
                # be lagged and there may be a stable checkpoint at a newer timestamp.
                last_stable = status.get("lastStableCheckpointTimestamp", None)

                # A missing `lastStableCheckpointTimestamp` field indicates that the storage
                # engine does not support "recover to a stable timestamp".
                if not last_stable:
                    break

                # A null `lastStableCheckpointTimestamp` indicates that the storage engine supports
                # "recover to a stable timestamp" but does not have a stable checkpoint yet.
                if last_stable.time:
                    self.logger.info("Node on port %d now has a stable checkpoint. Time: %s",
                                     node.port, last_stable)
                    break
                time.sleep(0.1)  # Wait a little bit before trying again.

    def _do_teardown(self):
        self.logger.info("Stopping all members of the replica set...")

        running_at_start = self.is_running()
        if not running_at_start:
            self.logger.info("All members of the replica set were expected to be running, "
                             "but weren't.")

        teardown_handler = interface.FixtureTeardownHandler(self.logger)

        if self.initial_sync_node:
            teardown_handler.teardown(self.initial_sync_node, "initial sync node")

        # Terminate the secondaries first to reduce noise in the logs.
        for node in reversed(self.nodes):
            teardown_handler.teardown(node, "replica set member on port %d" % node.port)

        if teardown_handler.was_successful():
            self.logger.info("Successfully stopped all members of the replica set.")
        else:
            self.logger.error("Stopping the replica set fixture failed.")
            raise errors.ServerFailure(teardown_handler.get_error_message())

    def is_running(self):
        """Return True if all nodes in the replica set are running."""
        running = all(node.is_running() for node in self.nodes)

        if self.initial_sync_node:
            running = self.initial_sync_node.is_running() and running

        return running

    def get_primary(self, timeout_secs=30):  # pylint: disable=arguments-differ
        """Return the primary from a replica set."""
        if not self.all_nodes_electable:
            # The primary is always the first element of the 'nodes' list because all other members
            # of the replica set are configured with priority=0.
            return self.nodes[0]

        start = time.time()
        clients = {}
        while True:
            for node in self.nodes:
                self._check_get_primary_timeout(start, timeout_secs)

                try:
                    client = clients.get(node.port)
                    if not client:
                        client = node.mongo_client()
                        clients[node.port] = client
                    is_master = client.admin.command("isMaster")["ismaster"]
                except pymongo.errors.AutoReconnect:
                    # AutoReconnect exceptions may occur if the primary stepped down since PyMongo
                    # last contacted it. We'll just try contacting the node again in the next round
                    # of isMaster requests.
                    continue

                if is_master:
                    self.logger.info("The node on port %d is primary of replica set '%s'",
                                     node.port, self.replset_name)
                    return node

    def _check_get_primary_timeout(self, start, timeout_secs):
        now = time.time()
        if (now - start) >= timeout_secs:
            msg = "Timed out while waiting for a primary for replica set '{}'.".format(
                self.replset_name)
            self.logger.error(msg)
            raise errors.ServerFailure(msg)

    def get_secondaries(self):
        """Return a list of secondaries from the replica set."""
        primary = self.get_primary()
        return [node for node in self.nodes if node.port != primary.port]

    def get_initial_sync_node(self):
        """Return initila sync node from the replica set."""
        return self.initial_sync_node

    def _new_mongod(self, index, replset_name):
        """Return a standalone.MongoDFixture configured to be used as replica-set member."""

        mongod_logger = self._get_logger_for_mongod(index)
        mongod_options = self.mongod_options.copy()
        mongod_options["replSet"] = replset_name
        mongod_options["dbpath"] = os.path.join(self._dbpath_prefix, "node{}".format(index))

        return standalone.MongoDFixture(
            mongod_logger, self.job_num, mongod_executable=self.mongod_executable,
            mongod_options=mongod_options, preserve_dbpath=self.preserve_dbpath)

    def _get_logger_for_mongod(self, index):
        """Return a new logging.Logger instance.

        The instance is used by the primary, secondary, or initial sync member of the replica set.
        """

        if index == self.initial_sync_node_idx:
            node_name = "initsync"
        elif self.all_nodes_electable:
            node_name = "node{}".format(index)
        elif index == 0:
            node_name = "primary"
        else:
            suffix = str(index - 1) if self.num_nodes > 2 else ""
            node_name = "secondary{}".format(suffix)

        return self.logger.new_fixture_node_logger(node_name)

    def get_internal_connection_string(self):
        """Return the internal connection string."""
        if self.replset_name is None:
            raise ValueError("Must call setup() before calling get_internal_connection_string()")

        conn_strs = [node.get_internal_connection_string() for node in self.nodes]
        if self.initial_sync_node:
            conn_strs.append(self.initial_sync_node.get_internal_connection_string())
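        # The result has the form "<replset_name>/host0:port0,host1:port1,..."
        # (hosts illustrative).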
        return self.replset_name + "/" + ",".join(conn_strs)

    def get_driver_connection_url(self):
        """Return the driver connection URL."""
        if self.replset_name is None:
            raise ValueError("Must call setup() before calling get_driver_connection_url()")

        if self.use_replica_set_connection_string:
            # We use a replica set connection string when all nodes are electable because we
            # anticipate the client will want to gracefully handle any failovers.
            conn_strs = [node.get_internal_connection_string() for node in self.nodes]
            if self.initial_sync_node:
                conn_strs.append(self.initial_sync_node.get_internal_connection_string())
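            # e.g. "mongodb://localhost:20000,localhost:20001/?replicaSet=rs"
            # (hosts and ports illustrative).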
            return "mongodb://" + ",".join(conn_strs) + "/?replicaSet=" + self.replset_name
        else:
            # We return a direct connection to the expected primary when only the first node is
            # electable because we want the client to error out if a stepdown occurs.
            return self.nodes[0].get_driver_connection_url()
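
# A minimal usage sketch (illustrative; the logger and job number normally come
# from resmoke's test runner, and teardown() is inherited from the fixture
# interface):
#
#   fixture = ReplicaSetFixture(logger, job_num=0, num_nodes=3,
#                               all_nodes_electable=True)
#   fixture.setup()
#   fixture.await_ready()
#   primary = fixture.get_primary()
#   # ... run tests against fixture.get_driver_connection_url() ...
#   fixture.teardown()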