summaryrefslogtreecommitdiff
path: root/jstests/libs/override_methods/sharding_continuous_config_stepdown.js
blob: b2f9b6d1c7974c7f5b4b9daf0e43d3b2088eb616 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
/**
 * Loading this file overrides the ReplSetTest and ShardingTest constructors so that every
 * ShardingTest spawns a thread which continuously steps down the primary of its config server
 * replica set.
 */

// Contains the declaration for ScopedThread and CountDownLatch
load('jstests/libs/parallelTester.js');
load("jstests/replsets/rslib.js");

(function() {
'use strict';

// Keep references to the original ReplSetTest and ShardingTest constructors. Both globals are
// reassigned below, and the replacement constructors delegate to these originals.
var originalReplSetTest = ReplSetTest;
var originalShardingTest = ShardingTest;

/**
 * Overrides the ReplSetTest constructor so that every instance can run a background thread which
 * continuously steps down the replica set's primary.
 *
 * On top of whatever the original constructor sets up, each instance gets:
 *   - startSet(options): forwards to the original startSet with options.verbose forced to 2.
 *   - stopSet(...): stops the stepdown thread (if running), then forwards to the original stopSet.
 *   - startContinuousFailover(): spawns the stepdown thread.
 *   - stopContinuousFailover(): tells the stepdown thread to stop and waits for it to finish.
 */
ReplSetTest = function ReplSetTestWithContinuousPrimaryStepdown() {
    // Construct the original object
    originalReplSetTest.apply(this, arguments);

    /**
     * This function is intended to be called in a separate thread and it continuously steps down
     * the current primary until it is told to stop.
     *
     * NOTE(review): it is handed to ScopedThread as a value and repeats the 'use strict'
     * directive, which suggests it is executed outside this closure. Keep it self-contained (no
     * references to variables of the enclosing constructor) unless that is confirmed otherwise.
     *
     * @param {string} seedNode The connection string of a node from which to discover the primary
     *                          of the replica set.
     * @param {CountDownLatch} stopCounter Object, which can be used to stop the thread. The loop
     *                          keeps running for as long as its count is greater than zero.
     *
     * @return Object with the following fields:
     *      ok {integer}: 0 if it failed, 1 if it succeeded.
     *      error {string}: Only present if ok == 0. Contains the cause for the error.
     *      stack {string}: Only present if ok == 0. Contains the stack at the time of the error.
     */
    function _continuousPrimaryStepdownFn(seedNode, stopCounter) {
        'use strict';

        // Value passed to the replSetStepDown command.
        var stepdownDelaySeconds = 10;

        // How long to let the test make progress between two consecutive stepdowns.
        var pauseBetweenStepdownsMS = 8000;

        print('*** Continuous stepdown thread running with seed node ' + seedNode);

        // The config primary may unexpectedly step down during startup if under heavy load and
        // too slowly processing heartbeats. When it steps down, it closes all of its connections.
        // This can happen during the call to new ReplSetTest, so in order to account for this and
        // make the tests stable, retry discovery of the replica set's configuration once
        // (SERVER-22794).
        var replSet;
        var networkErrorRetries = 1;
        while (networkErrorRetries >= 0) {
            try {
                replSet = new ReplSetTest(seedNode);
                break;
            } catch (e) {
                // Only a network error is retried, and only while retries remain. Anything else
                // ends the thread with a failure result.
                if (((networkErrorRetries--) > 0) &&
                    (e.toString().indexOf("network error") > -1)) {
                    print("Error: " + e.toString() + "\nStacktrace: " + e.stack);
                    print("Stepdown thread's config server connection was closed, retrying.");
                } else {
                    print('*** Continuous stepdown thread failed to connect to the ' +
                          'config server: ' + tojson(e));
                    return {ok: 0, error: e.toString(), stack: e.stack};
                }
            }
        }

        try {
            var primary = replSet.getPrimary();

            while (stopCounter.getCount() > 0) {
                print('*** Stepping down ' + primary);

                // The callback is expected to throw: either the command itself throws, or it
                // returns a failure response and assert.commandWorked throws on it. Only a
                // successful response lets the callback return normally, which assert.throws
                // reports as an error.
                assert.throws(function() {
                    var result = primary.adminCommand({
                        replSetStepDown: stepdownDelaySeconds,
                        force: true
                    });
                    print('replSetStepDown command did not throw and returned: ' + tojson(result));

                    // The call to replSetStepDown should never succeed
                    assert.commandWorked(result);
                });

                // Wait for primary to get elected and allow the test to make some progress before
                // attempting another stepdown. The stop request is re-checked before each blocking
                // step so that shutdown is not delayed unnecessarily.
                if (stopCounter.getCount() > 0) {
                    primary = replSet.getPrimary();
                }

                if (stopCounter.getCount() > 0) {
                    sleep(pauseBetweenStepdownsMS);
                }
            }

            print('*** Continuous stepdown thread completed successfully');
            return {ok: 1};
        } catch (e) {
            print('*** Continuous stepdown thread caught exception: ' + tojson(e));
            return {ok: 0, error: e.toString(), stack: e.stack};
        }
    }

    // Preserve the original startSet and stopSet methods, because both are overridden below.
    var _originalStartSetFn = this.startSet;
    var _originalStopSetFn = this.stopSet;

    // These two manage the scoped failover thread. Both are null/undefined while no thread runs.
    var _scopedPrimaryStepdownThread;
    var _scopedPrimaryStepdownThreadStopCounter;

    /**
     * Overrides the startSet call so we can increase the logging verbosity.
     *
     * @param {Object} [options] Options for the original startSet. 'verbose' is always set to 2,
     *                           on the passed-in object itself.
     *
     * @return Whatever the original startSet returns.
     */
    this.startSet = function(options) {
        if (!options) {
            options = {};
        }
        options.verbose = 2;
        return _originalStartSetFn.call(this, options);
    };

    /**
     * Overrides the stopSet call so it terminates the failover thread first.
     *
     * The original stopSet is invoked even if stopping the failover thread throws (for example
     * because the thread reported a failure), so the nodes are always shut down. The exception, if
     * any, still propagates to the caller afterwards.
     *
     * @return Whatever the original stopSet returns.
     */
    this.stopSet = function() {
        var stopSetResult;
        try {
            this.stopContinuousFailover();
        } finally {
            stopSetResult = _originalStopSetFn.apply(this, arguments);
        }
        return stopSetResult;
    };

    /**
     * Spawns a thread to invoke _continuousPrimaryStepdownFn, seeded with the host of the first
     * node of this set. See its comments for more information.
     *
     * @throws {Error} If a failover thread is already running for this set.
     */
    this.startContinuousFailover = function() {
        if (_scopedPrimaryStepdownThread) {
            throw new Error('Continuous failover thread is already active');
        }

        _scopedPrimaryStepdownThreadStopCounter = new CountDownLatch(1);
        _scopedPrimaryStepdownThread = new ScopedThread(_continuousPrimaryStepdownFn,
                                                        this.nodes[0].host,
                                                        _scopedPrimaryStepdownThreadStopCounter);
        _scopedPrimaryStepdownThread.start();
    };

    /**
     * Blocking method, which tells the thread running _continuousPrimaryStepdownFn to stop and
     * waits for it to terminate. Does nothing if no thread is running.
     *
     * @return The result object of the thread, after asserting that it reports success, or
     *         undefined if no thread was running.
     */
    this.stopContinuousFailover = function() {
        if (!_scopedPrimaryStepdownThread) {
            return;
        }

        // Take local references and reset the shared state up front. If joining the thread throws,
        // this object must not be left with a registered thread whose latch was already discarded,
        // because every later start/stop call would then fail.
        var stepdownThread = _scopedPrimaryStepdownThread;
        var stopCounter = _scopedPrimaryStepdownThreadStopCounter;
        _scopedPrimaryStepdownThread = null;
        _scopedPrimaryStepdownThreadStopCounter = null;

        stopCounter.countDown();
        stepdownThread.join();

        return assert.commandWorked(stepdownThread.returnData());
    };
};

// Carry the properties attached directly to the original constructor over to the replacement, so
// that code reaching them through the ReplSetTest global keeps working.
// NOTE(review): relies on the shell's Object.extend(dst, src) copying src's properties onto dst —
// confirm.
Object.extend(ReplSetTest, originalReplSetTest);

/**
 * Overrides the ShardingTest constructor to start the continuous config server stepdown thread.
 *
 * Before delegating to the original constructor, the first argument (the parameters object of the
 * test) is adjusted in place:
 *   - 'other' is created as an empty object if it is missing,
 *   - 'verbose' is forced to 2,
 *   - 'configReplSetTestOptions.settings.electionTimeoutMillis' is forced to 5000, while every
 *     other config replica set option and setting supplied by the caller is kept.
 *
 * @throws {Error} If the constructed cluster has no 'configRS' (config server replica set).
 */
ShardingTest = function ShardingTestWithContinuousConfigPrimaryStepdown() {
    var params = arguments[0];
    var electionTimeoutMillis = 5000;

    if (!params.other) {
        params.other = {};
    }
    params.verbose = 2;

    // Set electionTimeoutMillis to 5 seconds, from 10, so that chunk migrations don't
    // time out because of the CSRS primary being down so often for so long.
    //
    // Object.merge only combines top-level keys, so the nested 'settings' document is merged on
    // its own first. Otherwise any other settings supplied by the caller would be discarded.
    var callerConfigOptions = params.configReplSetTestOptions || {};
    params.configReplSetTestOptions = Object.merge(callerConfigOptions, {
        settings: Object.merge(callerConfigOptions.settings, {
            electionTimeoutMillis: electionTimeoutMillis,
        }),
    });

    // Construct the original object
    originalShardingTest.apply(this, arguments);

    if (!this.configRS) {
        throw new Error('Continuous config server step down only available with CSRS');
    }

    /**
     * This method is disabled because it runs aggregation, which doesn't handle config server
     * stepdown correctly.
     */
    this.printShardingStatus = function() {

    };

    // Verify that the shortened election timeout actually reached the config replica set.
    assert.eq(this.configRS.getReplSetConfigFromNode().settings.electionTimeoutMillis,
              electionTimeoutMillis,
              "Failed to set the electionTimeoutMillis to 5000 milliseconds");

    // Start the continuous config server stepdown thread
    this.configRS.startContinuousFailover();
};

// Carry the properties attached directly to the original constructor over to the replacement, so
// that code reaching them through the ShardingTest global keeps working.
// NOTE(review): relies on the shell's Object.extend(dst, src) copying src's properties onto dst —
// confirm.
Object.extend(ShardingTest, originalShardingTest);

})();