summaryrefslogtreecommitdiff
path: root/jstests/libs/override_methods/sharding_continuous_config_stepdown.js
blob: d0d2814fc90306ed2fc1c86b3f5fe711d67be515 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
/**
 * Loading this file overrides the ReplSetTest and ShardingTest constructors so that a background
 * thread can be spawned which continuously steps down the primary of a replica set.
 */

// Contains the declaration for ScopedThread and CountDownLatch
load('jstests/libs/parallelTester.js');
load("jstests/replsets/rslib.js");

/**
 * Executes the specified function and, if it throws an exception whose string form contains
 * "network error", retries the call. Once no retries are left, the exception from the last
 * attempt is thrown. Any exception that is not a network error is rethrown immediately.
 *
 * @param {function} func The call to execute. It is invoked with no arguments.
 * @param {number} [numRetries] Optional maximum number of retries after a network error.
 *      Defaults to 1 when omitted or not a number, i.e. the call is attempted at most twice.
 *
 * Returns the return value of the input call.
 */
function retryOnNetworkError(func, numRetries) {
    var networkErrorRetriesLeft = (typeof numRetries === 'number') ? numRetries : 1;

    while (true) {
        try {
            return func();
        } catch (e) {
            // A thrown null/undefined has no toString(). Stringify it safely so that the original
            // value is rethrown below instead of being masked by a TypeError raised in here.
            var errorString = (e === null || e === undefined) ? String(e) : e.toString();
            var isNetworkError = errorString.indexOf("network error") > -1;

            // Written as !(x > 0) so that a NaN retry count can never cause an endless loop.
            if (!isNetworkError || !(networkErrorRetriesLeft > 0)) {
                throw e;
            }

            print("Network error occurred and the call will be retried: " +
                  tojson({error: errorString, stack: e.stack}));
            networkErrorRetriesLeft--;
        }
    }
}

(function() {
    'use strict';

    // Preserve the original ReplSetTest and ShardingTest constructors, because we are overriding
    // them
    var originalReplSetTest = ReplSetTest;
    var originalShardingTest = ShardingTest;

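    /**
     * Overrides the ReplSetTest constructor so that every instance gains the
     * startContinuousFailover() and stopContinuousFailover() methods, which manage a thread that
     * continuously steps down the primary.
     */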
    ReplSetTest = function ReplSetTestWithContinuousPrimaryStepdown() {
        // Construct the original object
        originalReplSetTest.apply(this, arguments);

        /**
         * Thread body which keeps forcing the current primary to step down until it is told to
         * stop. It is meant to be run on a separate thread, so it re-loads this file itself.
         *
         * @param {string} seedNode The connection string of a node from which to discover the
         *      primary of the replica set.
         * @param {CountDownLatch} stopCounter The loop keeps running for as long as the count of
         *      this object is greater than zero.
         *
         * @return Object with the following fields:
         *      ok {integer}: 1 if the loop ran to completion, 0 if an exception was caught.
         *      error {string}: Only present if ok == 0. String form of the caught exception.
         *      stack {string}: Only present if ok == 0. The 'stack' property of the exception.
         */
        function _continuousPrimaryStepdownFn(seedNode, stopCounter) {
            'use strict';

            load('jstests/libs/override_methods/sharding_continuous_config_stepdown.js');

            var stepdownDelaySeconds = 10;

            /**
             * Sends a forced replSetStepDown to 'node' and asserts that the attempt throws. If the
             * command returns instead of throwing, the response is printed and handed to
             * assert.commandWorked, so presumably only a successful response fails the assertion.
             */
            function forceStepDown(node) {
                assert.throws(function() {
                    var response =
                        node.adminCommand({replSetStepDown: stepdownDelaySeconds, force: true});
                    print('replSetStepDown command did not throw and returned: ' +
                          tojson(response));

                    assert.commandWorked(response);
                });
            }

            print('*** Continuous stepdown thread running with seed node ' + seedNode);

            try {
                // Under heavy load the config primary may step down during startup and close all
                // of its connections, possibly while the ReplSetTest below is being constructed.
                // Discovery of the replica set's configuration is therefore retried once
                // (SERVER-22794).
                var replSet = retryOnNetworkError(function() {
                    return new ReplSetTest(seedNode);
                });

                var currentPrimary = replSet.getPrimary();

                while (stopCounter.getCount() > 0) {
                    print('*** Stepping down ' + currentPrimary);

                    forceStepDown(currentPrimary);

                    // Unless asked to stop, wait for a new primary to get elected ...
                    if (stopCounter.getCount() > 0) {
                        currentPrimary = replSet.getPrimary();
                    }

                    // ... and give the test some time to make progress before the next stepdown.
                    if (stopCounter.getCount() > 0) {
                        sleep(8000);
                    }
                }

                print('*** Continuous stepdown thread completed successfully');
                return {ok: 1};
            } catch (err) {
                print('*** Continuous stepdown thread caught exception: ' + tojson(err));
                return {ok: 0, error: err.toString(), stack: err.stack};
            }
        }

        // Preserve the original startSet and stopSet methods, because we are overriding them:
        // startSet to raise the logging verbosity and stopSet to stop the continuous stepdown
        // thread.
        var _originalStartSetFn = this.startSet;
        var _originalStopSetFn = this.stopSet;

        // We override this method to retry on network errors
        var _originalAwaitLastOpCommitted = this.awaitLastOpCommitted;

        // These two manage the scoped failover thread
        var _scopedPrimaryStepdownThread;
        var _scopedPrimaryStepdownThreadStopCounter;

        /**
         * Replacement for startSet which forces 'verbose' to 2 on the options (creating the
         * options object if none was passed) and then delegates to the original startSet.
         *
         * Returns whatever the original startSet returns.
         */
        this.startSet = function(options) {
            var startOptions = options || {};
            startOptions.verbose = 2;
            return _originalStartSetFn.call(this, startOptions);
        };

        /**
         * Overrides the stopSet call so it terminates the failover thread before the set itself is
         * stopped. All arguments are forwarded to the original stopSet.
         *
         * Returns whatever the original stopSet returns, so that the override stays transparent
         * to callers (consistent with the startSet override).
         */
        this.stopSet = function() {
            this.stopContinuousFailover();
            return _originalStopSetFn.apply(this, arguments);
        };

        /**
         * Overrides awaitLastOpCommitted to retry on network errors. Any arguments supplied by the
         * caller are forwarded to the original method on every attempt.
         *
         * Returns whatever the original awaitLastOpCommitted returns.
         */
        this.awaitLastOpCommitted = function() {
            var self = this;
            var callerArgs = arguments;

            return retryOnNetworkError(function() {
                return _originalAwaitLastOpCommitted.apply(self, callerArgs);
            });
        };

        /**
         * Starts a ScopedThread running _continuousPrimaryStepdownFn, seeded with the host of the
         * first node of this set and a fresh CountDownLatch(1) used later to stop it. See the
         * comments of that function for more information.
         *
         * Throws if a continuous failover thread is already active.
         */
        this.startContinuousFailover = function() {
            if (_scopedPrimaryStepdownThread) {
                throw new Error('Continuous failover thread is already active');
            }

            _scopedPrimaryStepdownThreadStopCounter = new CountDownLatch(1);

            var seedHost = this.nodes[0].host;
            var stepdownThread = new ScopedThread(
                _continuousPrimaryStepdownFn, seedHost, _scopedPrimaryStepdownThreadStopCounter);

            _scopedPrimaryStepdownThread = stepdownThread;
            stepdownThread.start();
        };

        /**
         * Blocking method, which signals the thread running _continuousPrimaryStepdownFn to stop
         * (by counting down its latch) and joins it. Does nothing if no thread is active.
         *
         * Returns the thread's return data after passing it through assert.commandWorked.
         */
        this.stopContinuousFailover = function() {
            var stepdownThread = _scopedPrimaryStepdownThread;
            if (!stepdownThread) {
                return;
            }

            // Signal the thread first, then wait for it to notice and exit.
            _scopedPrimaryStepdownThreadStopCounter.countDown();
            _scopedPrimaryStepdownThreadStopCounter = null;

            stepdownThread.join();

            var threadResult = stepdownThread.returnData();
            _scopedPrimaryStepdownThread = null;

            return assert.commandWorked(threadResult);
        };
    };

    Object.extend(ReplSetTest, originalReplSetTest);

    /**
     * Replacement ShardingTest constructor. It adjusts the first constructor argument (verbosity
     * and config replica set election timeout), builds the original ShardingTest and then starts
     * the continuous stepdown thread on the config server replica set.
     *
     * Throws if the constructed cluster has no 'configRS'.
     */
    ShardingTest = function ShardingTestWithContinuousConfigPrimaryStepdown() {
        // The first argument is modified in place before it reaches the original constructor.
        var params = arguments[0];
        if (!params.other) {
            params.other = {};
        }
        params.verbose = 2;

        // Set electionTimeoutMillis to 5 seconds, from 10, so that chunk migrations don't
        // time out because of the CSRS primary being down so often for so long.
        var electionTimeoutOverride = {
            settings: {
                electionTimeoutMillis: 5000,
            },
        };
        params.configReplSetTestOptions =
            Object.merge(params.configReplSetTestOptions, electionTimeoutOverride);

        // Construct the original object
        originalShardingTest.apply(this, arguments);

        if (!this.configRS) {
            throw new Error('Continuous config server step down only available with CSRS');
        }

        /**
         * Deliberately a no-op: the real method runs aggregation, which doesn't handle config
         * server stepdown correctly.
         */
        this.printShardingStatus = function() {};

        // Verify that the election timeout override actually took effect.
        var configSettings = this.configRS.getReplSetConfigFromNode().settings;
        assert.eq(configSettings.electionTimeoutMillis,
                  5000,
                  "Failed to set the electionTimeoutMillis to 5000 milliseconds");

        // Start the continuous config server stepdown thread
        this.configRS.startContinuousFailover();
    };

    Object.extend(ShardingTest, originalShardingTest);

})();