// path: root/jstests/hooks/run_dbcheck_background.js
// blob: bb4a67d85af8f2ec1884e8e350a85f6b69b062e6 (plain)
/**
 * Runs dbCheck in background.
 */
'use strict';

(function() {
load('jstests/libs/discover_topology.js');  // For Topology and DiscoverTopology.
load('jstests/libs/parallelTester.js');     // For Thread.

// This hook reaches the server through the shell's global 'db' object, so fail fast when the
// shell was started without a connection.
if (typeof db === 'undefined') {
    throw new Error(
        "Expected mongo shell to be connected to a server, but global 'db' object isn't defined");
}

// Reuse the existing TestData object when there is one; otherwise start from an empty object.
TestData = TestData || {};

// Disable implicit sessions so FSM workloads that kill random sessions won't interrupt the
// operations in this test that aren't resilient to interruptions.
TestData.disableImplicitSessions = true;

// Discover the topology of the deployment behind the shell's connection. The result decides
// further down whether one replica set or every replica set of a sharded cluster is checked.
const conn = db.getMongo();
const topology = DiscoverTopology.findConnectedNodes(conn);

/**
 * Runs dbCheck on every replicated database of the replica set reachable through 'hosts' and
 * then scans the health log of each data-bearing node for errors.
 *
 * Exceptions carrying the NamespaceNotFound, LockTimeout or Interrupted codes are handed to
 * assert.dropExceptionsWithCode() together with a handler that logs them as transient and
 * reports success.
 *
 * NOTE(review): this function is also passed to the Thread constructor for sharded clusters, so
 * its helpers are presumably nested on purpose to keep it self-contained -- confirm before
 * hoisting anything out of it.
 *
 * @param {Array} hosts - members of the replica set; only hosts[0] is used to construct the
 *     ReplSetTest instance.
 * @returns {Object} {ok: 1} when the check passed, was skipped because the FCV differs from
 *     latestFCV, or was dropped by the transient-error handler.
 */
const exceptionFilteredBackgroundDbCheck = function(hosts) {
    const runBackgroundDbCheck = function(hosts) {
        // Runs 'func' while the global print() is replaced by a no-op and restores the original
        // print() afterwards, even when 'func' throws.
        const quietly = (func) => {
            const printOriginal = print;
            try {
                print = Function.prototype;
                func();
            } finally {
                print = printOriginal;
            }
        };

        // Returns true once currentOp on 'db' no longer lists an operation described as "dbCheck".
        const dbCheckCompleted = (db) =>
            !db.currentOp({$all: true}).inprog.some(op => op["desc"] === "dbCheck");

        let rst;
        // We construct the ReplSetTest instance with the print() function overridden to be a no-op
        // in order to suppress the log messages about the replica set configuration. The
        // run_dbcheck_background.js hook is executed frequently by resmoke.py and would
        // otherwise lead to generating an overwhelming amount of log messages.
        quietly(() => {
            rst = new ReplSetTest(hosts[0]);
        });

        const primary = rst.getPrimary();

        // dbCheck is only run when the set is on the latest feature compatibility version.
        const version = assert
                            .commandWorked(primary.adminCommand(
                                {getParameter: 1, featureCompatibilityVersion: 1}))
                            .featureCompatibilityVersion.version;
        if (version !== latestFCV) {
            print("Not running dbCheck in FCV " + version);
            return {ok: 1};
        }

        print("Running dbCheck for: " + rst.getURL());

        const adminDb = primary.getDB('admin');
        const listRes =
            assert.commandWorked(adminDb.runCommand({listDatabases: 1, nameOnly: true}));
        const dbNames = new Set(listRes.databases.map(dbInfo => dbInfo.name));

        // Transactions cannot be run on the following databases so we don't attempt to read at a
        // clusterTime on them either. (The "local" database is also not replicated.)
        // The config.transactions collection is different between primaries and secondaries.
        dbNames.delete('config');
        dbNames.delete('local');

        dbNames.forEach((dbName) => {
            assert.commandWorked(primary.getDB(dbName).runCommand({dbCheck: 1}));

            // Only report the database as done after currentOp stops listing a dbCheck operation.
            assert.soon(() => dbCheckCompleted(adminDb),
                        "timed out waiting for dbCheck to finish on database: " + dbName);
            jsTestLog("dbCheck done on database " + dbName);
        });

        // Wait for all secondaries to finish applying all dbcheck batches.
        rst.awaitReplication();

        // Arbiters are left out of the health log scan.
        const nodes = [
            rst.getPrimary(),
            ...rst.getSecondaries().filter(
                secondary => !secondary.adminCommand({isMaster: 1}).arbiterOnly)
        ];

        // Matches every string that does not start with "SnapshotTooOld". Only the start of the
        // string is inspected, so error messages spanning several lines are matched as well.
        const regexStringWithoutSnapTooOld = /^(?!SnapshotTooOld)/;

        nodes.forEach((node) => {
            // Assert no errors (i.e., found inconsistencies). Allow warnings. Tolerate
            // SnapshotTooOld errors, as they can occur if the primary is slow enough processing a
            // batch that the secondary is unable to obtain the timestamp the primary used.
            const healthlog = node.getDB('local').system.healthlog;

            // healthlog is a capped collection, truncation during scan might cause cursor
            // invalidation. Truncated data is most likely from previous tests in the fixture, so we
            // should still be able to catch errors by retrying.
            assert.soon(() => {
                try {
                    const errs = healthlog.find(
                        {"severity": "error", "data.error": regexStringWithoutSnapTooOld});
                    if (errs.hasNext()) {
                        const err = "dbCheck found inconsistency on " + node.host;
                        jsTestLog(err + ". Errors: ");
                        // Log at most 20 entries to keep the output bounded.
                        for (let count = 0; errs.hasNext() && count < 20; count++) {
                            jsTestLog(tojson(errs.next()));
                        }
                        assert(false, err);
                    }
                    return true;
                } catch (e) {
                    // Anything other than a lost capped-collection position is a real failure.
                    if (e.code !== ErrorCodes.CappedPositionLost) {
                        throw e;
                    }
                    jsTestLog(`Retrying on CappedPositionLost error: ${tojson(e)}`);
                    return false;
                }
            }, "healthlog scan could not complete.", 60000);

            jsTestLog("Checked health log on " + node.host);
        });

        return {ok: 1};
    };

    // Handler given to assert.dropExceptionsWithCode(): logs the error and reports success.
    const onDrop = function(e) {
        jsTestLog("Skipping dbCheck due to transient error: " + tojson(e));
        return {ok: 1};
    };

    return assert.dropExceptionsWithCode(() => {
        return runBackgroundDbCheck(hosts);
    }, [ErrorCodes.NamespaceNotFound, ErrorCodes.LockTimeout, ErrorCodes.Interrupted], onDrop);
};

// Dispatch on the discovered topology: a replica set is checked inline, while a sharded cluster
// gets one background thread per replica set (the config server and every shard).
if (topology.type === Topology.kReplicaSet) {
    const outcome = exceptionFilteredBackgroundDbCheck(topology.nodes);
    assert.commandWorked(outcome,
                         () => 'dbCheck replication consistency check failed: ' + tojson(outcome));
} else if (topology.type === Topology.kShardedCluster) {
    const workers = [];

    // Creates the worker thread for one replica set, records it and starts it.
    const launchCheck = (nodes) => {
        const worker = new Thread(exceptionFilteredBackgroundDbCheck, nodes);
        workers.push(worker);
        worker.start();
    };

    try {
        if (topology.configsvr.type === Topology.kReplicaSet) {
            launchCheck(topology.configsvr.nodes);
        }

        Object.keys(topology.shards).forEach((shardName) => {
            const shard = topology.shards[shardName];
            if (shard.type !== Topology.kReplicaSet) {
                throw new Error('Unrecognized topology format: ' + tojson(topology));
            }
            launchCheck(shard.nodes);
        });
    } finally {
        // Join every thread that was started, keeping only the first failure so it can be
        // rethrown once all of them have been joined.
        let firstFailure;
        const outcomes = [];
        for (const worker of workers) {
            try {
                worker.join();
                outcomes.push(worker.returnData());
            } catch (e) {
                if (!firstFailure) {
                    firstFailure = e;
                }
            }
        }
        if (firstFailure) {
            throw firstFailure;
        }

        // No thread threw; now verify what each of them returned.
        for (const outcome of outcomes) {
            assert.commandWorked(
                outcome, () => 'dbCheck replication consistency check failed: ' + tojson(outcome));
        }
    }
} else {
    throw new Error('Unsupported topology configuration: ' + tojson(topology));
}
})();