From e5b712f24e019326516ad4cb8b1f865825e46997 Mon Sep 17 00:00:00 2001 From: Yu Jin Kang Park Date: Tue, 22 Mar 2022 09:54:36 +0000 Subject: SERVER-64244 Drop exceptions due to transient errors for whole runBackgroundDbCheck function --- jstests/hooks/run_dbcheck_background.js | 185 ++++++++++++++++---------------- src/mongo/shell/assert.js | 22 ++++ 2 files changed, 115 insertions(+), 92 deletions(-) diff --git a/jstests/hooks/run_dbcheck_background.js b/jstests/hooks/run_dbcheck_background.js index 660fe3321e6..383644bbf95 100644 --- a/jstests/hooks/run_dbcheck_background.js +++ b/jstests/hooks/run_dbcheck_background.js @@ -17,115 +17,116 @@ TestData = TestData || {}; const conn = db.getMongo(); const topology = DiscoverTopology.findConnectedNodes(conn); -function runBackgroundDbCheck(hosts) { - const quietly = (func) => { - const printOriginal = print; - try { - print = Function.prototype; - func(); - } finally { - print = printOriginal; - } - }; +const exceptionFilteredBackgroundDbCheck = function(hosts) { + const runBackgroundDbCheck = function(hosts) { + const quietly = (func) => { + const printOriginal = print; + try { + print = Function.prototype; + func(); + } finally { + print = printOriginal; + } + }; - let rst; - // We construct the ReplSetTest instance with the print() function overridden to be a no-op - // in order to suppress the log messages about the replica set configuration. The - // run_dbcheck_background.js hook is executed frequently by resmoke.py and would - // otherwise lead to generating an overwhelming amount of log messages. - quietly(() => { - rst = new ReplSetTest(hosts[0]); - }); - - const dbNames = new Set(); - const primary = rst.getPrimary(); - - const version = - assert - .commandWorked(primary.adminCommand({getParameter: 1, featureCompatibilityVersion: 1})) - .featureCompatibilityVersion.version; - if (version != latestFCV) { - print("Not running dbCheck in FCV " + version); - return {ok: 1}; - } + let rst; + // We construct the ReplSetTest instance with the print() function overridden to be a no-op + // in order to suppress the log messages about the replica set configuration. The + // run_dbcheck_background.js hook is executed frequently by resmoke.py and would + // otherwise lead to generating an overwhelming amount of log messages. + quietly(() => { + rst = new ReplSetTest(hosts[0]); + }); - print("Running dbCheck for: " + rst.getURL()); + const dbNames = new Set(); + const primary = rst.getPrimary(); - const adminDb = primary.getDB('admin'); - let res = assert.commandWorked(adminDb.runCommand({listDatabases: 1, nameOnly: true})); - for (let dbInfo of res.databases) { - dbNames.add(dbInfo.name); - } + const version = assert + .commandWorked(primary.adminCommand( + {getParameter: 1, featureCompatibilityVersion: 1})) + .featureCompatibilityVersion.version; + if (version != latestFCV) { + print("Not running dbCheck in FCV " + version); + return {ok: 1}; + } - // Transactions cannot be run on the following databases so we don't attempt to read at a - // clusterTime on them either. (The "local" database is also not replicated.) - // The config.transactions collection is different between primaries and secondaries. - dbNames.delete('config'); - dbNames.delete('local'); + print("Running dbCheck for: " + rst.getURL()); + + const adminDb = primary.getDB('admin'); + let res = assert.commandWorked(adminDb.runCommand({listDatabases: 1, nameOnly: true})); + for (let dbInfo of res.databases) { + dbNames.add(dbInfo.name); + } - dbNames.forEach((dbName) => { - try { + // Transactions cannot be run on the following databases so we don't attempt to read at a + // clusterTime on them either. (The "local" database is also not replicated.) + // The config.transactions collection is different between primaries and secondaries. + dbNames.delete('config'); + dbNames.delete('local'); + + dbNames.forEach((dbName) => { assert.commandWorked(primary.getDB(dbName).runCommand({dbCheck: 1})); jsTestLog("dbCheck done on database " + dbName); - } catch (e) { - if (e.code === ErrorCodes.NamespaceNotFound || e.code === ErrorCodes.LockTimeout || - e.code == ErrorCodes.Interrupted) { - jsTestLog("Skipping dbCheck on database " + dbName + - " due to transient error: " + tojson(e)); - return; - } else { - throw e; - } - } - const dbCheckCompleted = (db) => { - return db.currentOp().inprog.filter(x => x["desc"] == "dbCheck")[0] === undefined; - }; + const dbCheckCompleted = (db) => { + return db.currentOp({$all: true}).inprog.filter(x => x["desc"] === "dbCheck")[0] === + undefined; + }; + + assert.soon(() => dbCheckCompleted(adminDb), + "timed out waiting for dbCheck to finish on database: " + dbName); + }); - assert.soon(() => dbCheckCompleted(adminDb), - "timed out waiting for dbCheck to finish on database: " + dbName); - }); - - // Wait for all secondaries to finish applying all dbcheck batches. - rst.awaitReplication(); - - const nodes = [ - rst.getPrimary(), - ...rst.getSecondaries().filter(conn => { - return !conn.adminCommand({isMaster: 1}).arbiterOnly; - }) - ]; - nodes.forEach((node) => { - // Assert no errors (i.e., found inconsistencies). Allow warnings. Tolerate SnapshotTooOld - // errors, as they can occur if the primary is slow enough processing a batch that the - // secondary is unable to obtain the timestamp the primary used. - const healthlog = node.getDB('local').system.healthlog; - // Regex matching strings that start without "SnapshotTooOld" - const regexStringWithoutSnapTooOld = /^((?!^SnapshotTooOld).)*$/; - let errs = - healthlog.find({"severity": "error", "data.error": regexStringWithoutSnapTooOld}); - if (errs.hasNext()) { - const err = "dbCheck found inconsistency on " + node.host; - jsTestLog(err + ". Errors: "); - for (let count = 0; errs.hasNext() && count < 20; count++) { - jsTestLog(tojson(errs.next())); + // Wait for all secondaries to finish applying all dbcheck batches. + rst.awaitReplication(); + + const nodes = [ + rst.getPrimary(), + ...rst.getSecondaries().filter(conn => { + return !conn.adminCommand({isMaster: 1}).arbiterOnly; + }) + ]; + nodes.forEach((node) => { + // Assert no errors (i.e., found inconsistencies). Allow warnings. Tolerate + // SnapshotTooOld errors, as they can occur if the primary is slow enough processing a + // batch that the secondary is unable to obtain the timestamp the primary used. + const healthlog = node.getDB('local').system.healthlog; + // Regex matching strings that start without "SnapshotTooOld" + const regexStringWithoutSnapTooOld = /^((?!^SnapshotTooOld).)*$/; + let errs = + healthlog.find({"severity": "error", "data.error": regexStringWithoutSnapTooOld}); + if (errs.hasNext()) { + const err = "dbCheck found inconsistency on " + node.host; + jsTestLog(err + ". Errors: "); + for (let count = 0; errs.hasNext() && count < 20; count++) { + jsTestLog(tojson(errs.next())); + } + assert(false, err); } - assert(false, err); - } - jsTestLog("Checked health log on " + node.host); - }); + jsTestLog("Checked health log on " + node.host); + }); - return {ok: 1}; -} + return {ok: 1}; + }; + + const onDrop = function(e) { + jsTestLog("Skipping dbCheck due to transient error: " + tojson(e)); + return {ok: 1}; + }; + + return assert.dropExceptionsWithCode(() => { + return runBackgroundDbCheck(hosts); + }, [ErrorCodes.NamespaceNotFound, ErrorCodes.LockTimeout, ErrorCodes.Interrupted], onDrop); +}; if (topology.type === Topology.kReplicaSet) { - let res = runBackgroundDbCheck(topology.nodes); + let res = exceptionFilteredBackgroundDbCheck(topology.nodes); assert.commandWorked(res, () => 'dbCheck replication consistency check failed: ' + tojson(res)); } else if (topology.type === Topology.kShardedCluster) { const threads = []; try { if (topology.configsvr.type === Topology.kReplicaSet) { - const thread = new Thread(runBackgroundDbCheck, topology.configsvr.nodes); + const thread = new Thread(exceptionFilteredBackgroundDbCheck, topology.configsvr.nodes); threads.push(thread); thread.start(); } @@ -133,7 +134,7 @@ if (topology.type === Topology.kReplicaSet) { for (let shardName of Object.keys(topology.shards)) { const shard = topology.shards[shardName]; if (shard.type === Topology.kReplicaSet) { - const thread = new Thread(runBackgroundDbCheck, shard.nodes); + const thread = new Thread(exceptionFilteredBackgroundDbCheck, shard.nodes); threads.push(thread); thread.start(); } else { diff --git a/src/mongo/shell/assert.js b/src/mongo/shell/assert.js index 1d15cdc774d..44bada6e7bf 100644 --- a/src/mongo/shell/assert.js +++ b/src/mongo/shell/assert.js @@ -612,6 +612,28 @@ assert = (function() { return res; }; + assert.dropExceptionsWithCode = function(func, dropCodes, onDrop) { + if (typeof func !== "function") { + doassert('assert.dropExceptionsWithCode 1st argument must be a function'); + } + if (typeof onDrop !== "function") { + doassert('assert.dropExceptionsWithCode 3rd argument must be a function'); + } + if (!Array.isArray(dropCodes)) { + dropCodes = [dropCodes]; + } + + try { + return func(); + } catch (e) { + if (dropCodes.some((ec) => e.code === ec)) { + return onDrop(e); + } else { + throw e; + } + } + }; + assert.throws.automsg = function(func, params) { if (arguments.length === 1) params = []; -- cgit v1.2.1