summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVesselina Ratcheva <vesselina.ratcheva@10gen.com>2020-02-27 03:03:47 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2020-02-27 09:19:44 +0000
commitb4f0cfe1d7fc292728f27edf7b3eee90ba5fd5b4 (patch)
tree3ae20abe90c6c81c102a99de2f81bef6a844d367
parentfcd2dd41189fffc6e67a8645b99974178f87ca04 (diff)
downloadmongo-b4f0cfe1d7fc292728f27edf7b3eee90ba5fd5b4.tar.gz
SERVER-43867 Work around unrecoverability of rollback via refetch in tests SERVER-43867 Move check_log inside src/mongo/shell/
(cherry picked from commit 7c42573b96ecfa43fbf76aa01cd659ee00fc6103) (cherry picked from commit 9e69da32cea83fc45724213215a8de40d2635f45)
-rw-r--r--jstests/replsets/libs/rollback_test.js31
-rw-r--r--jstests/replsets/unrecoverable_rollback_early_exit.js66
-rw-r--r--src/mongo/scripting/engine.cpp2
-rw-r--r--src/mongo/shell/SConscript1
-rw-r--r--src/mongo/shell/check_log.js120
-rw-r--r--src/mongo/shell/replsettest.js45
6 files changed, 264 insertions, 1 deletions
diff --git a/jstests/replsets/libs/rollback_test.js b/jstests/replsets/libs/rollback_test.js
index ce026c6a1f5..ed20b18bb3b 100644
--- a/jstests/replsets/libs/rollback_test.js
+++ b/jstests/replsets/libs/rollback_test.js
@@ -61,6 +61,8 @@ function RollbackTest(name = "RollbackTest", replSet) {
const SIGTERM = 15;
const kNumDataBearingNodes = 2;
+ let awaitSecondaryNodesForRollbackTimeout;
+
let rst;
let curPrimary;
let curSecondary;
@@ -213,7 +215,25 @@ function RollbackTest(name = "RollbackTest", replSet) {
`may prevent a rollback here.`);
}
- rst.awaitSecondaryNodes();
+ // If the rollback node has {enableMajorityReadConcern:false} set, it will use the
+ // rollbackViaRefetch algorithm. That can lead to unrecoverable rollbacks, particularly
+ // in unclean shutdown suites, as it it is possible in rare cases for the sync source to
+ // lose the entry corresponding to the optime the rollback node chose as its minValid.
+ try {
+ rst.awaitSecondaryNodesForRollbackTest(
+ awaitSecondaryNodesForRollbackTimeout,
+ curSecondary /* connToCheckForUnrecoverableRollback */);
+ } catch (e) {
+ if (e.unrecoverableRollbackDetected) {
+ log(`Detected unrecoverable rollback on ${curSecondary.host}. Ending test.`,
+ true /* important */);
+ TestData.skipCheckDBHashes = true;
+ rst.stopSet();
+ quit();
+ }
+ // Re-throw the original exception in all other cases.
+ throw e;
+ }
rst.awaitReplication();
log(`Rollback on ${curSecondary.host} (if needed) and awaitReplication completed`, true);
@@ -391,4 +411,13 @@ function RollbackTest(name = "RollbackTest", replSet) {
this.getTestFixture = function() {
return rst;
};
+
+ /**
+ * Use this to control the timeout being used in the awaitSecondaryNodesForRollbackTest call
+ * in transitionToSteadyStateOperations.
+ * For use only in tests that expect unrecoverable rollbacks.
+ */
+ this.setAwaitSecondaryNodesForRollbackTimeout = function(timeoutMillis) {
+ awaitSecondaryNodesForRollbackTimeout = timeoutMillis;
+ };
}
diff --git a/jstests/replsets/unrecoverable_rollback_early_exit.js b/jstests/replsets/unrecoverable_rollback_early_exit.js
new file mode 100644
index 00000000000..d729b34dd0c
--- /dev/null
+++ b/jstests/replsets/unrecoverable_rollback_early_exit.js
@@ -0,0 +1,66 @@
+/**
+ * This test exercises an unrecoverable rollback using rollback_test.js, expecting it to terminate
+ * cleanly early instead of failing.
+ * An unrecoverable rollback can happen with EMRC:false, as it is possible for rollback via refetch
+ * to set a minValid based on oplog entries that the sync source may have failed to recover after
+ * an unclean shutdown. The rollback node will need to sync and apply oplog entries up to minValid
+ * to be consistent, but if those oplog entries no longer exist, then it will be stuck in sync
+ * source selection and unable to complete recovery.
+ * This test reproduces this scenario in a simpler form by modifying the minValid on the rollback
+ * node very far forward, so that we do not have to simulate anything happening to the sync source.
+ */
+
+(function() {
+ "use strict";
+
+ load("jstests/replsets/libs/rollback_test.js");
+
+ function tsToDate(ts) {
+ return new Date(ts.getTime() * 1000);
+ }
+
+ const testName = jsTestName();
+
+ const rst = new ReplSetTest({
+ name: testName,
+ nodes: [{}, {}, {rsConfig: {arbiterOnly: true}}],
+ useBridge: true,
+ settings: {chainingAllowed: false},
+ nodeOptions: {enableMajorityReadConcern: "false"}
+ });
+ rst.startSet();
+ rst.initiate();
+
+ const rollbackTest = new RollbackTest(testName, rst);
+ const rollbackNode = rollbackTest.transitionToRollbackOperations();
+
+ assert.commandWorked(rollbackNode.adminCommand(
+ {configureFailPoint: "rollbackHangBeforeFinish", mode: "alwaysOn"}));
+
+ rollbackTest.transitionToSyncSourceOperationsBeforeRollback();
+ rollbackTest.transitionToSyncSourceOperationsDuringRollback();
+
+ checkLog.contains(rollbackNode, "rollbackHangBeforeFinish fail point enabled.");
+
+ const farFutureTS = new Timestamp(
+ Math.floor(new Date().getTime() / 1000) + (60 * 60 * 24 * 5 /* in five days*/), 0);
+
+ jsTest.log("future TS: " + tojson(farFutureTS) + ", date:" + tsToDate(farFutureTS));
+
+ const mMinvalid = rollbackNode.getDB("local").getCollection("replset.minvalid");
+
+ const minValidUpdate = {$set: {ts: farFutureTS}};
+ jsTestLog("Current minValid is " + tojson(mMinvalid.findOne()));
+ jsTestLog("Updating minValid to: " + tojson(minValidUpdate));
+ printjson(assert.commandWorked(mMinvalid.update({}, minValidUpdate)));
+
+ assert.commandWorked(
+ rollbackNode.adminCommand({configureFailPoint: "rollbackHangBeforeFinish", mode: "off"}));
+
+ rollbackTest.setAwaitSecondaryNodesForRollbackTimeout(5 * 1000);
+
+ // We will detect an unrecoverable rollback here.
+ rollbackTest.transitionToSteadyStateOperations();
+
+ rollbackTest.stop();
+})(); \ No newline at end of file
diff --git a/src/mongo/scripting/engine.cpp b/src/mongo/scripting/engine.cpp
index 6ecabd1fd8d..952fd67ac9d 100644
--- a/src/mongo/scripting/engine.cpp
+++ b/src/mongo/scripting/engine.cpp
@@ -293,6 +293,7 @@ ScriptingFunction Scope::createFunction(const char* code) {
}
namespace JSFiles {
+extern const JSFile check_log;
extern const JSFile collection;
extern const JSFile crud_api;
extern const JSFile db;
@@ -320,6 +321,7 @@ void Scope::execCoreFiles() {
execSetup(JSFiles::query);
execSetup(JSFiles::bulk_api);
execSetup(JSFiles::error_codes);
+ execSetup(JSFiles::check_log);
execSetup(JSFiles::collection);
execSetup(JSFiles::crud_api);
execSetup(JSFiles::explain_query);
diff --git a/src/mongo/shell/SConscript b/src/mongo/shell/SConscript
index 79e956ab06e..e534658b9ae 100644
--- a/src/mongo/shell/SConscript
+++ b/src/mongo/shell/SConscript
@@ -33,6 +33,7 @@ js_header = env.JSHeader(
source=[
"assert.js",
"bulk_api.js",
+ "check_log.js",
"collection.js",
"crud_api.js",
"db.js",
diff --git a/src/mongo/shell/check_log.js b/src/mongo/shell/check_log.js
new file mode 100644
index 00000000000..d49955f5786
--- /dev/null
+++ b/src/mongo/shell/check_log.js
@@ -0,0 +1,120 @@
+/*
+ * Helper functions which connect to a server, and check its logs for particular strings.
+ */
+var checkLog;
+
+(function() {
+ "use strict";
+
+ if (checkLog) {
+ return; // Protect against this file being double-loaded.
+ }
+
+ checkLog = (function() {
+ var getGlobalLog = function(conn) {
+ var cmdRes;
+ try {
+ cmdRes = conn.adminCommand({getLog: 'global'});
+ } catch (e) {
+ // Retry with network errors.
+ print("checkLog ignoring failure: " + e);
+ return null;
+ }
+
+ return assert.commandWorked(cmdRes).log;
+ };
+
+ /*
+ * Calls the 'getLog' function on the provided connection 'conn' to see if the provided msg
+ * is found in the logs. Note: this function does not throw an exception, so the return
+ * value should not be ignored.
+ */
+ const checkContainsOnce = function(conn, msg) {
+ const logMessages = getGlobalLog(conn);
+ if (logMessages === null) {
+ return false;
+ }
+ for (let logMsg of logMessages) {
+ if (logMsg.includes(msg)) {
+ return true;
+ }
+ }
+ return false;
+ };
+
+ /*
+ * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until
+ * the provided 'msg' is found in the logs, or it times out. Throws an exception on timeout.
+ */
+ var contains = function(conn, msg, timeout = 5 * 60 * 1000) {
+ assert.soon(function() {
+ return checkContainsOnce(conn, msg);
+ }, 'Could not find log entries containing the following message: ' + msg, timeout, 300);
+ };
+
+ /*
+ * Calls the 'getLog' function at regular intervals on the provided connection 'conn' until
+ * the provided 'msg' is found in the logs exactly 'expectedCount' times, or it times out.
+ * Throws an exception on timeout.
+ */
+ var containsWithCount = function(conn, msg, expectedCount, timeout = 5 * 60 * 1000) {
+ var count = 0;
+ assert.soon(
+ function() {
+ count = 0;
+ var logMessages = getGlobalLog(conn);
+ if (logMessages === null) {
+ return false;
+ }
+ for (var i = 0; i < logMessages.length; i++) {
+ if (logMessages[i].indexOf(msg) != -1) {
+ count++;
+ }
+ }
+
+ return expectedCount === count;
+ },
+ 'Expected ' + expectedCount + ', but instead saw ' + count +
+ ' log entries containing the following message: ' + msg,
+ timeout,
+ 300);
+ };
+
+ /*
+ * Converts a scalar or object to a string format suitable for matching against log output.
+ * Field names are not quoted, and by default strings which are not within an enclosing
+ * object are not escaped. Similarly, integer values without an enclosing object are
+ * serialized as integers, while those within an object are serialized as floats to one
+ * decimal point. NumberLongs are unwrapped prior to serialization.
+ */
+ const formatAsLogLine = function(value, escapeStrings, toDecimal) {
+ if (typeof value === "string") {
+ return (escapeStrings ? `"${value}"` : value);
+ } else if (typeof value === "number") {
+ return (Number.isInteger(value) && toDecimal ? value.toFixed(1) : value);
+ } else if (value instanceof NumberLong) {
+ return `${value}`.match(/NumberLong..(.*)../m)[1];
+ } else if (typeof value !== "object") {
+ return value;
+ } else if (Object.keys(value).length === 0) {
+ return Array.isArray(value) ? "[]" : "{}";
+ }
+ let serialized = [];
+ escapeStrings = toDecimal = true;
+ for (let fieldName in value) {
+ const valueStr = formatAsLogLine(value[fieldName], escapeStrings, toDecimal);
+ serialized.push(Array.isArray(value) ? valueStr : `${fieldName}: ${valueStr}`);
+ }
+ return (Array.isArray(value) ? `[ ${serialized.join(', ')} ]`
+ : `{ ${serialized.join(', ')} }`);
+ };
+
+ return {
+ getGlobalLog: getGlobalLog,
+ checkContainsOnce: checkContainsOnce,
+ contains: contains,
+ containsWithCount: containsWithCount,
+ formatAsLogLine: formatAsLogLine
+ };
+ })();
+})();
diff --git a/src/mongo/shell/replsettest.js b/src/mongo/shell/replsettest.js
index dee9b4c75a0..05c4b584ff2 100644
--- a/src/mongo/shell/replsettest.js
+++ b/src/mongo/shell/replsettest.js
@@ -570,6 +570,51 @@ var ReplSetTest = function(opts) {
};
/**
+ * A special version of awaitSecondaryNodes() used exclusively by rollback_test.js.
+ * Wraps around awaitSecondaryNodes() itself and checks for an unrecoverable rollback
+ * if it throws.
+ */
+ this.awaitSecondaryNodesForRollbackTest = function(timeout,
+ connToCheckForUnrecoverableRollback) {
+ try {
+ this.awaitSecondaryNodes(timeout);
+ } catch (originalEx) {
+ // There is a special case where we expect the (rare) possibility of unrecoverable
+ // rollbacks with EMRC:false in rollback suites with unclean shutdowns.
+ jsTestLog("Exception in 'awaitSecondaryNodes', checking for unrecoverable rollback");
+ if (connToCheckForUnrecoverableRollback) {
+ const conn = connToCheckForUnrecoverableRollback;
+
+ const statusRes = assert.commandWorked(conn.adminCommand({replSetGetStatus: 1}));
+ const isRecovering = (statusRes.myState === ReplSetTest.State.RECOVERING);
+ const hasNoSyncSource = (statusRes.syncSourceId === -1);
+
+ const cmdLineOptsRes = assert.commandWorked(conn.adminCommand("getCmdLineOpts"));
+ const hasEMRCFalse =
+ (cmdLineOptsRes.parsed.replication.enableMajorityReadConcern === false);
+
+ if (isRecovering && hasNoSyncSource && hasEMRCFalse) {
+ try {
+ const n = this.getNodeId(conn);
+ const connToCheck = _useBridge ? _unbridgedNodes[n] : this.nodes[n];
+ // Confirm that the node is unable to recover after rolling back.
+ checkLog.contains(
+ connToCheck,
+ "remote oplog does not contain entry with optime matching our required optime ",
+ 60 * 1000);
+ } catch (checkLogEx) {
+ throw originalEx;
+ }
+ // Add this info to the original exception.
+ originalEx.unrecoverableRollbackDetected = true;
+ }
+ }
+ // Re-throw the original exception in all cases.
+ throw originalEx;
+ }
+ };
+
+ /**
* Blocks until the specified node says it's syncing from the given upstream node.
*/
this.awaitSyncSource = function(node, upstreamNode, timeout) {