From 320e2d28396c250dfd69640dcf865dff50ea0b55 Mon Sep 17 00:00:00 2001
From: Jason Chan
Date: Wed, 28 Apr 2021 19:53:03 +0000
Subject: SERVER-55305 Add new step to replication rollback to restore the txns
 table to be consistent with the stableTimestamp

---
 ...d_txn_table_updates_during_oplog_application.js | 183 +++++++++++++++++++++
 ...sced_txn_table_updates_from_vectored_inserts.js | 115 +++++++++++++
 2 files changed, 298 insertions(+)
 create mode 100644 jstests/replsets/rollback_with_coalesced_txn_table_updates_during_oplog_application.js
 create mode 100644 jstests/replsets/rollback_with_coalesced_txn_table_updates_from_vectored_inserts.js

(limited to 'jstests')

diff --git a/jstests/replsets/rollback_with_coalesced_txn_table_updates_during_oplog_application.js b/jstests/replsets/rollback_with_coalesced_txn_table_updates_during_oplog_application.js
new file mode 100644
index 00000000000..39ceaa77713
--- /dev/null
+++ b/jstests/replsets/rollback_with_coalesced_txn_table_updates_during_oplog_application.js
@@ -0,0 +1,183 @@
+/**
+ * Tests that the rollback procedure will update the 'config.transactions' table to be consistent
+ * with the node data at the 'stableTimestamp', specifically in the case where multiple derived ops
+ * to the 'config.transactions' table were coalesced into a single operation during secondary oplog
+ * application.
+ * We also test that if a node crashes after oplog truncation during rollback, the update made to
+ * the 'config.transactions' table is persisted on startup.
+ *
+ * @tags: [requires_persistence]
+ */
+
+(function() {
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/write_concern_util.js");
+
+const oplogApplierBatchSize = 100;
+
+function runTest(crashAfterRollbackTruncation) {
+    const rst = new ReplSetTest({
+        nodes: {
+            n0: {},
+            // Set the 'syncdelay' to 1s to speed up checkpointing. Also explicitly set the batch
+            // size for oplog application to ensure the number of retryable write statements being
+            // made majority committed isn't a multiple of it.
+            n1: {syncdelay: 1, setParameter: {replBatchLimitOperations: oplogApplierBatchSize}},
+            // Set the bgSyncOplogFetcherBatchSize to 1 oplog entry to guarantee replication
+            // progress with the stopReplProducerOnDocument failpoint.
+            n2: {setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+            n3: {setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+            n4: {setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+        },
+        // Force secondaries to sync from the primary to guarantee replication progress with the
+        // stopReplProducerOnDocument failpoint. Also disable primary catchup because some
+        // replicated retryable write statements are intentionally not being made majority
+        // committed.
+        settings: {chainingAllowed: false, catchUpTimeoutMillis: 0},
+    });
+    rst.startSet();
+    rst.initiate();
+
+    const primary = rst.getPrimary();
+    const ns = "test.retryable_write_partial_rollback";
+    assert.commandWorked(
+        primary.getCollection(ns).insert({_id: 0, counter: 0}, {writeConcern: {w: 5}}));
+
+    const [secondary1, secondary2, secondary3, secondary4] = rst.getSecondaries();
+
+    // Disable replication on all of the secondaries to manually control the replication progress.
+    const stopReplProducerFailpoints = [secondary1, secondary2, secondary3, secondary4].map(
+        conn => configureFailPoint(conn, 'stopReplProducer'));
+
+    // While replication is still entirely disabled, additionally disable replication partway into
+    // the retryable write on all but the first secondary.
+    // The idea is that while secondary1 will apply all of the oplog entries in a single batch,
+    // the other secondaries will only apply up to counterMajorityCommitted oplog entries.
+    const counterTotal = oplogApplierBatchSize;
+    const counterMajorityCommitted = counterTotal - 2;
+    const stopReplProducerOnDocumentFailpoints = [secondary2, secondary3, secondary4].map(
+        conn => configureFailPoint(conn,
+                                   'stopReplProducerOnDocument',
+                                   {document: {"diff.u.counter": counterMajorityCommitted + 1}}));
+
+    const lsid = ({id: UUID()});
+
+    assert.commandWorked(primary.getCollection(ns).runCommand("update", {
+        updates: Array.from({length: counterTotal}, () => ({q: {_id: 0}, u: {$inc: {counter: 1}}})),
+        lsid,
+        txnNumber: NumberLong(1),
+    }));
+
+    const stmtMajorityCommitted = primary.getCollection("local.oplog.rs")
+                                      .findOne({ns, "o.diff.u.counter": counterMajorityCommitted});
+    assert.neq(null, stmtMajorityCommitted);
+
+    for (const fp of stopReplProducerFailpoints) {
+        fp.off();
+
+        // Wait for the secondary to have applied through the counterMajorityCommitted retryable
+        // write statement. We do this for each secondary individually, starting with secondary1,
+        // to guarantee that secondary1 will advance its stable_timestamp when learning of the
+        // other secondaries also having applied through counterMajorityCommitted.
+        assert.soon(() => {
+            const {optimes: {appliedOpTime, durableOpTime}} =
+                assert.commandWorked(fp.conn.adminCommand({replSetGetStatus: 1}));
+
+            print(`${fp.conn.host}: ${tojsononeline({
+                appliedOpTime,
+                durableOpTime,
+                stmtMajorityCommittedTimestamp: stmtMajorityCommitted.ts
+            })}`);
+
+            return bsonWoCompare(appliedOpTime.ts, stmtMajorityCommitted.ts) >= 0 &&
+                bsonWoCompare(durableOpTime.ts, stmtMajorityCommitted.ts) >= 0;
+        });
+    }
+
+    // Wait for secondary1 to have advanced its stable_timestamp.
+    assert.soon(() => {
+        const {lastStableRecoveryTimestamp} =
+            assert.commandWorked(secondary1.adminCommand({replSetGetStatus: 1}));
+
+        print(`${secondary1.host}: ${tojsononeline({
+            lastStableRecoveryTimestamp,
+            stmtMajorityCommittedTimestamp: stmtMajorityCommitted.ts
+        })}`);
+
+        return bsonWoCompare(lastStableRecoveryTimestamp, stmtMajorityCommitted.ts) >= 0;
+    });
+
+    // Step up one of the other secondaries and do a write which becomes majority committed to
+    // force secondary1 to go into rollback.
+    rst.freeze(secondary1);
+    assert.commandWorked(secondary2.adminCommand({replSetStepUp: 1}));
+    rst.freeze(primary);
+    rst.awaitNodesAgreeOnPrimary(undefined, undefined, secondary2);
+
+    let hangAfterTruncate;
+    if (crashAfterRollbackTruncation) {
+        hangAfterTruncate = configureFailPoint(secondary1, 'hangAfterOplogTruncationInRollback');
+    }
+
+    for (const fp of stopReplProducerOnDocumentFailpoints) {
+        fp.off();
+    }
+
+    // Wait for secondary2 to be a writable primary.
+    rst.getPrimary();
+
+    // Do a write which becomes majority committed and wait for secondary1 to complete its
+    // rollback.
+    assert.commandWorked(
+        secondary2.getCollection("test.dummy").insert({}, {writeConcern: {w: 'majority'}}));
+
+    if (crashAfterRollbackTruncation) {
+        // Entering rollback will close connections so we expect some network errors when waiting
+        // on the failpoint.
+        assert.soonNoExcept(() => {
+            hangAfterTruncate.wait();
+            return true;
+        }, `failed to wait for fail point ${hangAfterTruncate.failPointName}`);
+
+        // Crash the node after it performs oplog truncation.
+        rst.stop(secondary1, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL});
+        rst.restart(secondary1, {
+            "noReplSet": false,
+            setParameter: 'failpoint.stopReplProducer=' + tojson({mode: 'alwaysOn'})
+        });
+        rst.waitForState(secondary1, ReplSetTest.State.SECONDARY);
+        secondary1.setSecondaryOk();
+        // On startup, we expect to see the update persisted in the 'config.transactions' table.
+        let restoredDoc =
+            secondary1.getCollection('config.transactions').findOne({"_id.id": lsid.id});
+        assert.neq(null, restoredDoc);
+        secondary1.adminCommand({configureFailPoint: "stopReplProducer", mode: "off"});
+    }
+
+    // Reconnect to secondary1 after it completes its rollback and step it up to be the new
+    // primary.
+    rst.awaitNodesAgreeOnPrimary(undefined, undefined, secondary2);
+    assert.commandWorked(secondary1.adminCommand({replSetFreeze: 0}));
+    rst.stepUp(secondary1, {awaitWritablePrimary: false});
+
+    const docBeforeRetry = secondary1.getCollection(ns).findOne({_id: 0});
+    assert.eq(docBeforeRetry, {_id: 0, counter: counterMajorityCommitted});
+
+    assert.commandWorked(secondary1.getCollection(ns).runCommand("update", {
+        updates: Array.from({length: counterTotal}, () => ({q: {_id: 0}, u: {$inc: {counter: 1}}})),
+        lsid,
+        txnNumber: NumberLong(1),
+        writeConcern: {w: 5},
+    }));
+
+    const docAfterRetry = secondary1.getCollection(ns).findOne({_id: 0});
+    assert.eq(docAfterRetry, {_id: 0, counter: counterTotal});
+
+    rst.stopSet();
+}
+
+// Test the general scenario where we perform the appropriate update to the 'config.transactions'
+// table during rollback.
+runTest(false);
+// Extends the test to crash the secondary in the middle of rollback right after oplog truncation.
+// We assert that the update made to the 'config.transactions' table persisted on startup.
+runTest(true);
+})();
\ No newline at end of file
diff --git a/jstests/replsets/rollback_with_coalesced_txn_table_updates_from_vectored_inserts.js b/jstests/replsets/rollback_with_coalesced_txn_table_updates_from_vectored_inserts.js
new file mode 100644
index 00000000000..e6ca0056c31
--- /dev/null
+++ b/jstests/replsets/rollback_with_coalesced_txn_table_updates_from_vectored_inserts.js
@@ -0,0 +1,115 @@
+/**
+ * Tests that the rollback procedure will update the 'config.transactions' table to be consistent
+ * with the node data at the 'stableTimestamp', specifically in the case where multiple derived ops
+ * to the 'config.transactions' table were coalesced into a single operation when performing
+ * vectored inserts on the primary.
+ */
+
+(function() {
+load("jstests/libs/fail_point_util.js");
+load("jstests/libs/write_concern_util.js");
+
+const rst = new ReplSetTest({
+    nodes: {
+        // Set the syncdelay to 1s to speed up checkpointing.
+        n0: {syncdelay: 1},
+        // Set the bgSyncOplogFetcherBatchSize to 1 oplog entry to guarantee replication progress
+        // with the stopReplProducerOnDocument failpoint.
+        n1: {setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+        n2: {setParameter: {bgSyncOplogFetcherBatchSize: 1}},
+    },
+    // Force secondaries to sync from the primary to guarantee replication progress with the
+    // stopReplProducerOnDocument failpoint. Also disable primary catchup because some replicated
+    // retryable write statements are intentionally not being made majority committed.
+    settings: {chainingAllowed: false, catchUpTimeoutMillis: 0},
+});
+rst.startSet();
+rst.initiate();
+
+const primary = rst.getPrimary();
+const ns = "test.retryable_write_coalesced_txn_updates";
+assert.commandWorked(primary.getCollection(ns).insert({_id: -1}, {writeConcern: {w: 3}}));
+
+const [secondary1, secondary2] = rst.getSecondaries();
+
+// Disable replication partway into the retryable write on all of the secondaries. The idea is that
+// while the primary will apply all of the writes in a single storage transaction, the secondaries
+// will only apply up to insertBatchMajorityCommitted oplog entries.
+const insertBatchTotal = 20;
+const insertBatchMajorityCommitted = insertBatchTotal - 2;
+const stopReplProducerOnDocumentFailpoints = [secondary1, secondary2].map(
+    conn => configureFailPoint(
+        conn, 'stopReplProducerOnDocument', {document: {"_id": insertBatchMajorityCommitted + 1}}));
+
+const lsid = ({id: UUID()});
+
+assert.commandWorked(primary.getCollection(ns).runCommand("insert", {
+    documents: Array.from({length: insertBatchTotal}, (_, i) => ({_id: i})),
+    lsid,
+    txnNumber: NumberLong(1),
+}));
+
+const stmtMajorityCommitted =
+    primary.getCollection("local.oplog.rs").findOne({ns, "o._id": insertBatchMajorityCommitted});
+assert.neq(null, stmtMajorityCommitted);
+
+// Wait for the primary to have advanced its stable_timestamp.
+assert.soon(() => {
+    const {lastStableRecoveryTimestamp} =
+        assert.commandWorked(primary.adminCommand({replSetGetStatus: 1}));
+
+    const wtStatus = assert.commandWorked(primary.adminCommand({serverStatus: 1})).wiredTiger;
+    const latestMajority =
+        wtStatus["snapshot-window-settings"]["latest majority snapshot timestamp available"];
+
+    print(`${primary.host}: ${tojsononeline({
+        lastStableRecoveryTimestamp,
+        stmtMajorityCommittedTimestamp: stmtMajorityCommitted.ts,
+        "latest majority snapshot timestamp available": latestMajority
+    })}`);
+
+    // Make sure 'secondary1' has a 'lastApplied' optime at least as recent as
+    // 'stmtMajorityCommitted.ts'. Otherwise, it can fail to win the election later.
+    const {optimes: {appliedOpTime}} =
+        assert.commandWorked(secondary1.adminCommand({replSetGetStatus: 1}));
+    print(`${secondary1.host}: ${tojsononeline({appliedOpTime})}`);
+
+    return bsonWoCompare(lastStableRecoveryTimestamp, stmtMajorityCommitted.ts) >= 0 &&
+        bsonWoCompare(appliedOpTime.ts, stmtMajorityCommitted.ts) >= 0;
+});
+
+// Step up one of the secondaries and do a write which becomes majority committed to force the
+// current primary to go into rollback.
+assert.commandWorked(secondary1.adminCommand({replSetStepUp: 1}));
+rst.freeze(primary);
+rst.awaitNodesAgreeOnPrimary(undefined, undefined, secondary1);
+
+for (const fp of stopReplProducerOnDocumentFailpoints) {
+    fp.off();
+}
+
+rst.getPrimary();  // Wait for secondary1 to be a writable primary.
+
+// Do a write which becomes majority committed and wait for the old primary to have completed its
+// rollback.
+assert.commandWorked(secondary1.getCollection("test.dummy").insert({}, {writeConcern: {w: 3}}));
+
+// Reconnect to the primary after it completes its rollback and step it up to be the primary again.
+rst.awaitNodesAgreeOnPrimary(undefined, undefined, secondary1);
+assert.commandWorked(primary.adminCommand({replSetFreeze: 0}));
+rst.stepUp(primary);
+
+print(`${primary.host} session txn record: ${
+    tojson(primary.getCollection("config.transactions").findOne({"_id.id": lsid.id}))}`);
+
+// Make sure we don't re-execute operations that have already been applied by checking that the
+// retried insert doesn't fail with a 'DuplicateKey' error.
+assert.commandWorked(primary.getCollection(ns).runCommand("insert", {
+    documents: Array.from({length: insertBatchTotal}, (_, i) => ({_id: i})),
+    lsid,
+    txnNumber: NumberLong(1),
+    writeConcern: {w: 3},
+}));
+
+rst.stopSet();
+})();
\ No newline at end of file
--
cgit v1.2.1