diff options
author | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2019-02-11 15:43:27 -0500 |
---|---|---|
committer | Max Hirschhorn <max.hirschhorn@mongodb.com> | 2019-02-11 15:43:27 -0500 |
commit | 9db1a8dffe753808bea0d8c47d9fc959eaea9ea0 (patch) | |
tree | 26b5750c5088d745ab1fb93596d010501b0b4cbe /jstests/hooks | |
parent | 691ab6da0c38f52f32c1028a8fa7447997ced255 (diff) | |
download | mongo-9db1a8dffe753808bea0d8c47d9fc959eaea9ea0.tar.gz |
SERVER-39169 Add $_internalReadAtClusterTime option to find and dbHash.
The new $_internalReadAtClusterTime option replaces all usages of
running the dbHash command inside of a multi-statement transaction. It
can be used to read from a consistent snapshot in place of specifying an
atClusterTime read concern.
Unlike multi-statement transactions, the new $_internalReadAtClusterTime
option doesn't cause locks to be left on the server after returning a
network response. It instead restores the snapshot to read from as part
of handling the request.
Diffstat (limited to 'jstests/hooks')
-rw-r--r-- | jstests/hooks/run_check_repl_dbhash_background.js | 85 |
1 files changed, 22 insertions, 63 deletions
diff --git a/jstests/hooks/run_check_repl_dbhash_background.js b/jstests/hooks/run_check_repl_dbhash_background.js index 8e5b942a2eb..8cd338b2d32 100644 --- a/jstests/hooks/run_check_repl_dbhash_background.js +++ b/jstests/hooks/run_check_repl_dbhash_background.js @@ -3,15 +3,17 @@ * * Unlike run_check_repl_dbhash.js, this version of the hook doesn't require that all operations * have finished replicating, nor does it require that the test has finished running. The dbHash - * command is run inside of a transaction specifying atClusterTime in order for an identical - * snapshot to be used by all members of the replica set. + * command reads at a particular clusterTime in order for an identical snapshot to be used by all + * members of the replica set. * - * The find and getMore commands used to generate the collection diff run as part of the same - * transaction as the dbHash command. This ensures the diagnostics for a dbhash mismatch aren't - * subjected to changes from any operations in flight. + * The find and getMore commands used to generate the collection diff read at the same clusterTime + * as the dbHash command. While this ensures the diagnostics for a dbhash mismatch aren't subjected + * to changes from any operations in flight, it is possible for the collection or an index on the + * collection to be dropped due to no locks being held. * - * If a transient transaction error occurs, then the dbhash check is retried until it succeeds, or - * until it fails with a non-transient error. + * If a transient error occurs, then the dbhash check is retried until it succeeds, or until it + * fails with a non-transient error. The most common case of a transient error is attempting to read + * from a collection after a catalog operation has been performed on the collection or database. */ 'use strict'; @@ -26,7 +28,7 @@ let debugInfo = []; // We turn off printing the JavaScript stacktrace in doassert() to avoid generating an - // overwhelming amount of log messages when handling transient transaction errors. + // overwhelming amount of log messages when handling transient errors. TestData = TestData || {}; TestData.traceExceptions = false; @@ -69,13 +71,8 @@ const kForeverSeconds = 1e9; const dbNames = new Set(); - // We enable the "WTPreserveSnapshotHistoryIndefinitely" failpoint and extend the - // "transactionLifetimeLimitSeconds" server parameter to ensure that - // - // (1) the same snapshot will be available to read at on the primary and secondaries, and - // - // (2) the potentally long-running transaction isn't killed while we are part-way through - // verifying data consistency. + // We enable the "WTPreserveSnapshotHistoryIndefinitely" failpoint to ensure that the same + // snapshot will be available to read at on the primary and secondaries. for (let session of sessions) { const db = session.getDatabase('admin'); @@ -96,19 +93,6 @@ mode: 'off', })); }); - - const res = assert.commandWorked(db.runCommand({ - setParameter: 1, - transactionLifetimeLimitSeconds: kForeverSeconds, - })); - - const transactionLifetimeLimitSecondsOriginal = res.was; - resetFns.push(() => { - assert.commandWorked(db.runCommand({ - setParameter: 1, - transactionLifetimeLimitSeconds: transactionLifetimeLimitSecondsOriginal, - })); - }); } for (let session of sessions) { @@ -124,8 +108,8 @@ }); } - // Transactions cannot be run on the following databases. (The "local" database is also not - // replicated.) + // Transactions cannot be run on the following databases so we don't attempt to read at a + // clusterTime on them either. (The "local" database is also not replicated.) dbNames.delete('admin'); dbNames.delete('config'); dbNames.delete('local'); @@ -133,9 +117,8 @@ const results = []; // The waitForSecondaries() function waits for all secondaries to have applied up to - // 'clusterTime' locally. This ensures that a later atClusterTime read inside a transaction - // doesn't stall as a result of a pending global X lock (e.g. from a dropDatabase command) on - // the primary preventing getMores on the oplog from receiving a response. + // 'clusterTime' locally. This ensures that a later $_internalReadAtClusterTime read doesn't + // fail as a result of the secondary's clusterTime being behind 'clusterTime'. const waitForSecondaries = (clusterTime, signedClusterTime) => { debugInfo.push({"waitForSecondaries": clusterTime, "signedClusterTime": signedClusterTime}); for (let i = 1; i < sessions.length; ++i) { @@ -198,9 +181,10 @@ // isn't multi-versioned. Unlike with ReplSetTest#checkReplicatedDataHashes(), it is possible // for a collection catalog operation (e.g. a drop or rename) to have been applied on the // primary but not yet applied on the secondary. - const checkCollectionHashesForDB = (dbName) => { + const checkCollectionHashesForDB = (dbName, clusterTime) => { const result = []; - const hashes = rst.getHashesUsingSessions(sessions, dbName); + const hashes = + rst.getHashesUsingSessions(sessions, dbName, {readAtClusterTime: clusterTime}); const hashesByUUID = hashes.map((response, i) => { const info = {}; @@ -281,9 +265,7 @@ // ReplSetTest#getCollectionDiffUsingSessions() upon detecting a dbHash mismatch. It is // presumed to still useful to know that a bug exists even if we cannot get more // diagnostics for it. - if ((e.hasOwnProperty('errorLabels') && - e.errorLabels.includes('TransientTransactionError')) || - e.code === ErrorCodes.Interrupted) { + if (e.code === ErrorCodes.Interrupted || e.code === ErrorCodes.SnapshotUnavailable) { hasTransientError = true; return true; } @@ -310,23 +292,15 @@ debugInfo.push({ "node": session.getClient(), "session": session, - "startTransaction": clusterTime + "readAtClusterTime": clusterTime }); - session.startTransaction( - {readConcern: {level: 'snapshot', atClusterTime: clusterTime}}); } hasTransientError = false; try { - result = checkCollectionHashesForDB(dbName); + result = checkCollectionHashesForDB(dbName, clusterTime); } catch (e) { - // We abort each of the transactions started on the nodes if one of them returns an - // error while running the dbHash check. - for (let session of sessions) { - session.abortTransaction_forTesting(); - } - if (isTransientError(e)) { debugInfo.push({"transientError": e}); continue; @@ -335,21 +309,6 @@ jsTestLog(debugInfo); throw e; } - - // We then attempt to commit each of the transactions started on the nodes to confirm - // the data we read was actually majority-committed. If one of them returns an error, - // then we still try to commit the transactions started on subsequent nodes in order to - // clear their transaction state. - for (let session of sessions) { - try { - session.commitTransaction(); - } catch (e) { - if (!isTransientError(e)) { - jsTestLog(debugInfo); - throw e; - } - } - } } while (hasTransientError); for (let mismatchInfo of result) { |