diff options
-rw-r--r-- | jstests/noPassthrough/prepare_recordid_initialization.js | 133 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp | 20 |
2 files changed, 149 insertions, 4 deletions
diff --git a/jstests/noPassthrough/prepare_recordid_initialization.js b/jstests/noPassthrough/prepare_recordid_initialization.js new file mode 100644 index 00000000000..e53aeefe74c --- /dev/null +++ b/jstests/noPassthrough/prepare_recordid_initialization.js @@ -0,0 +1,133 @@ +/** + * This test reproduces a bug, described in SERVER-58409, where reconstructing a prepared + * transaction during startup recovery re-uses a RecordId of a deleted document and timestamps it + * in the past. This generates an out-of-order update chain in WiredTiger and can return wrong + * results for some reads. + * + * Consider the following sequence with durable history (RID = RecordId, TS = Timestamp): + * - Set OldestTimestamp 1 + * - Insert RID(1) -> A at TS(10) + * - Insert RID(2) -> B at TS(20) + * - Delete RID(2) (B) at TS(30) + * + * If we were to restart and initialize the next record id, we'd start assigning new documents + * RecordIds beginning at RID(2). Normally this is fine. Any new replicated user writes must be generated with a timestamp + * larger than 30, so the update chain for RID(2) will remain valid. + * + * However, when reconstructing prepared transactions, the prepare timestamp (and thus any following + * commit timestamp, but not the durable timestamp) may be arbitrarily old. In this example, after + * initializing the next RID to 2, if we were to reconstruct a prepared transaction from TS(10) that + * performs an insert on this collection, we'd get the following update chain (from oldest to + * newest): + * - RID(2) => B @ TS(20) -> <tombstone> @ TS(30) -> PreparedInsert @ TS(10) + * + * Committing the prepared insert at a value between 10 and 30 produces wrong results/inconsistent + * data when reading at those timestamps. For example, a reader reading before TS 30 and after TS 10 + * would not see the document at RID(2) even though it should. 
+ * + * @tags: [ + * requires_persistence, + * requires_replication + * ] + */ +(function() { +"use strict"; + +TestData.skipEnforceFastCountOnValidate = true; + +load("jstests/core/txns/libs/prepare_helpers.js"); +load("jstests/aggregation/extras/utils.js"); // For arrayEq + +function incTs(ts) { + return Timestamp(ts.t, ts.i + 1); // Same `t` component, `i` incremented by one. +} + +let replTest = new ReplSetTest({ + name: "prepare_recordid_initialization", + nodes: 1, + nodeOptions: { + setParameter: { + logComponentVerbosity: tojson({storage: {recovery: 2}, transaction: 2, assert: 1}), + // Set the history window to zero to explicitly control the oldest timestamp. This is + // necessary to predictably exercise the minimum visible timestamp initialization of + // collections and indexes across a restart. + minSnapshotHistoryWindowInSeconds: 0, + } + } +}); +replTest.startSet(); +replTest.initiate(); +let primary = replTest.getPrimary(); +let coll = primary.getDB("test")["foo"]; + +let origInsertTs = primary.getDB("test").runCommand( + {insert: "foo", documents: [{_id: 1}], writeConcern: {w: "majority"}})["operationTime"]; // RID: 1; this operationTime is incremented and used as the pin below. + +// Pin with an arbitrarily small timestamp. Let the rounding tell us where the pin ended up. The +// write to the `mdb_testing.pinned_timestamp` collection is not logged/replayed during replication +// recovery. Repinning across startup happens before replication recovery. Do a majority write for +// predictability of the test. 
+assert.commandWorked(primary.adminCommand( + {"pinHistoryReplicated": incTs(origInsertTs), round: true, writeConcern: {w: "majority"}})); + +let s1 = primary.startSession(); +let s1DB = s1.getDatabase("test"); +let s1Coll = s1DB.getCollection("foo"); +s1.startTransaction(); + +assert.commandWorked(s1Coll.insert({_id: 2, prepared: true})); // RID: 2 +let prepTs = PrepareHelpers.prepareTransaction(s1); // Also passed as the commitTimestamp after the restart. + +assert.commandWorked(coll.insert({_id: 3, cnt: 1})); // RID: 3 +let readCollidingTs = assert.commandWorked(primary.getDB("test").runCommand( + {insert: "foo", documents: [{_id: 4, cnt: 1}]}))["operationTime"]; // RID: 4 +assert.commandWorked(coll.remove({_id: 4})); // Issued after the insert whose operationTime is readCollidingTs. + +// After deleting _id: 4, the highest visible RID will be 3. When reconstructing the prepared insert +// that was previously at RID 2, we should not insert at RID 4. Instead, we will determine that RID +// 4 is not visible and insert at RID 5, because the next RecordId is derived from the largest key in the table regardless of visibility. +replTest.restart(primary); +primary = replTest.getPrimary(); +replTest.awaitLastOpCommitted(); + +const lsid = s1.getSessionId(); +const txnNumber = s1.getTxnNumber_forTesting(); + +s1 = PrepareHelpers.createSessionWithGivenId(primary, lsid); // Same session id and txnNumber as the transaction prepared before the restart. +s1.setTxnNumber_forTesting(txnNumber); +let sessionDb = s1.getDatabase("test"); +assert.commandWorked(sessionDb.adminCommand({ + commitTransaction: 1, + commitTimestamp: prepTs, + txnNumber: NumberLong(txnNumber), + autocommit: false, +})); + +let s2 = primary.startSession(); +sessionDb = s2.getDatabase("test"); +s2.startTransaction({readConcern: {level: "snapshot", atClusterTime: readCollidingTs}}); // Snapshot read at the operationTime of the _id: 4 insert. +let docs = sessionDb["foo"].find().showRecordId().toArray(); +assert(arrayEq(docs, + [ + {"_id": 1, "$recordId": NumberLong(1)}, + {"_id": 3, "cnt": 1, "$recordId": NumberLong(3)}, + {"_id": 4, "cnt": 1, "$recordId": NumberLong(4)}, + {"_id": 2, "prepared": true, "$recordId": NumberLong(5)} + ]), + tojson(docs)); +assert.commandWorked(s2.commitTransaction_forTesting()); + +coll = primary.getDB("test")["foo"]; 
+assert.commandWorked(coll.insert({_id: 6})); // Should not re-use any RecordIds; expected at RID: 6 +docs = sessionDb["foo"].find().showRecordId().toArray(); // Read through s2's database handle; its transaction was committed above. +assert(arrayEq(docs, + [ + {"_id": 1, "$recordId": NumberLong(1)}, + {"_id": 3, "cnt": 1, "$recordId": NumberLong(3)}, + {"_id": 2, "prepared": true, "$recordId": NumberLong(5)}, + {"_id": 6, "$recordId": NumberLong(6)} + ]), + tojson(docs)); + +replTest.stopSet(); +})(); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp index 8ce5277ac24..c5d3ba831fc 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp @@ -1771,10 +1771,22 @@ void WiredTigerRecordStore::_initNextIdIfNeeded(OperationContext* opCtx) { // Need to start at 1 so we are always higher than RecordId::minLong() int64_t nextId = 1; - // Find the largest RecordId currently in use. - std::unique_ptr<SeekableRecordCursor> cursor = getCursor(opCtx, /*forward=*/false); - if (auto record = cursor->next()) { - nextId = record->id.getLong() + 1; + // Initialize the highest seen RecordId in a session without a read timestamp because that is + // required by the largest_key API. + WiredTigerSessionCache* cache = WiredTigerRecoveryUnit::get(opCtx)->getSessionCache(); + auto sessRaii = cache->getSession(); + auto cachedCursor = sessRaii->getCachedCursor(_tableId, ""); + auto cursor = cachedCursor ? cachedCursor : sessRaii->getNewCursor(_uri); + ON_BLOCK_EXIT([&] { sessRaii->releaseCursor(_tableId, cursor, ""); }); + + // Find the largest RecordId in the table and add 1 to generate our next RecordId. The + // largest_key API returns the largest key in the table regardless of visibility. This ensures + // we don't re-use RecordIds that are not visible. 
+ int ret = cursor->largest_key(cursor); + if (ret != WT_NOTFOUND) { + invariantWTOK(ret); + auto recordId = getKey(cursor); + nextId = recordId.getLong() + 1; } _nextIdNum.store(nextId); |