2 files changed, 149 insertions, 4 deletions
diff --git a/jstests/noPassthrough/prepare_recordid_initialization.js b/jstests/noPassthrough/prepare_recordid_initialization.js
new file mode 100644
index 00000000000..e53aeefe74c
--- /dev/null
+++ b/jstests/noPassthrough/prepare_recordid_initialization.js
@@ -0,0 +1,133 @@
+/**
+ * This test reproduces a bug, described in SERVER-58409, where reconstructing a prepared
+ * transaction during startup recovery re-uses a RecordId of a deleted document and timestamps it
+ * in the past. This generates an out-of-order update chain in WiredTiger and can return wrong
+ * results for some reads.
+ *
+ * Consider the following sequence with durable history:
+ * - Set OldestTimestamp 1
+ * - Insert RecordId(1) -> A at TimeStamp(10)
+ * - Insert RID(2) -> B at TS(20)
+ * - Delete RID(2) (B) at TS(30)
+ *
+ * If we were to restart and initialize the next record id, we'd start issuing new documents at
+ * RID(2). Normally this is fine. Any new replicated user writes must be generated with a timestamp
+ * larger than 30, so the update chain for RID(2) will remain valid.
+ *
+ * However, when reconstructing prepared transactions, the prepare timestamp (and thus any following
+ * commit timestamp, but not the durable timestamp) may be arbitrarily old. In this example, after
+ * initializing the next RID to 2, if we were to reconstruct a prepared transaction from TS(10) that
+ * performs an insert on this collection, we'd get the following update chain (from oldest to
+ * newest):
+ *  - RID(2) => B @ TS(20) -> <tombstone> @ TS(30) -> PreparedInsert @ TS(10)
+ *
+ * Committing the prepared insert at a value between 10 and 30 results in wrong results/inconsistent
+ * data when reading at those timestamps. For example, a reader reading before TS 30 and after TS 10
+ * would not see the document at RID(2) even though it should.
+ *
+ * @tags: [
+ *   requires_persistence,
+ *   requires_replication
+ * ]
+ */
+(function() {
+"use strict";
+
+TestData.skipEnforceFastCountOnValidate = true;
+
+load("jstests/core/txns/libs/prepare_helpers.js");
+load("jstests/aggregation/extras/utils.js");  // For arrayEq
+
+function incTs(ts) {
+    return Timestamp(ts.t, ts.i + 1);
+}
+
+let replTest = new ReplSetTest({
+    name: "prepare_recordid_initialization",
+    nodes: 1,
+    nodeOptions: {
+        setParameter: {
+            logComponentVerbosity: tojson({storage: {recovery: 2}, transaction: 2, assert: 1}),
+            // Set the history window to zero to explicitly control the oldest timestamp. This is
+            // necessary to predictably exercise the minimum visible timestamp initialization of
+            // collections and indexes across a restart.
+            minSnapshotHistoryWindowInSeconds: 0,
+        }
+    }
+});
+replTest.startSet();
+replTest.initiate();
+let primary = replTest.getPrimary();
+let coll = primary.getDB("test")["foo"];
+
+let origInsertTs = primary.getDB("test").runCommand(
+    {insert: "foo", documents: [{_id: 1}], writeConcern: {w: "majority"}})["operationTime"];
+
+// Pin with an arbitrarily small timestamp. Let the rounding tell us where the pin ended up. The
+// write to the `mdb_testing.pinned_timestamp` collection is not logged/replayed during replication
+// recovery. Repinning across startup happens before replication recovery. Do a majority write for
+// predictability of the test.
+assert.commandWorked(primary.adminCommand(
+    {"pinHistoryReplicated": incTs(origInsertTs), round: true, writeConcern: {w: "majority"}}));
+
+let s1 = primary.startSession();
+let s1DB = s1.getDatabase("test");
+let s1Coll = s1DB.getCollection("foo");
+s1.startTransaction();
+
+assert.commandWorked(s1Coll.insert({_id: 2, prepared: true}));  // RID: 2
+let prepTs = PrepareHelpers.prepareTransaction(s1);
+
+assert.commandWorked(coll.insert({_id: 3, cnt: 1}));  // RID: 3
+let readCollidingTs = assert.commandWorked(primary.getDB("test").runCommand(
+    {insert: "foo", documents: [{_id: 4, cnt: 1}]}))["operationTime"];  // RID: 4
+assert.commandWorked(coll.remove({_id: 4}));
+
+// After deleting _id: 4, the highest visible RID will be 3. When reconstructing the prepared insert
+// that was previously at RID 2, we should not insert at RID 4. Instead, we will determine that RID
+// 4 is not visible and insert at RID 5.
+replTest.restart(primary);
+primary = replTest.getPrimary();
+replTest.awaitLastOpCommitted();
+
+const lsid = s1.getSessionId();
+const txnNumber = s1.getTxnNumber_forTesting();
+
+s1 = PrepareHelpers.createSessionWithGivenId(primary, lsid);
+s1.setTxnNumber_forTesting(txnNumber);
+let sessionDb = s1.getDatabase("test");
+assert.commandWorked(sessionDb.adminCommand({
+    commitTransaction: 1,
+    commitTimestamp: prepTs,
+    txnNumber: NumberLong(txnNumber),
+    autocommit: false,
+}));
+
+let s2 = primary.startSession();
+sessionDb = s2.getDatabase("test");
+s2.startTransaction({readConcern: {level: "snapshot", atClusterTime: readCollidingTs}});
+let docs = sessionDb["foo"].find().showRecordId().toArray();
+assert(arrayEq(docs,
+               [
+                   {"_id": 1, "$recordId": NumberLong(1)},
+                   {"_id": 3, "cnt": 1, "$recordId": NumberLong(3)},
+                   {"_id": 4, "cnt": 1, "$recordId": NumberLong(4)},
+                   {"_id": 2, "prepared": true, "$recordId": NumberLong(5)}
+               ]),
+       tojson(docs));
+assert.commandWorked(s2.commitTransaction_forTesting());
+
+coll = primary.getDB("test")["foo"];
+assert.commandWorked(coll.insert({_id: 6}));  // Should not re-use any RecordIds
+docs = sessionDb["foo"].find().showRecordId().toArray();
+assert(arrayEq(docs,
+               [
+                   {"_id": 1, "$recordId": NumberLong(1)},
+                   {"_id": 3, "cnt": 1, "$recordId": NumberLong(3)},
+                   {"_id": 2, "prepared": true, "$recordId": NumberLong(5)},
+                   {"_id": 6, "$recordId": NumberLong(6)}
+               ]),
+       tojson(docs));
+
+replTest.stopSet();
+})();
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
index 8ce5277ac24..c5d3ba831fc 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
@@ -1771,10 +1771,22 @@ void WiredTigerRecordStore::_initNextIdIfNeeded(OperationContext* opCtx) {
     // Need to start at 1 so we are always higher than RecordId::minLong()
     int64_t nextId = 1;
 
-    // Find the largest RecordId currently in use.
-    std::unique_ptr<SeekableRecordCursor> cursor = getCursor(opCtx, /*forward=*/false);
-    if (auto record = cursor->next()) {
-        nextId = record->id.getLong() + 1;
+    // Initialize the highest seen RecordId in a session without a read timestamp because that is
+    // required by the largest_key API.
+    WiredTigerSessionCache* cache = WiredTigerRecoveryUnit::get(opCtx)->getSessionCache();
+    auto sessRaii = cache->getSession();
+    auto cachedCursor = sessRaii->getCachedCursor(_tableId, "");
+    auto cursor = cachedCursor ? cachedCursor : sessRaii->getNewCursor(_uri);
+    ON_BLOCK_EXIT([&] { sessRaii->releaseCursor(_tableId, cursor, ""); });
+
+    // Find the largest RecordId in the table and add 1 to generate our next RecordId. The
+    // largest_key API returns the largest key in the table regardless of visibility. This ensures
+    // we don't re-use RecordIds that are not visible.
+    int ret = cursor->largest_key(cursor);
+    if (ret != WT_NOTFOUND) {
+        invariantWTOK(ret);
+        auto recordId = getKey(cursor);
+        nextId = recordId.getLong() + 1;
     }
 
     _nextIdNum.store(nextId);