summaryrefslogtreecommitdiff
path: root/jstests/noPassthrough/prepare_recordid_initialization.js
blob: e53aeefe74cde70691e55c47ceaa3367aad97c2a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
/**
 * This test reproduces a bug, described in SERVER-58409, where reconstructing a prepared
 * transaction during startup recovery re-uses a RecordId of a deleted document and timestamps it
 * in the past. This generates an out-of-order update chain in WiredTiger and can return wrong
 * results for some reads.
 *
 * Consider the following sequence with durable history:
 * - Set OldestTimestamp 1
 * - Insert RecordId(1) -> A at Timestamp(10)
 * - Insert RID(2) -> B at TS(20)
 * - Delete RID(2) (B) at TS(30)
 *
 * If we were to restart and initialize the next record id, we'd start issuing new documents at
 * RID(2). Normally this is fine. Any new replicated user writes must be generated with a timestamp
 * larger than 30, so the update chain for RID(2) will remain valid.
 *
 * However, when reconstructing prepared transactions, the prepare timestamp (and thus any following
 * commit timestamp, but not the durable timestamp) may be arbitrarily old. In this example, after
 * initializing the next RID to 2, if we were to reconstruct a prepared transaction from TS(10) that
 * performs an insert on this collection, we'd get the following update chain (from oldest to
 * newest):
 *  - RID(2) => B @ TS(20) -> <tombstone> @ TS(30) -> PreparedInsert @ TS(10)
 *
 * Committing the prepared insert at a value between 10 and 30 results in wrong results/inconsistent
 * data when reading at those timestamps. For example, a reader reading before TS 30 and after TS 10
 * would not see the document at RID(2) even though it should.
 *
 * @tags: [
 *   requires_persistence,
 *   requires_replication
 * ]
 */
(function() {
"use strict";

// This test drives a prepared (uncommitted) transaction across a restart, which can leave the
// collection's fast count out of sync with the true document count at validate time.
// NOTE(review): rationale inferred from the restart + prepared-transaction flow below — confirm.
TestData.skipEnforceFastCountOnValidate = true;

load("jstests/core/txns/libs/prepare_helpers.js");  // For PrepareHelpers.
load("jstests/aggregation/extras/utils.js");  // For arrayEq

/**
 * Returns a Timestamp one increment past `ts`: same seconds component `t`,
 * increment counter `i` advanced by one.
 */
function incTs(ts) {
    const {t, i} = ts;
    return Timestamp(t, i + 1);
}

let replTest = new ReplSetTest({
    name: "prepare_recordid_initialization",
    nodes: 1,
    nodeOptions: {
        setParameter: {
            logComponentVerbosity: tojson({storage: {recovery: 2}, transaction: 2, assert: 1}),
            // Set the history window to zero to explicitly control the oldest timestamp. This is
            // necessary to predictably exercise the minimum visible timestamp initialization of
            // collections and indexes across a restart.
            minSnapshotHistoryWindowInSeconds: 0,
        }
    }
});
replTest.startSet();
replTest.initiate();
let primary = replTest.getPrimary();
let coll = primary.getDB("test")["foo"];

// The first insert occupies RID 1. Its operationTime is captured so history can be pinned just
// past it below.
let origInsertTs = primary.getDB("test").runCommand(
    {insert: "foo", documents: [{_id: 1}], writeConcern: {w: "majority"}})["operationTime"];

// Pin with an arbitrarily small timestamp. Let the rounding tell us where the pin ended up. The
// write to the `mdb_testing.pinned_timestamp` collection is not logged/replayed during replication
// recovery. Repinning across startup happens before replication recovery. Do a majority write for
// predictability of the test.
assert.commandWorked(primary.adminCommand(
    {"pinHistoryReplicated": incTs(origInsertTs), round: true, writeConcern: {w: "majority"}}));

// Start a transaction whose insert occupies RID 2, then prepare it. Its prepare timestamp
// precedes the writes performed below, which is what sets up the RecordId re-use hazard
// described in the file header.
let s1 = primary.startSession();
let s1DB = s1.getDatabase("test");
let s1Coll = s1DB.getCollection("foo");
s1.startTransaction();

assert.commandWorked(s1Coll.insert({_id: 2, prepared: true}));  // RID: 2
let prepTs = PrepareHelpers.prepareTransaction(s1);

// With the transaction still prepared, perform later writes: an insert at RID 3, an insert at
// RID 4 (remembering its operationTime for the snapshot read below), then a delete of RID 4.
assert.commandWorked(coll.insert({_id: 3, cnt: 1}));  // RID: 3
let readCollidingTs = assert.commandWorked(primary.getDB("test").runCommand(
    {insert: "foo", documents: [{_id: 4, cnt: 1}]}))["operationTime"];  // RID: 4
assert.commandWorked(coll.remove({_id: 4}));

// After deleting _id: 4, the highest visible RID will be 3. When reconstructing the prepared insert
// that was previously at RID 2, we should not insert at RID 4. Instead, we will determine that RID
// 4 is not visible and insert at RID 5.
replTest.restart(primary);
primary = replTest.getPrimary();
replTest.awaitLastOpCommitted();

// The restart severed the original session's connection; capture its id and transaction number
// so the reconstructed prepared transaction can be committed from a fresh session object.
const lsid = s1.getSessionId();
const txnNumber = s1.getTxnNumber_forTesting();

s1 = PrepareHelpers.createSessionWithGivenId(primary, lsid);
s1.setTxnNumber_forTesting(txnNumber);
let sessionDb = s1.getDatabase("test");
// Commit at the prepare timestamp, which is earlier than the post-prepare writes above — the
// scenario that previously produced an out-of-order update chain.
assert.commandWorked(sessionDb.adminCommand({
    commitTransaction: 1,
    commitTimestamp: prepTs,
    txnNumber: NumberLong(txnNumber),
    autocommit: false,
}));

// Snapshot-read at the timestamp of the {_id: 4} insert. At that time RID 4 was still live, and
// the prepared insert (committed at the earlier prepTs) must appear at a fresh RID 5 — proving
// reconstruction did not re-use RID 4.
let s2 = primary.startSession();
sessionDb = s2.getDatabase("test");
s2.startTransaction({readConcern: {level: "snapshot", atClusterTime: readCollidingTs}});
let docs = sessionDb["foo"].find().showRecordId().toArray();
assert(arrayEq(docs,
               [
                   {"_id": 1, "$recordId": NumberLong(1)},
                   {"_id": 3, "cnt": 1, "$recordId": NumberLong(3)},
                   {"_id": 4, "cnt": 1, "$recordId": NumberLong(4)},
                   {"_id": 2, "prepared": true, "$recordId": NumberLong(5)}
               ]),
       tojson(docs));
assert.commandWorked(s2.commitTransaction_forTesting());

// A brand-new insert must continue past the highest RID ever allocated (5), landing on RID 6;
// reading at the latest time no longer shows the deleted {_id: 4}.
coll = primary.getDB("test")["foo"];
assert.commandWorked(coll.insert({_id: 6}));  // Should not re-use any RecordIds
docs = sessionDb["foo"].find().showRecordId().toArray();
assert(arrayEq(docs,
               [
                   {"_id": 1, "$recordId": NumberLong(1)},
                   {"_id": 3, "cnt": 1, "$recordId": NumberLong(3)},
                   {"_id": 2, "prepared": true, "$recordId": NumberLong(5)},
                   {"_id": 6, "$recordId": NumberLong(6)}
               ]),
       tojson(docs));

replTest.stopSet();
})();