summaryrefslogtreecommitdiff
path: root/jstests/replsets/tenant_migration_recipient_resumes_on_donor_failover.js
blob: b5af732a93542beb3aa987a99d7d593995b62f01 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/*
 * Tests that the recipient will retry a migration on donor sync source failure in the following
 * scenarios:
 * - donor shuts down when the recipient oplog fetcher is created but cloning has yet to start
 * - donor shuts down in the middle of the cloning phase
 * - donor shuts down after cloning is finished but the recipient has yet to declare that the data
 *   is consistent
 *
 * @tags: [requires_majority_read_concern, requires_fcv_49, incompatible_with_windows_tls,
 * incompatible_with_eft, incompatible_with_macos, requires_persistence]
 */

(function() {
"use strict";

load("jstests/libs/fail_point_util.js");
load("jstests/libs/uuid_util.js");
load("jstests/libs/write_concern_util.js");
load("jstests/replsets/libs/tenant_migration_test.js");
load("jstests/replsets/libs/tenant_migration_util.js");
load('jstests/replsets/rslib.js');

function runTest(failPoint) {
    const recipientRst = new ReplSetTest({
        nodes: 2,
        name: jsTestName() + "_recipient",
        nodeOptions: Object.assign(TenantMigrationUtil.makeX509OptionsForTest().recipient, {
            setParameter: {
                // Use a batch size of 2 so that collection cloner requires more than a single batch
                // to complete.
                collectionClonerBatchSize: 2,
                // Allow reads on recipient before migration completes for testing.
                'failpoint.tenantMigrationRecipientNotRejectReads': tojson({mode: 'alwaysOn'}),
            }
        })
    });
    recipientRst.startSet();
    recipientRst.initiateWithHighElectionTimeout();

    const tenantMigrationTest =
        new TenantMigrationTest({name: jsTestName(), recipientRst, sharedOptions: {nodes: 3}});
    if (!tenantMigrationTest.isFeatureFlagEnabled()) {
        jsTestLog("Skipping test because the tenant migrations feature flag is disabled");
        recipientRst.stopSet();
        return false;
    }
    jsTestLog("Running test with failpoint: " + failPoint);
    const tenantId = "testTenantId";
    const tenantDB = tenantMigrationTest.tenantDB(tenantId, "DB");
    const collName = "testColl";

    const donorRst = tenantMigrationTest.getDonorRst();
    const donorPrimary = tenantMigrationTest.getDonorPrimary();
    const donorSecondary = donorRst.getSecondary();

    const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
    const recipientDb = recipientPrimary.getDB(tenantDB);
    let recipientColl = recipientDb.getCollection(collName);

    tenantMigrationTest.insertDonorDB(tenantDB, collName);

    let waitInFailPoint;
    if (failPoint === 'tenantMigrationHangCollectionClonerAfterHandlingBatchResponse') {
        waitInFailPoint =
            configureFailPoint(recipientPrimary, failPoint, {nss: recipientColl.getFullName()});
    } else {
        waitInFailPoint = configureFailPoint(recipientPrimary, failPoint, {action: "hang"});
    }

    const migrationUuid = UUID();
    const migrationOpts = {
        migrationIdString: extractUUIDFromObject(migrationUuid),
        tenantId,
        readPreference: {mode: 'primary'}
    };

    jsTestLog("Starting the tenant migration to wait in failpoint: " + failPoint);
    assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
    waitInFailPoint.wait();
    let res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
    let currOp = res.inprog[0];
    // We should start the migration syncing from the primary.
    assert.eq(donorPrimary.host,
              currOp.donorSyncSource,
              `the recipient should start with 'donorPrimary' as the sync source`);
    let configRecipientNs = recipientPrimary.getCollection(TenantMigrationTest.kConfigRecipientsNS);
    let recipientDoc = configRecipientNs.find({"_id": migrationUuid}).toArray();
    const expectedMigrationState = (failPoint === "fpAfterDataConsistentMigrationRecipientInstance")
        ? "consistent"
        : "started";
    assert.eq(recipientDoc[0].state, expectedMigrationState, recipientDoc[0]);
    assert.eq(recipientDoc[0].numRestartsDueToDonorConnectionFailure, 0, recipientDoc[0]);

    jsTestLog("Stopping the donor primary");
    donorRst.stop(donorPrimary);
    waitInFailPoint.off();
    assert.soon(() => {
        // We expect that the recipient is retrying the migration as the donor has shutdown. We will
        // fail trying to find a sync source until a new donor primary is discovered as we will
        // honor the original read preference.
        let recipientDoc = configRecipientNs.find({"_id": migrationUuid}).toArray();
        jsTestLog("recipientDoc:" + tojson(recipientDoc));
        return recipientDoc[0].numRestartsDueToDonorConnectionFailure == 1;
    });

    let hangOnRetry = configureFailPoint(recipientPrimary,
                                         'fpAfterStartingOplogFetcherMigrationRecipientInstance',
                                         {action: "hang"});
    // Step up a new donor primary.
    assert.commandWorked(donorSecondary.adminCommand({replSetStepUp: 1}));
    hangOnRetry.wait();
    res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
    currOp = res.inprog[0];
    // The recipient should resume the migration against the new donor primary.
    assert.eq(donorSecondary.host, currOp.donorSyncSource, currOp);
    hangOnRetry.off();

    TenantMigrationTest.assertCommitted(
        tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
    // Remove 'donorPrimary' so that the test can complete properly.
    donorRst.remove(donorPrimary);
    recipientRst.stopSet();
    tenantMigrationTest.stop();
    return true;
}

// Test case where donor is shutdown after the recipient has started the oplog fetcher but not the
// cloner.
let testEnabled = runTest('fpAfterStartingOplogFetcherMigrationRecipientInstance');
if (testEnabled) {
    // Test case where donor is shutdown in the middle of the cloning phase.
    runTest('tenantMigrationHangCollectionClonerAfterHandlingBatchResponse');
    // Test case where donor is shutdown after cloning has finished but before the donor is notified
    // that the recipient is in the consistent state.
    runTest('fpAfterStartingOplogApplierMigrationRecipientInstance');
    // Test case where donor is shutdown after the recipient responds to the first
    // 'RecipientSyncData' cmd, indicating that the data is consistent.
    runTest('fpAfterDataConsistentMigrationRecipientInstance');
}
})();