From 4d28f81e867485d0bef470bdd7f9e2b0344e6597 Mon Sep 17 00:00:00 2001 From: Daniel Gottlieb Date: Wed, 2 Jun 2021 22:40:10 -0400 Subject: SERVER-57352: Have initial_sync_test_fixture_test.js be more resilient to irrelevant changes. --- jstests/replsets/initial_sync_test_fixture_test.js | 326 ++++++++++----------- jstests/replsets/libs/initial_sync_test.js | 15 +- 2 files changed, 174 insertions(+), 167 deletions(-) diff --git a/jstests/replsets/initial_sync_test_fixture_test.js b/jstests/replsets/initial_sync_test_fixture_test.js index b520edae668..400f0c9c028 100644 --- a/jstests/replsets/initial_sync_test_fixture_test.js +++ b/jstests/replsets/initial_sync_test_fixture_test.js @@ -102,196 +102,192 @@ rst.startSet(); rst.initiate(); const initialSyncTest = new InitialSyncTest("InitialSyncTest", rst); -const primary = initialSyncTest.getPrimary(); -let secondary = initialSyncTest.getSecondary(); -const db = primary.getDB("test"); -const largeString = 'z'.repeat(10 * 1024 * 1024); - -assert.commandWorked(db.foo.insert({a: 1})); -assert.commandWorked(db.bar.insert({b: 1})); - -// Prepare a transaction so that we know that step() can restart the secondary even if there is -// a prepared transaction. The prepareTimestamp will be used as the beginFetchingTimestamp and -// beginApplyingTimestamp during initial sync. -const session = primary.startSession({causalConsistency: false}); -const sessionDB = session.getDatabase("test"); -const sessionColl = sessionDB.getCollection("foo"); -session.startTransaction(); -assert.commandWorked(sessionColl.insert({c: 1})); -let prepareTimestamp = PrepareHelpers.prepareTransaction(session); - -// This step call restarts the secondary and causes it to go into initial sync. It will pause -// initial sync after the node has fetched the defaultBeginFetchingTimestamp. -assert(!initialSyncTest.step()); - -secondary = initialSyncTest.getSecondary(); -secondary.setSecondaryOk(); - -// Make sure that we cannot read from this node yet. -assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), - ErrorCodes.NotPrimaryOrSecondary); - -// Make sure that we see that the node got the defaultBeginFetchingTimestamp, but hasn't gotten the -// beginFetchingTimestamp yet. -checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, true); -checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, false); -checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, false); - -// This step call will resume initial sync and pause it again after the node gets the -// beginFetchingTimestamp from its sync source. -assert(!initialSyncTest.step()); - -// Make sure that we cannot read from this node yet. -assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), - ErrorCodes.NotPrimaryOrSecondary); - -// Make sure that we see that the node got the beginFetchingTimestamp, but hasn't gotten the -// beginApplyingTimestamp yet. -checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, false); -checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, true); -checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, false); - -// This step call will resume initial sync and pause it again after the node gets the -// beginApplyingTimestamp from its sync source. -assert(!initialSyncTest.step()); - -// Make sure that we cannot read from this node yet. 
-assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), - ErrorCodes.NotPrimaryOrSecondary); - -// Make sure that we see that the node got the beginApplyingTimestamp, but that we don't see the -// listDatabases call yet. -checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, false); -checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, false); -checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, true); -checkLogForCollectionClonerMsg(secondary, "listDatabases", "admin", false); - -// This step call will resume initial sync and pause it again after the node gets the -// listDatabases result from its sync source. -assert(!initialSyncTest.step()); - -// Make sure that we cannot read from this node yet. -assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), - ErrorCodes.NotPrimaryOrSecondary); - -// Make sure that we saw the listDatabases call in the log messages, but didn't see any -// listCollections or listIndexes call. -checkLogForCollectionClonerMsg(secondary, "listDatabases", "admin", true); -checkLogForCollectionClonerMsg(secondary, "listCollections", "admin", false); -checkLogForCollectionClonerMsg(secondary, "listIndexes", "admin", false); - -// Do same listDatabases command as CollectionCloner. -const databases = - assert.commandWorked(primary.adminCommand({listDatabases: 1, nameOnly: true})).databases; - -// Iterate over the databases and collections in the same order that the test fixture would so -// that we can check the log messages to make sure initial sync is paused as expected. -for (let dbObj of databases) { - const dbname = dbObj.name; - - // We skip the local db during the collection cloning phase of initial sync. - if (dbname === "local") { - continue; - } +try { + // If the test fails, the initial syncing node may be left with an engaged failpoint that + // instructs it to hang. This `try` block is to guarantee we call `initialSyncTest.fail()` which + // allows the test to gracefully exit with an error. + + const primary = initialSyncTest.getPrimary(); + let secondary = initialSyncTest.getSecondary(); + const db = primary.getDB("test"); + let maxLargeStringsInBatch = 9; + // If we can fit exactly 9+1=10 large strings in a batch, the small overhead for each oplog + // entry means we expect only 9 oplog entries per batch. + let largeStringSize = initialSyncTest.replBatchLimitBytes / (maxLargeStringsInBatch + 1); + const largeString = 'z'.repeat(largeStringSize); + + assert.commandWorked(db.foo.insert({a: 1})); + assert.commandWorked(db.bar.insert({b: 1})); + + // Prepare a transaction so that we know that step() can restart the secondary even if there is + // a prepared transaction. The prepareTimestamp will be used as the beginFetchingTimestamp and + // beginApplyingTimestamp during initial sync. + const session = primary.startSession({causalConsistency: false}); + const sessionDB = session.getDatabase("test"); + const sessionColl = sessionDB.getCollection("foo"); + session.startTransaction(); + assert.commandWorked(sessionColl.insert({c: 1})); + let prepareTimestamp = PrepareHelpers.prepareTransaction(session); + + // This step call restarts the secondary and causes it to go into initial sync. It will pause + // initial sync after the node has fetched the defaultBeginFetchingTimestamp. 
+ assert(!initialSyncTest.step()); + + secondary = initialSyncTest.getSecondary(); + secondary.setSecondaryOk(); - const database = primary.getDB(dbname); + // Make sure that we cannot read from this node yet. + assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), + ErrorCodes.NotPrimaryOrSecondary); - // Do same listCollections command as CollectionCloner. - const res = assert.commandWorked(database.runCommand( - {listCollections: 1, filter: {$or: [{type: "collection"}, {type: {$exists: false}}]}})); + // Make sure that we see that the node got the defaultBeginFetchingTimestamp, but hasn't gotten + // the beginFetchingTimestamp yet. + checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, true); + checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, false); + checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, false); - // Make sure that there is only one batch. - assert.eq(NumberLong(0), res.cursor.id, res); + // This step call will resume initial sync and pause it again after the node gets the + // beginFetchingTimestamp from its sync source. + assert(!initialSyncTest.step()); - const collectionsCursor = res.cursor; + // Make sure that we cannot read from this node yet. + assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), + ErrorCodes.NotPrimaryOrSecondary); - // For each database, CollectionCloner will first call listCollections. + // Make sure that we see that the node got the beginFetchingTimestamp, but hasn't gotten the + // beginApplyingTimestamp yet. + checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, false); + checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, true); + checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, false); + + // This step call will resume initial sync and pause it again after the node gets the + // beginApplyingTimestamp from its sync source. assert(!initialSyncTest.step()); // Make sure that we cannot read from this node yet. assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), ErrorCodes.NotPrimaryOrSecondary); - // Make sure that we saw the listCollections call in the log messages, but didn't see a - // listIndexes call. - checkLogForCollectionClonerMsg(secondary, "listCollections", dbname, true); + // Make sure that we see that the node got the beginApplyingTimestamp, but that we don't see the + // listDatabases call yet. + checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, false); + checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, false); + checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, true); + checkLogForCollectionClonerMsg(secondary, "listDatabases", "admin", false); + + // This step call will resume initial sync and pause it again after the node gets the + // listDatabases result from its sync source. + assert(!initialSyncTest.step()); + + // Make sure that we cannot read from this node yet. + assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), + ErrorCodes.NotPrimaryOrSecondary); + + // Make sure that we saw the listDatabases call in the log messages, but didn't see any + // listCollections or listIndexes call. 
+ checkLogForCollectionClonerMsg(secondary, "listDatabases", "admin", true); + checkLogForCollectionClonerMsg(secondary, "listCollections", "admin", false); checkLogForCollectionClonerMsg(secondary, "listIndexes", "admin", false); - for (let collectionObj of collectionsCursor.firstBatch) { - assert(collectionObj.info, collectionObj); - const collUUID = collectionObj.info.uuid; + // Do same listDatabases command as CollectionCloner. + const databases = + assert.commandWorked(primary.adminCommand({listDatabases: 1, nameOnly: true})).databases; + + // Iterate over the databases and collections in the same order that the test fixture would so + // that we can check the log messages to make sure initial sync is paused as expected. + for (let dbObj of databases) { + const dbname = dbObj.name; - // For each collection, CollectionCloner will call listIndexes. + // We skip the local db during the collection cloning phase of initial sync. + if (dbname === "local") { + continue; + } + + const database = primary.getDB(dbname); + + // Do same listCollections command as CollectionCloner. + const res = assert.commandWorked(database.runCommand( + {listCollections: 1, filter: {$or: [{type: "collection"}, {type: {$exists: false}}]}})); + + // Make sure that there is only one batch. + assert.eq(NumberLong(0), res.cursor.id, res); + + const collectionsCursor = res.cursor; + + // For each database, CollectionCloner will first call listCollections. assert(!initialSyncTest.step()); // Make sure that we cannot read from this node yet. assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), ErrorCodes.NotPrimaryOrSecondary); - // Make sure that we saw the listIndexes call in the log messages, but didn't - // see a listCollections call. - checkLogForCollectionClonerMsg(secondary, "listIndexes", dbname, true, collUUID); - checkLogForCollectionClonerMsg(secondary, "listCollections", "admin", false); + // Make sure that we saw the listCollections call in the log messages, but didn't see a + // listIndexes call. + checkLogForCollectionClonerMsg(secondary, "listCollections", dbname, true); + checkLogForCollectionClonerMsg(secondary, "listIndexes", "admin", false); + + for (let collectionObj of collectionsCursor.firstBatch) { + assert(collectionObj.info, collectionObj); + const collUUID = collectionObj.info.uuid; + + // For each collection, CollectionCloner will call listIndexes. + assert(!initialSyncTest.step()); + + // Make sure that we cannot read from this node yet. + assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}), + ErrorCodes.NotPrimaryOrSecondary); - // Perform large operations during collection cloning so that we will need multiple - // batches during oplog application. + // Make sure that we saw the listIndexes call in the log messages, but didn't + // see a listCollections call. + checkLogForCollectionClonerMsg(secondary, "listIndexes", dbname, true, collUUID); + checkLogForCollectionClonerMsg(secondary, "listCollections", "admin", false); + } + } + + // Perform large operations during collection cloning so that we will need multiple batches + // during oplog application. For simplicity, guarantee we will create only two batches during + // the oplog application phase of initial sync. 
+ const docsToInsertPerCollectionDuringOplogApplication = maxLargeStringsInBatch - 1; + const totalDocsInserted = 2 * docsToInsertPerCollectionDuringOplogApplication; + for (let count = 0; count < docsToInsertPerCollectionDuringOplogApplication; ++count) { assert.commandWorked(db.foo.insert({d: largeString})); assert.commandWorked(db.bar.insert({e: largeString})); } -} -// Check that we see the expected number of batches during oplog application. - -// This batch should correspond to the 'prepare' op. -assert(!initialSyncTest.step()); -checkLogForOplogApplicationMsg(secondary, 1); -assert(!initialSyncTest.step()); -checkLogForOplogApplicationMsg(secondary, 9); -assert(!initialSyncTest.step()); - -// To compensate for multiversion where the feature flag doesn't exist, we use this expression -// to evaluate as: -// -// * Parameter does not exist error (i.e: old version) -> false -// * Parameter exists and is false -> false -// * Parameter exists and is true -> true -const newConfigImageCollectionExists = - (db.adminCommand({getParameter: 1, featureFlagRetryableFindAndModify: 1}) - .featureFlagRetryableFindAndModify || - { - value: false - }).value; -if (newConfigImageCollectionExists) { - // This test writes two documents per collection in the system. The collection that - // `featureFlagRetryableFindAndModify` creates results in bumping the expected value of the - // third batch from 3 to 5. It's unclear why those show up in the third batch. - checkLogForOplogApplicationMsg(secondary, 5); -} else { - checkLogForOplogApplicationMsg(secondary, 3); -} + // Check that we see the expected number of batches during oplog application. -assert(initialSyncTest.step(), "Expected initial sync to have completed, but it did not"); - -// Abort transaction so that the data consistency checks in stop() can run. -assert.commandWorked(session.abortTransaction_forTesting()); - -// Issue a w:2 write to make sure the secondary has replicated the abortTransaction oplog entry. -assert.commandWorked(primary.getDB("otherDB").otherColl.insert({x: 1}, {writeConcern: {w: 2}})); - -// Confirm that node can be read from and that it has the inserts that were made while the node -// was in initial sync. -if (newConfigImageCollectionExists) { - assert.eq(secondary.getDB("test").foo.find().count(), 8); - assert.eq(secondary.getDB("test").bar.find().count(), 8); - assert.eq(secondary.getDB("test").foo.find().itcount(), 8); - assert.eq(secondary.getDB("test").bar.find().itcount(), 8); -} else { - assert.eq(secondary.getDB("test").foo.find().count(), 7); - assert.eq(secondary.getDB("test").bar.find().count(), 7); - assert.eq(secondary.getDB("test").foo.find().itcount(), 7); - assert.eq(secondary.getDB("test").bar.find().itcount(), 7); + // This batch should correspond to the 'prepare' op. + assert(!initialSyncTest.step()); + checkLogForOplogApplicationMsg(secondary, 1); + assert(!initialSyncTest.step()); + checkLogForOplogApplicationMsg(secondary, maxLargeStringsInBatch); + assert(!initialSyncTest.step()); + checkLogForOplogApplicationMsg(secondary, totalDocsInserted - maxLargeStringsInBatch); + + assert(initialSyncTest.step(), "Expected initial sync to have completed, but it did not"); + + // Abort transaction so that the data consistency checks in stop() can run. + assert.commandWorked(session.abortTransaction_forTesting()); + + // Issue a w:2 write to make sure the secondary has replicated the abortTransaction oplog entry. 
+ assert.commandWorked(primary.getDB("otherDB").otherColl.insert({x: 1}, {writeConcern: {w: 2}})); + + // Confirm that node can be read from and that it has the inserts that were made while the node + // was in initial sync. We inserted `docsToInsertPerCollectionDuringOplogApplication` + 1 + // additional document prior to the oplog application phase to each of `foo` and `bar`. + assert.eq(secondary.getDB("test").foo.find().count(), + docsToInsertPerCollectionDuringOplogApplication + 1); + assert.eq(secondary.getDB("test").bar.find().count(), + docsToInsertPerCollectionDuringOplogApplication + 1); + assert.eq(secondary.getDB("test").foo.find().itcount(), + docsToInsertPerCollectionDuringOplogApplication + 1); + assert.eq(secondary.getDB("test").bar.find().itcount(), + docsToInsertPerCollectionDuringOplogApplication + 1); + + // Do data consistency checks at the end. + initialSyncTest.stop(); +} catch (errorDuringTest) { + initialSyncTest.fail(); + throw errorDuringTest; } - -// Do data consistency checks at the end. -initialSyncTest.stop(); })(); diff --git a/jstests/replsets/libs/initial_sync_test.js b/jstests/replsets/libs/initial_sync_test.js index 26fee339490..235fa279b16 100644 --- a/jstests/replsets/libs/initial_sync_test.js +++ b/jstests/replsets/libs/initial_sync_test.js @@ -27,7 +27,9 @@ load('jstests/replsets/rslib.js'); * @param {Object} [replSet] the ReplSetTest instance to adopt * @param {int} [timeout] how long to wait for initial sync to start */ -function InitialSyncTest(name = "InitialSyncTest", replSet, timeout) { +function InitialSyncTest( + name = "InitialSyncTest", replSet, timeout, replBatchLimitBytes = 100 * 1024 * 1024) { + this.replBatchLimitBytes = replBatchLimitBytes; const State = { kBeforeInitialSync: "kBeforeInitialSync", kDuringInitialSync: "kDuringInitialSync", @@ -64,7 +66,8 @@ function InitialSyncTest(name = "InitialSyncTest", replSet, timeout) { let nodeOptions = {}; if (TestData.logComponentVerbosity) { nodeOptions["setParameter"] = { - "logComponentVerbosity": tojsononeline(TestData.logComponentVerbosity) + "logComponentVerbosity": tojsononeline(TestData.logComponentVerbosity), + "replBatchLimitBytes": replBatchLimitBytes, }; } @@ -148,6 +151,7 @@ function InitialSyncTest(name = "InitialSyncTest", replSet, timeout) { startClean: true, setParameter: { 'failpoint.initialSyncFuzzerSynchronizationPoint1': tojson({mode: 'alwaysOn'}), + 'replBatchLimitBytes': replBatchLimitBytes, }, }; @@ -270,6 +274,13 @@ function InitialSyncTest(name = "InitialSyncTest", replSet, timeout) { return secondary; }; + this.fail = function() { + assert.commandWorked(secondary.adminCommand( + {"configureFailPoint": 'initialSyncFuzzerSynchronizationPoint1', "mode": 'off'})); + assert.commandWorked(secondary.adminCommand( + {"configureFailPoint": 'initialSyncFuzzerSynchronizationPoint2', "mode": 'off'})); + }; + /** * Performs data consistency checks and then stops the replica set. Will fail if there is a * transaction that wasn't aborted or committed. -- cgit v1.2.1
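
A minimal sketch of the batch arithmetic the rewritten test relies on, assuming the fixture's default replBatchLimitBytes of 100 * 1024 * 1024 (the names mirror the patch; the snippet itself is only illustrative):

    const replBatchLimitBytes = 100 * 1024 * 1024;  // fixture default assumed here
    const maxLargeStringsInBatch = 9;
    // Ten strings of this size fill replBatchLimitBytes exactly, so the per-entry oplog
    // overhead leaves room for only nine of the large inserts per application batch.
    const largeStringSize = replBatchLimitBytes / (maxLargeStringsInBatch + 1);  // 10 MB
    const docsPerCollection = maxLargeStringsInBatch - 1;                        // 8 per collection
    const totalDocsInserted = 2 * docsPerCollection;                             // 16 across foo and bar
    // Expected oplog application batches, matching the checkLogForOplogApplicationMsg calls:
    //   batch 1: the 'prepare' entry                  -> 1 entry
    //   batch 2: as many large inserts as fit         -> maxLargeStringsInBatch = 9 entries
    //   batch 3: the remaining large inserts          -> 16 - 9 = 7 entries
    assert.eq(10 * 1024 * 1024, largeStringSize);
    assert.eq(7, totalDocsInserted - maxLargeStringsInBatch);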
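
For context, a hedged usage sketch of the extended fixture API: the fourth constructor argument and the fail() helper come from the library diff above, while the test name, the 16 MB limit, the bare step() loop, and the assumption that the fixture builds its own replica set when none is passed are illustrative, not taken from the patch:

    load("jstests/replsets/libs/initial_sync_test.js");

    // Illustrative value only: a small batch limit so oplog application needs several batches.
    const kBatchLimitBytes = 16 * 1024 * 1024;
    const initialSyncTest = new InitialSyncTest(
        "InitialSyncBatchLimitDemo", undefined /* replSet */, undefined /* timeout */, kBatchLimitBytes);

    try {
        // step() returns false while the syncing node is paused at a synchronization
        // point and true once initial sync has completed.
        while (!initialSyncTest.step()) {
        }
        initialSyncTest.stop();  // data consistency checks, then teardown
    } catch (e) {
        // Disable the synchronization-point failpoints so the paused secondary can make
        // progress and the test exits gracefully with the original error.
        initialSyncTest.fail();
        throw e;
    }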