author     Daniel Gottlieb <daniel.gottlieb@mongodb.com>  2021-06-02 22:40:10 -0400
committer  Evergreen Agent <no-reply@evergreen.mongodb.com>  2021-06-03 03:11:10 +0000
commit     4d28f81e867485d0bef470bdd7f9e2b0344e6597 (patch)
tree       f0570db7c0d21e125e99707c9e4abea7b9996926
parent     1274f16682fd01d35c98db39b202d7acdf3e554b (diff)
download   mongo-4d28f81e867485d0bef470bdd7f9e2b0344e6597.tar.gz
SERVER-57352: Have initial_sync_test_fixture_test.js be more resilient to irrelevant changes.
-rw-r--r--  jstests/replsets/initial_sync_test_fixture_test.js  326
-rw-r--r--  jstests/replsets/libs/initial_sync_test.js            15
2 files changed, 174 insertions(+), 167 deletions(-)
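
The heart of the change is how the test sizes its large documents: instead of hard-coding 10 MiB strings and magic batch counts, the rewritten test derives the string size from the server's replBatchLimitBytes. A minimal sketch of that arithmetic, reusing the patch's own variable names (the per-entry oplog overhead is left implicit; the reasoning only assumes it is small and nonzero):

    // Batch arithmetic from the patch, using the server default of 100 MiB.
    const replBatchLimitBytes = 100 * 1024 * 1024;
    const maxLargeStringsInBatch = 9;

    // Size each string so that maxLargeStringsInBatch + 1 = 10 of them would
    // exactly fill a batch; per-entry oplog overhead then leaves room for 9.
    const largeStringSize = replBatchLimitBytes / (maxLargeStringsInBatch + 1);
    assert.eq(largeStringSize, 10 * 1024 * 1024);  // 10 MiB, the old hard-coded size

    // Each of two collections receives maxLargeStringsInBatch - 1 = 8 large
    // inserts, so oplog application sees 16 large entries: one full batch of
    // 9, then a second batch of 7.
    const docsToInsertPerCollectionDuringOplogApplication = maxLargeStringsInBatch - 1;
    const totalDocsInserted = 2 * docsToInsertPerCollectionDuringOplogApplication;
    assert.eq(totalDocsInserted - maxLargeStringsInBatch, 7);  // second batch size
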
diff --git a/jstests/replsets/initial_sync_test_fixture_test.js b/jstests/replsets/initial_sync_test_fixture_test.js
index b520edae668..400f0c9c028 100644
--- a/jstests/replsets/initial_sync_test_fixture_test.js
+++ b/jstests/replsets/initial_sync_test_fixture_test.js
@@ -102,196 +102,192 @@ rst.startSet();
rst.initiate();
const initialSyncTest = new InitialSyncTest("InitialSyncTest", rst);
-const primary = initialSyncTest.getPrimary();
-let secondary = initialSyncTest.getSecondary();
-const db = primary.getDB("test");
-const largeString = 'z'.repeat(10 * 1024 * 1024);
-
-assert.commandWorked(db.foo.insert({a: 1}));
-assert.commandWorked(db.bar.insert({b: 1}));
-
-// Prepare a transaction so that we know that step() can restart the secondary even if there is
-// a prepared transaction. The prepareTimestamp will be used as the beginFetchingTimestamp and
-// beginApplyingTimestamp during initial sync.
-const session = primary.startSession({causalConsistency: false});
-const sessionDB = session.getDatabase("test");
-const sessionColl = sessionDB.getCollection("foo");
-session.startTransaction();
-assert.commandWorked(sessionColl.insert({c: 1}));
-let prepareTimestamp = PrepareHelpers.prepareTransaction(session);
-
-// This step call restarts the secondary and causes it to go into initial sync. It will pause
-// initial sync after the node has fetched the defaultBeginFetchingTimestamp.
-assert(!initialSyncTest.step());
-
-secondary = initialSyncTest.getSecondary();
-secondary.setSecondaryOk();
-
-// Make sure that we cannot read from this node yet.
-assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}),
- ErrorCodes.NotPrimaryOrSecondary);
-
-// Make sure that we see that the node got the defaultBeginFetchingTimestamp, but hasn't gotten the
-// beginFetchingTimestamp yet.
-checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, true);
-checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, false);
-checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, false);
-
-// This step call will resume initial sync and pause it again after the node gets the
-// beginFetchingTimestamp from its sync source.
-assert(!initialSyncTest.step());
-
-// Make sure that we cannot read from this node yet.
-assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}),
- ErrorCodes.NotPrimaryOrSecondary);
-
-// Make sure that we see that the node got the beginFetchingTimestamp, but hasn't gotten the
-// beginApplyingTimestamp yet.
-checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, false);
-checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, true);
-checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, false);
-
-// This step call will resume initial sync and pause it again after the node gets the
-// beginApplyingTimestamp from its sync source.
-assert(!initialSyncTest.step());
-
-// Make sure that we cannot read from this node yet.
-assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}),
- ErrorCodes.NotPrimaryOrSecondary);
-
-// Make sure that we see that the node got the beginApplyingTimestamp, but that we don't see the
-// listDatabases call yet.
-checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, false);
-checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, false);
-checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, true);
-checkLogForCollectionClonerMsg(secondary, "listDatabases", "admin", false);
-
-// This step call will resume initial sync and pause it again after the node gets the
-// listDatabases result from its sync source.
-assert(!initialSyncTest.step());
-
-// Make sure that we cannot read from this node yet.
-assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}),
- ErrorCodes.NotPrimaryOrSecondary);
-
-// Make sure that we saw the listDatabases call in the log messages, but didn't see any
-// listCollections or listIndexes call.
-checkLogForCollectionClonerMsg(secondary, "listDatabases", "admin", true);
-checkLogForCollectionClonerMsg(secondary, "listCollections", "admin", false);
-checkLogForCollectionClonerMsg(secondary, "listIndexes", "admin", false);
-
-// Do same listDatabases command as CollectionCloner.
-const databases =
- assert.commandWorked(primary.adminCommand({listDatabases: 1, nameOnly: true})).databases;
-
-// Iterate over the databases and collections in the same order that the test fixture would so
-// that we can check the log messages to make sure initial sync is paused as expected.
-for (let dbObj of databases) {
- const dbname = dbObj.name;
-
- // We skip the local db during the collection cloning phase of initial sync.
- if (dbname === "local") {
- continue;
- }
+try {
+ // If the test fails, the initial syncing node may be left with an engaged failpoint that
+ // instructs it to hang. This `try` block guarantees we call `initialSyncTest.fail()`, which
+ // allows the test to exit gracefully with an error.
+
+ const primary = initialSyncTest.getPrimary();
+ let secondary = initialSyncTest.getSecondary();
+ const db = primary.getDB("test");
+ let maxLargeStringsInBatch = 9;
+ // Size each large string so that maxLargeStringsInBatch + 1 = 10 of them would exactly fill a
+ // batch; the small per-entry oplog overhead then means each batch holds only 9 oplog entries.
+ let largeStringSize = initialSyncTest.replBatchLimitBytes / (maxLargeStringsInBatch + 1);
+ const largeString = 'z'.repeat(largeStringSize);
+
+ assert.commandWorked(db.foo.insert({a: 1}));
+ assert.commandWorked(db.bar.insert({b: 1}));
+
+ // Prepare a transaction so that we know that step() can restart the secondary even if there is
+ // a prepared transaction. The prepareTimestamp will be used as the beginFetchingTimestamp and
+ // beginApplyingTimestamp during initial sync.
+ const session = primary.startSession({causalConsistency: false});
+ const sessionDB = session.getDatabase("test");
+ const sessionColl = sessionDB.getCollection("foo");
+ session.startTransaction();
+ assert.commandWorked(sessionColl.insert({c: 1}));
+ let prepareTimestamp = PrepareHelpers.prepareTransaction(session);
+
+ // This step call restarts the secondary and causes it to go into initial sync. It will pause
+ // initial sync after the node has fetched the defaultBeginFetchingTimestamp.
+ assert(!initialSyncTest.step());
+
+ secondary = initialSyncTest.getSecondary();
+ secondary.setSecondaryOk();
- const database = primary.getDB(dbname);
+ // Make sure that we cannot read from this node yet.
+ assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}),
+ ErrorCodes.NotPrimaryOrSecondary);
- // Do same listCollections command as CollectionCloner.
- const res = assert.commandWorked(database.runCommand(
- {listCollections: 1, filter: {$or: [{type: "collection"}, {type: {$exists: false}}]}}));
+ // Make sure that we see that the node got the defaultBeginFetchingTimestamp, but hasn't gotten
+ // the beginFetchingTimestamp yet.
+ checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, true);
+ checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, false);
+ checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, false);
- // Make sure that there is only one batch.
- assert.eq(NumberLong(0), res.cursor.id, res);
+ // This step call will resume initial sync and pause it again after the node gets the
+ // beginFetchingTimestamp from its sync source.
+ assert(!initialSyncTest.step());
- const collectionsCursor = res.cursor;
+ // Make sure that we cannot read from this node yet.
+ assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}),
+ ErrorCodes.NotPrimaryOrSecondary);
- // For each database, CollectionCloner will first call listCollections.
+ // Make sure that we see that the node got the beginFetchingTimestamp, but hasn't gotten the
+ // beginApplyingTimestamp yet.
+ checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, false);
+ checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, true);
+ checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, false);
+
+ // This step call will resume initial sync and pause it again after the node gets the
+ // beginApplyingTimestamp from its sync source.
assert(!initialSyncTest.step());
// Make sure that we cannot read from this node yet.
assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}),
ErrorCodes.NotPrimaryOrSecondary);
- // Make sure that we saw the listCollections call in the log messages, but didn't see a
- // listIndexes call.
- checkLogForCollectionClonerMsg(secondary, "listCollections", dbname, true);
+ // Make sure that we see that the node got the beginApplyingTimestamp, but that we don't see the
+ // listDatabases call yet.
+ checkLogForGetTimestampMsg(secondary, "defaultBeginFetchingTimestamp", prepareTimestamp, false);
+ checkLogForGetTimestampMsg(secondary, "beginFetchingTimestamp", prepareTimestamp, false);
+ checkLogForGetTimestampMsg(secondary, "beginApplyingTimestamp", prepareTimestamp, true);
+ checkLogForCollectionClonerMsg(secondary, "listDatabases", "admin", false);
+
+ // This step call will resume initial sync and pause it again after the node gets the
+ // listDatabases result from its sync source.
+ assert(!initialSyncTest.step());
+
+ // Make sure that we cannot read from this node yet.
+ assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}),
+ ErrorCodes.NotPrimaryOrSecondary);
+
+ // Make sure that we saw the listDatabases call in the log messages, but didn't see any
+ // listCollections or listIndexes call.
+ checkLogForCollectionClonerMsg(secondary, "listDatabases", "admin", true);
+ checkLogForCollectionClonerMsg(secondary, "listCollections", "admin", false);
checkLogForCollectionClonerMsg(secondary, "listIndexes", "admin", false);
- for (let collectionObj of collectionsCursor.firstBatch) {
- assert(collectionObj.info, collectionObj);
- const collUUID = collectionObj.info.uuid;
+ // Do same listDatabases command as CollectionCloner.
+ const databases =
+ assert.commandWorked(primary.adminCommand({listDatabases: 1, nameOnly: true})).databases;
+
+ // Iterate over the databases and collections in the same order that the test fixture would so
+ // that we can check the log messages to make sure initial sync is paused as expected.
+ for (let dbObj of databases) {
+ const dbname = dbObj.name;
- // For each collection, CollectionCloner will call listIndexes.
+ // We skip the local db during the collection cloning phase of initial sync.
+ if (dbname === "local") {
+ continue;
+ }
+
+ const database = primary.getDB(dbname);
+
+ // Do same listCollections command as CollectionCloner.
+ const res = assert.commandWorked(database.runCommand(
+ {listCollections: 1, filter: {$or: [{type: "collection"}, {type: {$exists: false}}]}}));
+
+ // Make sure that there is only one batch.
+ assert.eq(NumberLong(0), res.cursor.id, res);
+
+ const collectionsCursor = res.cursor;
+
+ // For each database, CollectionCloner will first call listCollections.
assert(!initialSyncTest.step());
// Make sure that we cannot read from this node yet.
assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}),
ErrorCodes.NotPrimaryOrSecondary);
- // Make sure that we saw the listIndexes call in the log messages, but didn't
- // see a listCollections call.
- checkLogForCollectionClonerMsg(secondary, "listIndexes", dbname, true, collUUID);
- checkLogForCollectionClonerMsg(secondary, "listCollections", "admin", false);
+ // Make sure that we saw the listCollections call in the log messages, but didn't see a
+ // listIndexes call.
+ checkLogForCollectionClonerMsg(secondary, "listCollections", dbname, true);
+ checkLogForCollectionClonerMsg(secondary, "listIndexes", "admin", false);
+
+ for (let collectionObj of collectionsCursor.firstBatch) {
+ assert(collectionObj.info, collectionObj);
+ const collUUID = collectionObj.info.uuid;
+
+ // For each collection, CollectionCloner will call listIndexes.
+ assert(!initialSyncTest.step());
+
+ // Make sure that we cannot read from this node yet.
+ assert.commandFailedWithCode(secondary.getDB("test").runCommand({count: "foo"}),
+ ErrorCodes.NotPrimaryOrSecondary);
- // Perform large operations during collection cloning so that we will need multiple
- // batches during oplog application.
+ // Make sure that we saw the listIndexes call in the log messages, but didn't
+ // see a listCollections call.
+ checkLogForCollectionClonerMsg(secondary, "listIndexes", dbname, true, collUUID);
+ checkLogForCollectionClonerMsg(secondary, "listCollections", "admin", false);
+ }
+ }
+
+ // Perform large operations during collection cloning so that we will need multiple batches
+ // during oplog application. For simplicity, guarantee we will create only two batches during
+ // the oplog application phase of initial sync.
+ const docsToInsertPerCollectionDuringOplogApplication = maxLargeStringsInBatch - 1;
+ const totalDocsInserted = 2 * docsToInsertPerCollectionDuringOplogApplication;
+ for (let count = 0; count < docsToInsertPerCollectionDuringOplogApplication; ++count) {
assert.commandWorked(db.foo.insert({d: largeString}));
assert.commandWorked(db.bar.insert({e: largeString}));
}
-}
-// Check that we see the expected number of batches during oplog application.
-
-// This batch should correspond to the 'prepare' op.
-assert(!initialSyncTest.step());
-checkLogForOplogApplicationMsg(secondary, 1);
-assert(!initialSyncTest.step());
-checkLogForOplogApplicationMsg(secondary, 9);
-assert(!initialSyncTest.step());
-
-// To compensate for multiversion where the feature flag doesn't exist, we use this expression
-// to evaluate as:
-//
-// * Parameter does not exist error (i.e: old version) -> false
-// * Parameter exists and is false -> false
-// * Parameter exists and is true -> true
-const newConfigImageCollectionExists =
- (db.adminCommand({getParameter: 1, featureFlagRetryableFindAndModify: 1})
- .featureFlagRetryableFindAndModify ||
- {
- value: false
- }).value;
-if (newConfigImageCollectionExists) {
- // This test writes two documents per collection in the system. The collection that
- // `featureFlagRetryableFindAndModify` creates results in bumping the expected value of the
- // third batch from 3 to 5. It's unclear why those show up in the third batch.
- checkLogForOplogApplicationMsg(secondary, 5);
-} else {
- checkLogForOplogApplicationMsg(secondary, 3);
-}
+ // Check that we see the expected number of batches during oplog application.
-assert(initialSyncTest.step(), "Expected initial sync to have completed, but it did not");
-
-// Abort transaction so that the data consistency checks in stop() can run.
-assert.commandWorked(session.abortTransaction_forTesting());
-
-// Issue a w:2 write to make sure the secondary has replicated the abortTransaction oplog entry.
-assert.commandWorked(primary.getDB("otherDB").otherColl.insert({x: 1}, {writeConcern: {w: 2}}));
-
-// Confirm that node can be read from and that it has the inserts that were made while the node
-// was in initial sync.
-if (newConfigImageCollectionExists) {
- assert.eq(secondary.getDB("test").foo.find().count(), 8);
- assert.eq(secondary.getDB("test").bar.find().count(), 8);
- assert.eq(secondary.getDB("test").foo.find().itcount(), 8);
- assert.eq(secondary.getDB("test").bar.find().itcount(), 8);
-} else {
- assert.eq(secondary.getDB("test").foo.find().count(), 7);
- assert.eq(secondary.getDB("test").bar.find().count(), 7);
- assert.eq(secondary.getDB("test").foo.find().itcount(), 7);
- assert.eq(secondary.getDB("test").bar.find().itcount(), 7);
+ // This batch should correspond to the 'prepare' op.
+ assert(!initialSyncTest.step());
+ checkLogForOplogApplicationMsg(secondary, 1);
+ assert(!initialSyncTest.step());
+ checkLogForOplogApplicationMsg(secondary, maxLargeStringsInBatch);
+ assert(!initialSyncTest.step());
+ checkLogForOplogApplicationMsg(secondary, totalDocsInserted - maxLargeStringsInBatch);
+
+ assert(initialSyncTest.step(), "Expected initial sync to have completed, but it did not");
+
+ // Abort transaction so that the data consistency checks in stop() can run.
+ assert.commandWorked(session.abortTransaction_forTesting());
+
+ // Issue a w:2 write to make sure the secondary has replicated the abortTransaction oplog entry.
+ assert.commandWorked(primary.getDB("otherDB").otherColl.insert({x: 1}, {writeConcern: {w: 2}}));
+
+ // Confirm that the node can be read from and that it has the inserts that were made while it
+ // was in initial sync: each of `foo` and `bar` holds the one document inserted before initial
+ // sync began plus `docsToInsertPerCollectionDuringOplogApplication` documents inserted during
+ // collection cloning.
+ assert.eq(secondary.getDB("test").foo.find().count(),
+ docsToInsertPerCollectionDuringOplogApplication + 1);
+ assert.eq(secondary.getDB("test").bar.find().count(),
+ docsToInsertPerCollectionDuringOplogApplication + 1);
+ assert.eq(secondary.getDB("test").foo.find().itcount(),
+ docsToInsertPerCollectionDuringOplogApplication + 1);
+ assert.eq(secondary.getDB("test").bar.find().itcount(),
+ docsToInsertPerCollectionDuringOplogApplication + 1);
+
+ // Do data consistency checks at the end.
+ initialSyncTest.stop();
+} catch (errorDuringTest) {
+ initialSyncTest.fail();
+ throw errorDuringTest;
}
-
-// Do data consistency checks at the end.
-initialSyncTest.stop();
})();
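
The rewritten test's control flow reduces to the pattern below (a condensed sketch with the per-phase assertions elided; step(), stop(), and fail() are the fixture methods exercised above):

    try {
        // step() resumes initial sync to its next pause point and returns
        // true only once initial sync has completed.
        while (!initialSyncTest.step()) {
            // ... per-phase log checks and read-availability assertions ...
        }
        initialSyncTest.stop();  // data consistency checks, then teardown
    } catch (errorDuringTest) {
        // Disable the hang failpoints so the secondary can make progress,
        // then surface the original failure.
        initialSyncTest.fail();
        throw errorDuringTest;
    }
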
diff --git a/jstests/replsets/libs/initial_sync_test.js b/jstests/replsets/libs/initial_sync_test.js
index 26fee339490..235fa279b16 100644
--- a/jstests/replsets/libs/initial_sync_test.js
+++ b/jstests/replsets/libs/initial_sync_test.js
@@ -27,7 +27,9 @@ load('jstests/replsets/rslib.js');
* @param {Object} [replSet] the ReplSetTest instance to adopt
* @param {int} [timeout] how long to wait for initial sync to start
*/
-function InitialSyncTest(name = "InitialSyncTest", replSet, timeout) {
+function InitialSyncTest(
+ name = "InitialSyncTest", replSet, timeout, replBatchLimitBytes = 100 * 1024 * 1024) {
+ this.replBatchLimitBytes = replBatchLimitBytes;
const State = {
kBeforeInitialSync: "kBeforeInitialSync",
kDuringInitialSync: "kDuringInitialSync",
@@ -64,7 +66,8 @@ function InitialSyncTest(name = "InitialSyncTest", replSet, timeout) {
let nodeOptions = {};
if (TestData.logComponentVerbosity) {
nodeOptions["setParameter"] = {
- "logComponentVerbosity": tojsononeline(TestData.logComponentVerbosity)
+ "logComponentVerbosity": tojsononeline(TestData.logComponentVerbosity),
+ "replBatchLimitBytes": replBatchLimitBytes,
};
}
@@ -148,6 +151,7 @@ function InitialSyncTest(name = "InitialSyncTest", replSet, timeout) {
startClean: true,
setParameter: {
'failpoint.initialSyncFuzzerSynchronizationPoint1': tojson({mode: 'alwaysOn'}),
+ 'replBatchLimitBytes': replBatchLimitBytes,
},
};
@@ -270,6 +274,13 @@ function InitialSyncTest(name = "InitialSyncTest", replSet, timeout) {
return secondary;
};
+ this.fail = function() {
+ assert.commandWorked(secondary.adminCommand(
+ {"configureFailPoint": 'initialSyncFuzzerSynchronizationPoint1', "mode": 'off'}));
+ assert.commandWorked(secondary.adminCommand(
+ {"configureFailPoint": 'initialSyncFuzzerSynchronizationPoint2', "mode": 'off'}));
+ };
+
/**
* Performs data consistency checks and then stops the replica set. Will fail if there is a
* transaction that wasn't aborted or committed.
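
With the new fourth constructor parameter, a test can lower the batch limit to force multi-batch oplog application with far less data. A hypothetical invocation (both undefined arguments fall back to the optional defaults documented above; the 16 MiB value is illustrative, not from the patch):

    // Let the fixture build its own replica set (replSet is optional) and
    // shrink replBatchLimitBytes; the value flows into the setParameter
    // options shown in the hunks above.
    const initialSyncTest = new InitialSyncTest(
        "InitialSyncTest", undefined /* replSet */, undefined /* timeout */,
        16 * 1024 * 1024);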