summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVishnu Kaushik <vishnu.kaushik@mongodb.com>2021-03-18 18:38:23 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2021-03-19 00:43:32 +0000
commit8f965fa363779b60173fc9f4459a70d56355a2ea (patch)
treeb912a3c10515b67c1c7e54c466908d37589540fa
parentab3197b1f9e631713a208826244d2a1d82637d5c (diff)
downloadmongo-8f965fa363779b60173fc9f4459a70d56355a2ea.tar.gz
SERVER-54266 Add data size and estimated time stats to recipient currentOp
-rw-r--r--jstests/replsets/libs/tenant_migration_test.js5
-rw-r--r--jstests/replsets/tenant_migration_cloner_stats.js178
-rw-r--r--jstests/replsets/tenant_migration_recipient_current_op.js126
-rw-r--r--src/mongo/db/repl/tenant_all_database_cloner.cpp34
-rw-r--r--src/mongo/db/repl/tenant_all_database_cloner.h9
-rw-r--r--src/mongo/db/repl/tenant_collection_cloner.cpp19
-rw-r--r--src/mongo/db/repl/tenant_collection_cloner.h4
-rw-r--r--src/mongo/db/repl/tenant_collection_cloner_test.cpp3
-rw-r--r--src/mongo/db/repl/tenant_database_cloner.cpp4
-rw-r--r--src/mongo/db/repl/tenant_database_cloner.h2
-rw-r--r--src/mongo/db/repl/tenant_database_cloner_test.cpp1
-rw-r--r--src/mongo/db/repl/tenant_migration_recipient_service.cpp17
12 files changed, 349 insertions, 53 deletions
diff --git a/jstests/replsets/libs/tenant_migration_test.js b/jstests/replsets/libs/tenant_migration_test.js
index 2ceb9d892b2..35338ec6002 100644
--- a/jstests/replsets/libs/tenant_migration_test.js
+++ b/jstests/replsets/libs/tenant_migration_test.js
@@ -125,7 +125,10 @@ function TenantMigrationTest({
privileges: [
{resource: {cluster: true}, actions: ["listDatabases", "useUUID"]},
{resource: {db: "", collection: ""}, actions: ["listCollections"]},
- {resource: {anyResource: true}, actions: ["collStats", "find", "listIndexes"]}
+ {
+ resource: {anyResource: true},
+ actions: ["dbStats", "collStats", "find", "listIndexes"]
+ }
],
roles: []
}));
diff --git a/jstests/replsets/tenant_migration_cloner_stats.js b/jstests/replsets/tenant_migration_cloner_stats.js
new file mode 100644
index 00000000000..10f9100fa7c
--- /dev/null
+++ b/jstests/replsets/tenant_migration_cloner_stats.js
@@ -0,0 +1,178 @@
+/**
+ * Tests cloner stats such as 'approxTotalDataSize', 'approxTotalBytesCopied' across multiple
+ * databases and collections in the absence of failovers.
+ *
+ * @tags: [requires_fcv_49, requires_majority_read_concern, requires_persistence,
+ * incompatible_with_eft, incompatible_with_windows_tls]
+ */
+
+(function() {
+"use strict";
+load("jstests/libs/uuid_util.js"); // For extractUUIDFromObject().
+load("jstests/libs/fail_point_util.js"); // For configureFailPoint().
+load("jstests/replsets/libs/tenant_migration_test.js");
+load("jstests/replsets/libs/tenant_migration_util.js");
+
+// Limit the batch size to test the stat in between batches.
+const tenantMigrationTest = new TenantMigrationTest(
+ {name: jsTestName(), sharedOptions: {setParameter: {collectionClonerBatchSize: 10}}});
+
+if (!tenantMigrationTest.isFeatureFlagEnabled()) {
+ jsTestLog("Skipping test because the tenant migrations feature flag is disabled");
+ return;
+}
+
+const kMigrationId = UUID();
+const kTenantId = 'testTenantId';
+const kReadPreference = {
+ mode: "primary"
+};
+const migrationOpts = {
+ migrationIdString: extractUUIDFromObject(kMigrationId),
+ tenantId: kTenantId,
+ readPreference: kReadPreference
+};
+
+const dbName = tenantMigrationTest.tenantDB(kTenantId, "testDB");
+const collName = "testColl";
+
+const dbName1 = dbName + '_db_1';
+const dbName2 = dbName + '_db_2';
+const collName1 = collName + "_coll_1";
+const collName2 = collName + "_coll_2";
+const collNameDb2 = collName + "_only_coll";
+
+const dataForEachCollection = [...Array(100).keys()].map((i) => ({a: i, b: 'A very long string.'}));
+tenantMigrationTest.insertDonorDB(dbName1, collName1, dataForEachCollection);
+tenantMigrationTest.insertDonorDB(dbName1, collName2, dataForEachCollection);
+tenantMigrationTest.insertDonorDB(dbName2, collNameDb2, dataForEachCollection);
+
+jsTestLog("Set up fail points on recipient.");
+const recipientPrimary = tenantMigrationTest.getRecipientPrimary();
+const fpAfterPersistingStateDoc =
+ configureFailPoint(recipientPrimary,
+ "fpAfterPersistingTenantMigrationRecipientInstanceStateDoc",
+ {action: "hang"});
+const fpAfterCreateFirstCollection = configureFailPoint(
+ recipientPrimary, "tenantCollectionClonerHangAfterCreateCollection", {action: "hang"});
+
+const donorPrimary = tenantMigrationTest.getDonorPrimary();
+const donorDB1 = donorPrimary.getDB(dbName1);
+
+const db1Size = assert.commandWorked(donorDB1.runCommand({dbStats: 1})).dataSize;
+const db2Size = assert.commandWorked(donorPrimary.getDB(dbName2).runCommand({dbStats: 1})).dataSize;
+
+const db1Collection1Size = assert.commandWorked(donorDB1.runCommand({collStats: collName1})).size;
+const db1Collection2Size = assert.commandWorked(donorDB1.runCommand({collStats: collName2})).size;
+
+const donorStats = {
+ db1Size,
+ db2Size,
+ db1Collection1Size,
+ db1Collection2Size
+};
+
+jsTestLog("Collected the following stats on the donor: " + tojson(donorStats));
+
+jsTestLog("Starting tenant migration with migrationId: " + kMigrationId +
+ ", tenantId: " + kTenantId);
+assert.commandWorked(tenantMigrationTest.startMigration(migrationOpts));
+
+// In this case, we do not expect the stats to exist yet, as the cloner has not been started.
+jsTestLog("Waiting until the state doc has been persisted.");
+fpAfterPersistingStateDoc.wait();
+let res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+let currOp = res.inprog[0];
+assert(!currOp.hasOwnProperty("approxTotalDataSize"), res);
+assert(!currOp.hasOwnProperty("approxTotalBytesCopied"), res);
+assert(!currOp.hasOwnProperty("totalReceiveElapsedMillis"), res);
+assert(!currOp.hasOwnProperty("remainingReceiveEstimatedMillis"), res);
+fpAfterPersistingStateDoc.off();
+
+// At this point, the total data size stat will have been obtained. However, nothing has been
+// copied yet.
+jsTestLog("Wait until the cloner has created the first collection");
+fpAfterCreateFirstCollection.wait();
+res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+currOp = res.inprog[0];
+assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res);
+assert.eq(currOp.approxTotalBytesCopied, 0, res);
+assert.gt(currOp.totalReceiveElapsedMillis, 0, res);
+assert.gt(currOp.remainingReceiveEstimatedMillis, 0, res);
+
+// Before proceeding, set the failpoint to pause after cloning a single batch.
+jsTestLog("Setting failpoint to pause after cloning single batch.");
+const fpAfterFirstBatch = configureFailPoint(
+ recipientPrimary, "tenantMigrationHangCollectionClonerAfterHandlingBatchResponse");
+fpAfterCreateFirstCollection.off();
+
+// After copying one batch, the amount of data copied should be non-zero, but less than the size
+// of the collection.
+jsTestLog("Waiting for a single batch of documents to have been cloned.");
+fpAfterFirstBatch.wait();
+
+// Since documents are inserted on a separate thread, wait until the expected stats are seen. The
+// failpoint needs to be maintained so that the next batch isn't processed.
+assert.soon(() => {
+ res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+ currOp = res.inprog[0];
+
+ // Wait until one batch of documents has been copied.
+ return currOp.approxTotalBytesCopied > 0;
+}, res);
+
+assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res);
+assert.gt(currOp.approxTotalBytesCopied, 0, res);
+assert.lt(currOp.approxTotalBytesCopied, db1Collection1Size, res);
+assert.gt(currOp.totalReceiveElapsedMillis, 0, res);
+// At this point, most of the data is un-cloned.
+assert.gt(currOp.remainingReceiveEstimatedMillis, currOp.totalReceiveElapsedMillis, res);
+
+// Before proceeding, set fail point to pause at the next create collection boundary.
+const fpAfterCreateSecondCollection = configureFailPoint(
+ recipientPrimary, "tenantCollectionClonerHangAfterCreateCollection", {action: "hang"});
+fpAfterFirstBatch.off();
+
+// One collection should have been cloned completely. The stats should reflect this.
+jsTestLog("Waiting for the second collection to be created.");
+fpAfterCreateSecondCollection.wait();
+res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+currOp = res.inprog[0];
+assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res);
+assert.eq(currOp.approxTotalBytesCopied, db1Collection1Size, res);
+assert.gt(currOp.totalReceiveElapsedMillis, 0, res);
+assert.gt(currOp.remainingReceiveEstimatedMillis, currOp.totalReceiveElapsedMillis, res);
+let prevTotalElapsedMillis = currOp.totalReceiveElapsedMillis;
+const prevRemainingMillis = currOp.remainingReceiveEstimatedMillis;
+
+// Before proceeding, set fail point to pause before copying the second database.
+const fpBeforeCopyingSecondDB =
+ configureFailPoint(recipientPrimary, "tenantDatabaseClonerHangAfterGettingOperationTime");
+fpAfterCreateSecondCollection.off();
+
+jsTestLog("Wait until the second database is about to be cloned.");
+fpBeforeCopyingSecondDB.wait();
+res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+currOp = res.inprog[0];
+assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res);
+assert.eq(currOp.approxTotalBytesCopied, db1Size, res);
+assert.gt(currOp.totalReceiveElapsedMillis, prevTotalElapsedMillis, res);
+// We have copied most of the data.
+assert.lt(currOp.remainingReceiveEstimatedMillis, currOp.totalReceiveElapsedMillis, res);
+assert.lt(currOp.remainingReceiveEstimatedMillis, prevRemainingMillis);
+prevTotalElapsedMillis = currOp.totalReceiveElapsedMillis;
+fpBeforeCopyingSecondDB.off();
+
+// After the migration completes, the total bytes copied should be equal to the total data size.
+jsTestLog("Waiting for migration to complete.");
+assert.commandWorked(tenantMigrationTest.waitForMigrationToComplete(migrationOpts));
+res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
+currOp = res.inprog[0];
+assert.eq(currOp.approxTotalDataSize, db1Size + db2Size, res);
+assert.eq(currOp.approxTotalBytesCopied, db1Size + db2Size, res);
+assert.gt(currOp.totalReceiveElapsedMillis, prevTotalElapsedMillis, res);
+// We have finished cloning, therefore time remaining is zero.
+assert.eq(currOp.remainingReceiveEstimatedMillis, 0, res);
+
+tenantMigrationTest.stop();
+})();
diff --git a/jstests/replsets/tenant_migration_recipient_current_op.js b/jstests/replsets/tenant_migration_recipient_current_op.js
index 19933a21da1..1a9e31704f8 100644
--- a/jstests/replsets/tenant_migration_recipient_current_op.js
+++ b/jstests/replsets/tenant_migration_recipient_current_op.js
@@ -97,15 +97,19 @@ fpAfterPersistingStateDoc.wait();
let res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
checkStandardFieldsOK(res);
let currOp = res.inprog[0];
-assert.eq(currOp.state, migrationStates.kStarted, tojson(res));
-assert.eq(currOp.migrationCompleted, false, tojson(res));
-assert.eq(currOp.dataSyncCompleted, false, tojson(res));
-assert(!currOp.startFetchingDonorOpTime, tojson(res));
-assert(!currOp.startApplyingDonorOpTime, tojson(res));
-assert(!currOp.dataConsistentStopDonorOpTime, tojson(res));
-assert(!currOp.cloneFinishedRecipientOpTime, tojson(res));
-assert(!currOp.expireAt, tojson(res));
-assert(!currOp.donorSyncSource, tojson(res));
+assert.eq(currOp.state, migrationStates.kStarted, res);
+assert.eq(currOp.migrationCompleted, false, res);
+assert.eq(currOp.dataSyncCompleted, false, res);
+assert(!currOp.hasOwnProperty("startFetchingDonorOpTime"), res);
+assert(!currOp.hasOwnProperty("startApplyingDonorOpTime"), res);
+assert(!currOp.hasOwnProperty("dataConsistentStopDonorOpTime"), res);
+assert(!currOp.hasOwnProperty("cloneFinishedRecipientOpTime"), res);
+assert(!currOp.hasOwnProperty("expireAt"), res);
+assert(!currOp.hasOwnProperty("donorSyncSource"), res);
+assert(!currOp.hasOwnProperty("approxTotalDataSize"), res);
+assert(!currOp.hasOwnProperty("approxTotalBytesCopied"), res);
+assert(!currOp.hasOwnProperty("totalReceiveElapsedMillis"), res);
+assert(!currOp.hasOwnProperty("remainingReceiveEstimatedMillis"), res);
fpAfterPersistingStateDoc.off();
// Allow the migration to move to the point where the startFetchingDonorOpTime has been obtained.
@@ -115,17 +119,21 @@ fpAfterRetrievingStartOpTime.wait();
res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
checkStandardFieldsOK(res);
currOp = res.inprog[0];
-assert.eq(currOp.state, migrationStates.kStarted, tojson(res));
-assert.eq(currOp.migrationCompleted, false, tojson(res));
-assert.eq(currOp.dataSyncCompleted, false, tojson(res));
-assert(!currOp.dataConsistentStopDonorOpTime, tojson(res));
-assert(!currOp.cloneFinishedRecipientOpTime, tojson(res));
-assert(!currOp.expireAt, tojson(res));
assert.gt(new Date(), currOp.receiveStart, tojson(res));
+assert.eq(currOp.state, migrationStates.kStarted, res);
+assert.eq(currOp.migrationCompleted, false, res);
+assert.eq(currOp.dataSyncCompleted, false, res);
+assert(!currOp.hasOwnProperty("dataConsistentStopDonorOpTime"), res);
+assert(!currOp.hasOwnProperty("cloneFinishedRecipientOpTime"), res);
+assert(!currOp.hasOwnProperty("expireAt"), res);
+assert(!currOp.hasOwnProperty("approxTotalDataSize"), res);
+assert(!currOp.hasOwnProperty("approxTotalBytesCopied"), res);
+assert(!currOp.hasOwnProperty("totalReceiveElapsedMillis"), res);
+assert(!currOp.hasOwnProperty("remainingReceiveEstimatedMillis"), res);
// Must exist now.
-assert(currOp.startFetchingDonorOpTime, tojson(res));
-assert(currOp.startApplyingDonorOpTime, tojson(res));
-assert(currOp.donorSyncSource, tojson(res));
+assert(currOp.hasOwnProperty("startFetchingDonorOpTime"), res);
+assert(currOp.hasOwnProperty("startApplyingDonorOpTime"), res);
+assert(currOp.hasOwnProperty("donorSyncSource"), res);
fpAfterRetrievingStartOpTime.off();
// Wait until collection cloning is done, and cloneFinishedRecipientOpTime
@@ -136,16 +144,20 @@ fpAfterCollectionCloner.wait();
res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
checkStandardFieldsOK(res);
currOp = res.inprog[0];
-assert.eq(currOp.state, migrationStates.kStarted, tojson(res));
-assert.eq(currOp.migrationCompleted, false, tojson(res));
-assert.eq(currOp.dataSyncCompleted, false, tojson(res));
-assert(!currOp.expireAt, tojson(res));
+assert.eq(currOp.state, migrationStates.kStarted, res);
+assert.eq(currOp.migrationCompleted, false, res);
+assert.eq(currOp.dataSyncCompleted, false, res);
+assert(!currOp.hasOwnProperty("expireAt"), res);
// Must exist now.
-assert(currOp.startFetchingDonorOpTime, tojson(res));
-assert(currOp.startApplyingDonorOpTime, tojson(res));
-assert(currOp.donorSyncSource, tojson(res));
-assert(currOp.dataConsistentStopDonorOpTime, tojson(res));
-assert(currOp.cloneFinishedRecipientOpTime, tojson(res));
+assert(currOp.hasOwnProperty("startFetchingDonorOpTime"), res);
+assert(currOp.hasOwnProperty("startApplyingDonorOpTime"), res);
+assert(currOp.hasOwnProperty("donorSyncSource"), res);
+assert(currOp.hasOwnProperty("dataConsistentStopDonorOpTime"), res);
+assert(currOp.hasOwnProperty("cloneFinishedRecipientOpTime"), res);
+assert(currOp.hasOwnProperty("approxTotalDataSize"), res);
+assert(currOp.hasOwnProperty("approxTotalBytesCopied"), res);
+assert(currOp.hasOwnProperty("totalReceiveElapsedMillis"), res);
+assert(currOp.hasOwnProperty("remainingReceiveEstimatedMillis"), res);
fpAfterCollectionCloner.off();
// Wait for the "kConsistent" state to be reached.
@@ -156,14 +168,18 @@ res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient mi
checkStandardFieldsOK(res);
currOp = res.inprog[0];
// State should have changed.
-assert.eq(currOp.state, migrationStates.kConsistent, tojson(res));
-assert.eq(currOp.migrationCompleted, false, tojson(res));
-assert.eq(currOp.dataSyncCompleted, false, tojson(res));
-assert(currOp.startFetchingDonorOpTime, tojson(res));
-assert(currOp.startApplyingDonorOpTime, tojson(res));
-assert(currOp.dataConsistentStopDonorOpTime, tojson(res));
-assert(currOp.cloneFinishedRecipientOpTime, tojson(res));
-assert(!currOp.expireAt, tojson(res));
+assert.eq(currOp.state, migrationStates.kConsistent, res);
+assert.eq(currOp.migrationCompleted, false, res);
+assert.eq(currOp.dataSyncCompleted, false, res);
+assert(currOp.hasOwnProperty("startFetchingDonorOpTime"), res);
+assert(currOp.hasOwnProperty("startApplyingDonorOpTime"), res);
+assert(currOp.hasOwnProperty("dataConsistentStopDonorOpTime"), res);
+assert(currOp.hasOwnProperty("cloneFinishedRecipientOpTime"), res);
+assert(currOp.hasOwnProperty("approxTotalDataSize"), res);
+assert(currOp.hasOwnProperty("approxTotalBytesCopied"), res);
+assert(currOp.hasOwnProperty("totalReceiveElapsedMillis"), res);
+assert(currOp.hasOwnProperty("remainingReceiveEstimatedMillis"), res);
+assert(!currOp.hasOwnProperty("expireAt"), res);
// The oplog applier should have applied at least the noop resume token.
assert.gte(currOp.numOpsApplied, 1, tojson(res));
fpAfterDataConsistent.off();
@@ -185,15 +201,19 @@ fpAfterForgetMigration.wait();
res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
checkStandardFieldsOK(res);
currOp = res.inprog[0];
-assert.eq(currOp.state, migrationStates.kConsistent, tojson(res));
-assert.eq(currOp.migrationCompleted, false, tojson(res));
+assert.eq(currOp.state, migrationStates.kConsistent, res);
+assert.eq(currOp.migrationCompleted, false, res);
// dataSyncCompleted should have changed.
-assert.eq(currOp.dataSyncCompleted, true, tojson(res));
-assert(currOp.startFetchingDonorOpTime, tojson(res));
-assert(currOp.startApplyingDonorOpTime, tojson(res));
-assert(currOp.dataConsistentStopDonorOpTime, tojson(res));
-assert(currOp.cloneFinishedRecipientOpTime, tojson(res));
-assert(!currOp.expireAt, tojson(res));
+assert.eq(currOp.dataSyncCompleted, true, res);
+assert(currOp.hasOwnProperty("startFetchingDonorOpTime"), res);
+assert(currOp.hasOwnProperty("startApplyingDonorOpTime"), res);
+assert(currOp.hasOwnProperty("dataConsistentStopDonorOpTime"), res);
+assert(currOp.hasOwnProperty("cloneFinishedRecipientOpTime"), res);
+assert(currOp.hasOwnProperty("approxTotalDataSize"), res);
+assert(currOp.hasOwnProperty("approxTotalBytesCopied"), res);
+assert(currOp.hasOwnProperty("totalReceiveElapsedMillis"), res);
+assert(currOp.hasOwnProperty("remainingReceiveEstimatedMillis"), res);
+assert(!currOp.hasOwnProperty("expireAt"), res);
jsTestLog("Allow the forgetMigration to complete.");
fpAfterForgetMigration.off();
@@ -202,15 +222,19 @@ assert.commandWorked(forgetMigrationThread.returnData());
res = recipientPrimary.adminCommand({currentOp: true, desc: "tenant recipient migration"});
checkStandardFieldsOK(res);
currOp = res.inprog[0];
-assert.eq(currOp.dataSyncCompleted, true, tojson(res));
-assert(currOp.startFetchingDonorOpTime, tojson(res));
-assert(currOp.startApplyingDonorOpTime, tojson(res));
-assert(currOp.dataConsistentStopDonorOpTime, tojson(res));
-assert(currOp.cloneFinishedRecipientOpTime, tojson(res));
+assert.eq(currOp.dataSyncCompleted, true, res);
+assert(currOp.hasOwnProperty("startFetchingDonorOpTime"), res);
+assert(currOp.hasOwnProperty("startApplyingDonorOpTime"), res);
+assert(currOp.hasOwnProperty("dataConsistentStopDonorOpTime"), res);
+assert(currOp.hasOwnProperty("cloneFinishedRecipientOpTime"), res);
+assert(currOp.hasOwnProperty("approxTotalDataSize"), res);
+assert(currOp.hasOwnProperty("approxTotalBytesCopied"), res);
+assert(currOp.hasOwnProperty("totalReceiveElapsedMillis"), res);
+assert(currOp.hasOwnProperty("remainingReceiveEstimatedMillis"), res);
// State, completion status and expireAt should have changed.
-assert.eq(currOp.state, migrationStates.kDone, tojson(res));
-assert.eq(currOp.migrationCompleted, true, tojson(res));
-assert(currOp.expireAt, tojson(res));
+assert.eq(currOp.state, migrationStates.kDone, res);
+assert.eq(currOp.migrationCompleted, true, res);
+assert(currOp.hasOwnProperty("expireAt"), res);
assert(currOp.hasOwnProperty("databases"));
assert.eq(0, currOp.databases.databasesClonedBeforeFailover, tojson(res));
diff --git a/src/mongo/db/repl/tenant_all_database_cloner.cpp b/src/mongo/db/repl/tenant_all_database_cloner.cpp
index dd0e1f3e113..2c32d3155de 100644
--- a/src/mongo/db/repl/tenant_all_database_cloner.cpp
+++ b/src/mongo/db/repl/tenant_all_database_cloner.cpp
@@ -65,6 +65,11 @@ BaseCloner::ClonerStages TenantAllDatabaseCloner::getStages() {
return {&_listDatabasesStage, &_listExistingDatabasesStage};
}
+void TenantAllDatabaseCloner::preStage() {
+ stdx::lock_guard lk(_mutex);
+ _stats.start = getSharedData()->getClock()->now();
+}
+
BaseCloner::AfterStageBehavior TenantAllDatabaseCloner::listDatabasesStage() {
// This will be set after a successful listDatabases command.
_operationTime = Timestamp();
@@ -126,7 +131,6 @@ BaseCloner::AfterStageBehavior TenantAllDatabaseCloner::listExistingDatabasesSta
auto opCtx = cc().makeOperationContext();
DBDirectClient client(opCtx.get());
- BSONObj res;
const BSONObj filter = ClonerUtils::makeTenantDatabaseFilter(_tenantId);
auto databasesArray = client.getDatabaseInfos(filter, true /* nameOnly */);
@@ -189,6 +193,26 @@ BaseCloner::AfterStageBehavior TenantAllDatabaseCloner::listExistingDatabasesSta
void TenantAllDatabaseCloner::postStage() {
{
+ // Finish calculating the size of the databases that were either partially cloned or
+ // completely un-cloned from a previous migration. Perform this before grabbing the _mutex,
+ // as commands are being sent over the network.
+ long long approxTotalDataSize = 0;
+ for (const auto& dbName : _databases) {
+ BSONObj res;
+ getClient()->runCommand(dbName, BSON("dbStats" << 1), res);
+ if (auto status = getStatusFromCommandResult(res); !status.isOK()) {
+ LOGV2_WARNING(5426600,
+ "Skipping recording of data size metrics for database due to failure "
+ "in the 'dbStats' command, tenant migration stats may be inaccurate.",
+ "db"_attr = dbName,
+ "migrationId"_attr = getSharedData()->getMigrationId(),
+ "tenantId"_attr = _tenantId,
+ "status"_attr = status);
+ } else {
+ approxTotalDataSize += res.getField("dataSize").safeNumberLong();
+ }
+ }
+
stdx::lock_guard<Latch> lk(_mutex);
_stats.databasesCloned = 0;
_stats.databasesToClone = _databases.size();
@@ -197,7 +221,9 @@ void TenantAllDatabaseCloner::postStage() {
_stats.databaseStats.emplace_back();
_stats.databaseStats.back().dbname = dbName;
}
+ _stats.approxTotalDataSize = approxTotalDataSize;
}
+
for (const auto& dbName : _databases) {
{
stdx::lock_guard<Latch> lk(_mutex);
@@ -231,6 +257,8 @@ void TenantAllDatabaseCloner::postStage() {
{
stdx::lock_guard<Latch> lk(_mutex);
_stats.databaseStats[_stats.databasesCloned] = _currentDatabaseCloner->getStats();
+ _stats.approxTotalBytesCopied +=
+ _stats.databaseStats[_stats.databasesCloned].approxTotalBytesCopied;
_currentDatabaseCloner = nullptr;
_stats.databasesCloned++;
}
@@ -242,6 +270,8 @@ TenantAllDatabaseCloner::Stats TenantAllDatabaseCloner::getStats() const {
TenantAllDatabaseCloner::Stats stats = _stats;
if (_currentDatabaseCloner) {
stats.databaseStats[_stats.databasesCloned] = _currentDatabaseCloner->getStats();
+ stats.approxTotalBytesCopied +=
+ stats.databaseStats[stats.databasesCloned].approxTotalBytesCopied;
}
return stats;
}
@@ -269,6 +299,8 @@ void TenantAllDatabaseCloner::Stats::append(BSONObjBuilder* builder) const {
static_cast<long long>(databasesClonedBeforeFailover));
builder->appendNumber("databasesToClone", static_cast<long long>(databasesToClone));
builder->appendNumber("databasesCloned", static_cast<long long>(databasesCloned));
+ builder->appendNumber("approxTotalDataSize", approxTotalDataSize);
+ builder->appendNumber("approxTotalBytesCopied", approxTotalBytesCopied);
for (auto&& db : databaseStats) {
BSONObjBuilder dbBuilder(builder->subobjStart(db.dbname));
db.append(&dbBuilder);
diff --git a/src/mongo/db/repl/tenant_all_database_cloner.h b/src/mongo/db/repl/tenant_all_database_cloner.h
index 86729055fd0..627f1507f00 100644
--- a/src/mongo/db/repl/tenant_all_database_cloner.h
+++ b/src/mongo/db/repl/tenant_all_database_cloner.h
@@ -46,6 +46,10 @@ public:
size_t databasesCloned{0};
size_t databasesClonedBeforeFailover{0};
std::vector<TenantDatabaseCloner::Stats> databaseStats;
+ Date_t start;
+
+ long long approxTotalDataSize{0};
+ long long approxTotalBytesCopied{0};
std::string toString() const;
BSONObj toBSON() const;
@@ -98,6 +102,11 @@ private:
AfterStageBehavior listExistingDatabasesStage();
/**
+ * The preStage sets the start time in _stats.
+ */
+ void preStage() final;
+
+ /**
*
* The postStage creates and runs the individual TenantDatabaseCloners on each database found on
* the sync source.
diff --git a/src/mongo/db/repl/tenant_collection_cloner.cpp b/src/mongo/db/repl/tenant_collection_cloner.cpp
index a1beb962892..d63cc3fe63b 100644
--- a/src/mongo/db/repl/tenant_collection_cloner.cpp
+++ b/src/mongo/db/repl/tenant_collection_cloner.cpp
@@ -176,10 +176,27 @@ BaseCloner::AfterStageBehavior TenantCollectionCloner::countStage() {
count = 0;
}
+ BSONObj res;
+ getClient()->runCommand(
+ _sourceNss.db().toString(), BSON("collStats" << _sourceNss.coll()), res);
+ auto status = getStatusFromCommandResult(res);
+ if (!status.isOK()) {
+ LOGV2_WARNING(5426601,
+ "Skipping recording of data size metrics for collection due to failure in the"
+ " 'collStats' command, tenant migration stats may be inaccurate.",
+ "nss"_attr = _sourceNss,
+ "migrationId"_attr = getSharedData()->getMigrationId(),
+ "tenantId"_attr = _tenantId,
+ "status"_attr = status);
+ }
+
_progressMeter.setTotalWhileRunning(static_cast<unsigned long long>(count));
{
stdx::lock_guard<Latch> lk(_mutex);
_stats.documentToCopy = count;
+ _stats.approxTotalDataSize = status.isOK() ? res.getField("size").safeNumberLong() : 0;
+ _stats.avgObjSize =
+ _stats.approxTotalDataSize ? res.getField("avgObjSize").safeNumberLong() : 0;
}
return kContinueNormally;
}
@@ -342,6 +359,7 @@ BaseCloner::AfterStageBehavior TenantCollectionCloner::createCollectionStage() {
{
stdx::lock_guard<Latch> lk(_mutex);
_stats.documentsCopied += count;
+ _stats.approxTotalBytesCopied = ((long)_stats.documentsCopied) * _stats.avgObjSize;
_progressMeter.hit(count);
}
} else {
@@ -508,6 +526,7 @@ void TenantCollectionCloner::insertDocumentsCallback(
}
_documentsToInsert.swap(docs);
_stats.documentsCopied += docs.size();
+ _stats.approxTotalBytesCopied = ((long)_stats.documentsCopied) * _stats.avgObjSize;
++_stats.insertedBatches;
_progressMeter.hit(int(docs.size()));
}
diff --git a/src/mongo/db/repl/tenant_collection_cloner.h b/src/mongo/db/repl/tenant_collection_cloner.h
index a0acf734429..4615670690b 100644
--- a/src/mongo/db/repl/tenant_collection_cloner.h
+++ b/src/mongo/db/repl/tenant_collection_cloner.h
@@ -55,6 +55,10 @@ public:
size_t indexes{0};
size_t insertedBatches{0};
size_t receivedBatches{0};
+ long long avgObjSize{0};
+ long long approxTotalDataSize{0};
+ long long approxTotalBytesCopied{0};
+
std::string toString() const;
BSONObj toBSON() const;
diff --git a/src/mongo/db/repl/tenant_collection_cloner_test.cpp b/src/mongo/db/repl/tenant_collection_cloner_test.cpp
index 8b2d79e25e9..d87bd5654c2 100644
--- a/src/mongo/db/repl/tenant_collection_cloner_test.cpp
+++ b/src/mongo/db/repl/tenant_collection_cloner_test.cpp
@@ -117,6 +117,9 @@ protected:
TenantClonerTestFixture::setUp();
_mockServer->assignCollectionUuid(_nss.ns(), _collUuid);
+ _mockServer->setCommandReply("dbStats", StatusWith<BSONObj>(BSON("dataSize" << 1)));
+ _mockServer->setCommandReply("collStats", BSON("size" << 1));
+
_mockClient->setOperationTime(_operationTime);
{
diff --git a/src/mongo/db/repl/tenant_database_cloner.cpp b/src/mongo/db/repl/tenant_database_cloner.cpp
index faeafb32516..7ee0594e672 100644
--- a/src/mongo/db/repl/tenant_database_cloner.cpp
+++ b/src/mongo/db/repl/tenant_database_cloner.cpp
@@ -302,6 +302,8 @@ void TenantDatabaseCloner::postStage() {
{
stdx::lock_guard<Latch> lk(_mutex);
_stats.collectionStats[_stats.clonedCollections] = _currentCollectionCloner->getStats();
+ _stats.approxTotalBytesCopied +=
+ _stats.collectionStats[_stats.clonedCollections].approxTotalBytesCopied;
_currentCollectionCloner = nullptr;
// Abort the tenant database cloner if the collection clone failed.
if (!collStatus.isOK())
@@ -318,6 +320,8 @@ TenantDatabaseCloner::Stats TenantDatabaseCloner::getStats() const {
TenantDatabaseCloner::Stats stats = _stats;
if (_currentCollectionCloner) {
stats.collectionStats[_stats.clonedCollections] = _currentCollectionCloner->getStats();
+ stats.approxTotalBytesCopied +=
+ stats.collectionStats[stats.clonedCollections].approxTotalBytesCopied;
}
return stats;
}
diff --git a/src/mongo/db/repl/tenant_database_cloner.h b/src/mongo/db/repl/tenant_database_cloner.h
index 94413fffefb..f0aaaca38d3 100644
--- a/src/mongo/db/repl/tenant_database_cloner.h
+++ b/src/mongo/db/repl/tenant_database_cloner.h
@@ -48,7 +48,9 @@ public:
size_t collections{0};
size_t clonedCollections{0};
size_t clonedCollectionsBeforeFailover{0};
+
std::vector<TenantCollectionCloner::Stats> collectionStats;
+ long long approxTotalBytesCopied{0};
std::string toString() const;
BSONObj toBSON() const;
diff --git a/src/mongo/db/repl/tenant_database_cloner_test.cpp b/src/mongo/db/repl/tenant_database_cloner_test.cpp
index 287a7904115..e72e393d6a7 100644
--- a/src/mongo/db/repl/tenant_database_cloner_test.cpp
+++ b/src/mongo/db/repl/tenant_database_cloner_test.cpp
@@ -77,6 +77,7 @@ protected:
return Status::OK();
};
_mockClient->setOperationTime(_operationTime);
+ _mockServer->setCommandReply("collStats", BSON("size" << 1));
}
std::unique_ptr<TenantDatabaseCloner> makeDatabaseCloner(
diff --git a/src/mongo/db/repl/tenant_migration_recipient_service.cpp b/src/mongo/db/repl/tenant_migration_recipient_service.cpp
index 5f4e7cc0f7c..5ccd84db32d 100644
--- a/src/mongo/db/repl/tenant_migration_recipient_service.cpp
+++ b/src/mongo/db/repl/tenant_migration_recipient_service.cpp
@@ -327,6 +327,23 @@ boost::optional<BSONObj> TenantMigrationRecipientService::Instance::reportForCur
_stateDoc.getNumRestartsDueToDonorConnectionFailure());
bob.append("numRestartsDueToRecipientFailure", _stateDoc.getNumRestartsDueToRecipientFailure());
+ if (_tenantAllDatabaseCloner) {
+ auto stats = _tenantAllDatabaseCloner->getStats();
+ bob.append("approxTotalDataSize", stats.approxTotalDataSize);
+ bob.append("approxTotalBytesCopied", stats.approxTotalBytesCopied);
+
+ long long elapsedMillis = duration_cast<Milliseconds>(Date_t::now() - stats.start).count();
+ bob.append("totalReceiveElapsedMillis", elapsedMillis);
+
+ // Perform the multiplication first to avoid rounding errors, and add one to avoid division
+ // by 0.
+ long long timeRemainingMillis =
+ ((stats.approxTotalDataSize - stats.approxTotalBytesCopied) * elapsedMillis) /
+ (stats.approxTotalBytesCopied + 1);
+
+ bob.append("remainingReceiveEstimatedMillis", timeRemainingMillis);
+ }
+
if (_stateDoc.getStartFetchingDonorOpTime())
bob.append("startFetchingDonorOpTime", _stateDoc.getStartFetchingDonorOpTime()->toBSON());
if (_stateDoc.getStartApplyingDonorOpTime())