diff options
author | Dianna Hohensee <dianna.hohensee@10gen.com> | 2016-05-24 10:55:23 -0400 |
---|---|---|
committer | Dianna Hohensee <dianna.hohensee@10gen.com> | 2016-06-21 15:22:36 -0400 |
commit | dfe69f949be0e1aa83efcaaea07cc0c4639677bf (patch) | |
tree | d7beb97f3db76157520adcfaa8893808bbdb991c | |
parent | 1daac14a2bc1cd67edf509075ca2b503bdeebb38 (diff) | |
download | mongo-dfe69f949be0e1aa83efcaaea07cc0c4639677bf.tar.gz |
SERVER-24031 Prevent cleanupOrphaned from starting if a migration is running.
3 files changed, 84 insertions, 45 deletions
diff --git a/jstests/sharding/cleanup_orphaned_cmd_during_movechunk.js b/jstests/sharding/cleanup_orphaned_cmd_during_movechunk.js index 4283abdae48..dc82ab033ed 100644 --- a/jstests/sharding/cleanup_orphaned_cmd_during_movechunk.js +++ b/jstests/sharding/cleanup_orphaned_cmd_during_movechunk.js @@ -1,7 +1,9 @@ // // Tests cleanupOrphaned concurrent with moveChunk. -// Inserts orphan documents to the donor and recipient shards during the moveChunk and -// verifies that cleanupOrphaned removes orphans. +// +// Inserts orphan documents on the donor and recipient shards during a moveChunk and verifies that +// cleanupOrphaned removes orphans on the recipient shard and fails on the donor shard -- +// cleanupOrphaned aborts if there is an active migration when it is called. // load('./jstests/libs/chunk_manipulation_util.js'); @@ -30,25 +32,25 @@ load('./jstests/libs/cleanup_orphaned_util.js'); assert.commandWorked(admin.runCommand( {moveChunk: ns, find: {_id: 20}, to: shards[1]._id, _waitForDelete: true})); - jsTest.log('Inserting 40 docs into shard 0....'); + jsTest.log('Inserting 20 docs into shard 0....'); for (var i = -20; i < 20; i += 2) coll.insert({_id: i}); assert.eq(null, coll.getDB().getLastError()); assert.eq(20, donorColl.count()); - jsTest.log('Inserting 25 docs into shard 1....'); + jsTest.log('Inserting 10 docs into shard 1....'); for (i = 20; i < 40; i += 2) coll.insert({_id: i}); assert.eq(null, coll.getDB().getLastError()); assert.eq(10, recipientColl.count()); // - // Start a moveChunk in the background. Move chunk [0, 20), which has 10 docs, - // from shard 0 to shard 1. Pause it at some points in the donor's and - // recipient's work flows, and test cleanupOrphaned on shard 0 and shard 1. + // Start a moveChunk in the background. Move chunk [0, 20), which has 10 docs, from shard 0 to + // shard 1. Pause it at some points in the donor and recipient's work flows, and test + // cleanupOrphaned on shard 0 and shard 1. // - jsTest.log('setting failpoint startedMoveChunk'); + jsTest.log('setting failpoint startedMoveChunk (donor) and cloned (recipient)'); pauseMoveChunkAtStep(donor, moveChunkStepNames.startedMoveChunk); pauseMigrateAtStep(recipient, migrateStepNames.cloned); var joinMoveChunk = moveChunkParallel( @@ -75,42 +77,49 @@ load('./jstests/libs/cleanup_orphaned_util.js'); assert.eq(null, recipientColl.getDB().getLastError()); assert.eq(21, recipientColl.count()); - cleanupOrphaned(donor, ns, 2); - assert.eq(20, donorColl.count()); + // Cannot clean donor shard -- active migration --, but can clean recipient. + assert.throws(function() { + cleanupOrphaned(donor, ns, 2); + }); + assert.eq(21, donorColl.count()); cleanupOrphaned(recipient, ns, 2); assert.eq(20, recipientColl.count()); jsTest.log('Inserting document on donor side'); - // Inserted a new document (not an orphan) with id 19, which belongs in the - // [0, 20) chunk. + // Inserted a new document (not an orphan) with id 19, which belongs in the migrating [0, 20) + // chunk. donorColl.insert({_id: 19}); assert.eq(null, coll.getDB().getLastError()); - assert.eq(21, donorColl.count()); + assert.eq(22, donorColl.count()); // Recipient transfers this modification. jsTest.log('Let migrate proceed to transferredMods'); - pauseMigrateAtStep(recipient, migrateStepNames.transferredMods); - unpauseMigrateAtStep(recipient, migrateStepNames.cloned); - waitForMigrateStep(recipient, migrateStepNames.transferredMods); + proceedToMigrateStep(recipient, migrateStepNames.transferredMods); jsTest.log('Done letting migrate proceed to transferredMods'); assert.eq(21, recipientColl.count(), "Recipient didn't transfer inserted document."); - cleanupOrphaned(donor, ns, 2); - assert.eq(21, donorColl.count()); + // Still cannot clean donor -- active migration. Clean up runs on recipient, but the pending + // range for the migration is safe from cleanupOrphaned. + assert.throws(function() { + cleanupOrphaned(donor, ns, 2); + }); + assert.eq(22, donorColl.count()); cleanupOrphaned(recipient, ns, 2); assert.eq(21, recipientColl.count()); // Create orphans. - donorColl.insert([{_id: 26}]); + donorColl.insert([{_id: 27}]); assert.eq(null, donorColl.getDB().getLastError()); - assert.eq(22, donorColl.count()); + assert.eq(23, donorColl.count()); recipientColl.insert([{_id: -1}]); assert.eq(null, recipientColl.getDB().getLastError()); assert.eq(22, recipientColl.count()); - cleanupOrphaned(donor, ns, 2); - assert.eq(21, donorColl.count()); + assert.throws(function() { + cleanupOrphaned(donor, ns, 2); + }); + assert.eq(23, donorColl.count()); cleanupOrphaned(recipient, ns, 2); assert.eq(21, recipientColl.count()); @@ -121,26 +130,38 @@ load('./jstests/libs/cleanup_orphaned_util.js'); proceedToMigrateStep(recipient, migrateStepNames.done); // Create orphans. - donorColl.insert([{_id: 26}]); + donorColl.insert([{_id: 28}]); assert.eq(null, donorColl.getDB().getLastError()); - assert.eq(22, donorColl.count()); + assert.eq(24, donorColl.count()); recipientColl.insert([{_id: -1}]); assert.eq(null, recipientColl.getDB().getLastError()); assert.eq(22, recipientColl.count()); - // cleanupOrphaned should still fail on donor, but should work on the recipient - cleanupOrphaned(donor, ns, 2); - assert.eq(10, donorColl.count()); + assert.throws(function() { + cleanupOrphaned(donor, ns, 2); + }); + assert.eq(24, donorColl.count()); cleanupOrphaned(recipient, ns, 2); assert.eq(21, recipientColl.count()); - // Let migration thread complete. + // Let recipient side of the migration finish so that the donor can proceed with the commit and + // complete the migration. unpauseMigrateAtStep(recipient, migrateStepNames.done); + waitForMoveChunkStep(donor, moveChunkStepNames.committed); + + // Donor is paused after the migration chunk commit, but before it finishes the cleanup that + // includes running the range deleter. Thus it technically has orphaned data -- migration is + // complete, but moved data is still present. cleanupOrphaned can remove the data the donor + // would otherwise clean up itself in its post-migration delete phase. + cleanupOrphaned(donor, ns, 2); + assert.eq(10, donorColl.count()); + + // Let the donor migration finish. unpauseMoveChunkAtStep(donor, moveChunkStepNames.committed); joinMoveChunk(); - // Donor has finished post-move delete. - cleanupOrphaned(donor, ns, 2); // this is necessary for the count to not be 11 + // Donor has finished post-move delete, which had nothing to remove with the range deleter + // because of the preemptive cleanupOrphaned call. assert.eq(10, donorColl.count()); assert.eq(21, recipientColl.count()); assert.eq(31, coll.count()); diff --git a/jstests/sharding/cleanup_orphaned_cmd_during_movechunk_hashed.js b/jstests/sharding/cleanup_orphaned_cmd_during_movechunk_hashed.js index ecbfc39ad65..a3051725825 100644 --- a/jstests/sharding/cleanup_orphaned_cmd_during_movechunk_hashed.js +++ b/jstests/sharding/cleanup_orphaned_cmd_during_movechunk_hashed.js @@ -53,9 +53,9 @@ load('./jstests/libs/cleanup_orphaned_util.js'); assert.eq(null, coll.getDB().getLastError()); // - // Start a moveChunk in the background from shard 0 to shard 1. Pause it at - // some points in the donor's and recipient's work flows, and test - // cleanupOrphaned. + // Start a moveChunk in the background from shard 0 to shard 1. Pause it at some points in the + // donor's and recipient's work flows and test cleanupOrphaned: it should fail on the donor + // while the migration is active. // var donor, recip; @@ -67,9 +67,9 @@ load('./jstests/libs/cleanup_orphaned_util.js'); donor = st.shard1; } - jsTest.log('setting failpoint startedMoveChunk'); + jsTest.log('setting failpoint startedMoveChunk (donor) and cloned (recipient)'); pauseMoveChunkAtStep(donor, moveChunkStepNames.startedMoveChunk); - pauseMigrateAtStep(recip, migrateStepNames.cloned); + pauseMigrateAtStep(recip, migrateStepNames.transferredMods); var joinMoveChunk = moveChunkParallel(staticMongod, st.s0.host, @@ -79,8 +79,7 @@ load('./jstests/libs/cleanup_orphaned_util.js'); recip.shardName); waitForMoveChunkStep(donor, moveChunkStepNames.startedMoveChunk); - waitForMigrateStep(recip, migrateStepNames.cloned); - proceedToMigrateStep(recip, migrateStepNames.transferredMods); + waitForMigrateStep(recip, migrateStepNames.transferredMods); // recipient has run _recvChunkStart and begun its migration thread; // 'doc' has been cloned and chunkWithDoc is noted as 'pending' on recipient. @@ -89,9 +88,11 @@ load('./jstests/libs/cleanup_orphaned_util.js'); assert.eq(1, donorColl.count()); assert.eq(1, recipColl.count()); - // cleanupOrphaned should go through two iterations, since the default chunk - // setup leaves two unowned ranges on each shard. - cleanupOrphaned(donor, ns, 2); + // cleanupOrphaned should go through two iterations, since the default chunk setup leaves two + // unowned ranges on each shard. Command fails on donor shard because of active migration. + assert.throws(function() { + cleanupOrphaned(donor, ns, 2); + }); cleanupOrphaned(recip, ns, 2); assert.eq(1, donorColl.count()); assert.eq(1, recipColl.count()); @@ -102,15 +103,26 @@ load('./jstests/libs/cleanup_orphaned_util.js'); unpauseMigrateAtStep(recip, migrateStepNames.transferredMods); proceedToMigrateStep(recip, migrateStepNames.done); - // cleanupOrphaned removes migrated data from donor. The donor would - // otherwise clean them up itself, in the post-move delete phase. - cleanupOrphaned(donor, ns, 2); - assert.eq(0, donorColl.count()); + // Donor cannot clean up orphans while there's an active migration. + assert.throws(function() { + cleanupOrphaned(donor, ns, 2); + }); + assert.eq(1, donorColl.count()); cleanupOrphaned(recip, ns, 2); assert.eq(1, recipColl.count()); - // Let migration thread complete. + // Let recip side of the migration finish so that the donor proceeds with the commit. unpauseMigrateAtStep(recip, migrateStepNames.done); + waitForMoveChunkStep(donor, moveChunkStepNames.committed); + + // Donor is paused after the migration chunk commit, but before it finishes the cleanup that + // includes running the range deleter. Thus it technically has orphaned data -- commit is + // complete, but moved data is still present. cleanupOrphaned can remove the data the donor + // would otherwise clean up itself in its post-move delete phase. + cleanupOrphaned(donor, ns, 2); + assert.eq(0, donorColl.count()); + + // Let migration thread complete. unpauseMoveChunkAtStep(donor, moveChunkStepNames.committed); joinMoveChunk(); diff --git a/src/mongo/db/commands/cleanup_orphaned_cmd.cpp b/src/mongo/db/commands/cleanup_orphaned_cmd.cpp index e3d36168a1f..11414984b2f 100644 --- a/src/mongo/db/commands/cleanup_orphaned_cmd.cpp +++ b/src/mongo/db/commands/cleanup_orphaned_cmd.cpp @@ -268,6 +268,12 @@ public: return false; } + if (shardingState->migrationSourceManager()->isActive()) { + errmsg = str::stream() << "Unable to cleanup orphaned documents because there is an " + << "active migration"; + return false; + } + ChunkVersion shardVersion; status = shardingState->refreshMetadataNow(txn, ns, &shardVersion); if (!status.isOK()) { |