diff options
author | Jack Mulrow <jack.mulrow@mongodb.com> | 2019-05-15 13:05:08 -0400 |
---|---|---|
committer | Jack Mulrow <jack.mulrow@mongodb.com> | 2019-05-20 11:21:01 -0400 |
commit | 90dd9f8983183cb3cb6f7b4a9dfc16ce972655fe (patch) | |
tree | 163abe804839d81bebe89936658b13fd1f753193 | |
parent | bc456b822cc243e63d2e539f11beba9538f11267 (diff) | |
download | mongo-90dd9f8983183cb3cb6f7b4a9dfc16ce972655fe.tar.gz |
SERVER-40982 Add counters for aborted migrations from lock timeouts
-rw-r--r-- | jstests/sharding/sharding_statistics_server_status.js | 103 | ||||
-rw-r--r-- | src/mongo/db/s/move_chunk_command.cpp | 4 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_statistics.cpp | 1 | ||||
-rw-r--r-- | src/mongo/db/s/sharding_statistics.h | 4 |
4 files changed, 110 insertions, 2 deletions
diff --git a/jstests/sharding/sharding_statistics_server_status.js b/jstests/sharding/sharding_statistics_server_status.js index 396dd6d1f25..bd54eeb51cd 100644 --- a/jstests/sharding/sharding_statistics_server_status.js +++ b/jstests/sharding/sharding_statistics_server_status.js @@ -5,6 +5,10 @@ (function() { 'use strict'; + + load("jstests/libs/chunk_manipulation_util.js"); + load("jstests/libs/parallelTester.js"); + function ShardStat() { this.countDonorMoveChunkStarted = 0; this.countRecipientMoveChunkStarted = 0; @@ -28,6 +32,7 @@ assert(statsFromServerStatus[i].totalCriticalSectionCommitTimeMillis); assert(statsFromServerStatus[i].totalCriticalSectionTimeMillis); assert(statsFromServerStatus[i].totalDonorChunkCloneTimeMillis); + assert(statsFromServerStatus[i].countDonorMoveChunkLockTimeout); assert.eq(stats[i].countDonorMoveChunkStarted, statsFromServerStatus[i].countDonorMoveChunkStarted); assert.eq(stats[i].countDocsClonedOnRecipient, @@ -40,10 +45,31 @@ statsFromServerStatus[i].countRecipientMoveChunkStarted); } } + + function checkServerStatusMigrationLockTimeoutCount(shardConn, count) { + const shardStats = + assert.commandWorked(shardConn.adminCommand({serverStatus: 1})).shardingStatistics; + assert(shardStats.hasOwnProperty("countDonorMoveChunkLockTimeout")); + assert.eq(count, shardStats.countDonorMoveChunkLockTimeout); + } + + function runConcurrentMoveChunk(host, ns, toShard) { + const mongos = new Mongo(host); + return mongos.adminCommand({moveChunk: ns, find: {_id: 1}, to: toShard}); + } + + function runConcurrentRead(host, dbName, collName) { + const mongos = new Mongo(host); + return mongos.getDB(dbName)[collName].find({_id: 5}).comment("concurrent read").itcount(); + } + + const dbName = "db"; + const collName = "coll"; + const st = new ShardingTest({shards: 2, mongos: 1}); const mongos = st.s0; const admin = mongos.getDB("admin"); - const coll = mongos.getCollection("db.coll"); + const coll = mongos.getCollection(dbName + "." + collName); const numDocsToInsert = 3; const shardArr = [st.shard0, st.shard1]; const stats = [new ShardStat(), new ShardStat()]; @@ -78,6 +104,79 @@ {moveChunk: coll + '', find: {_id: 1}, to: st.shard0.shardName, _waitForDelete: true})); incrementStatsAndCheckServerShardStats(stats[1], stats[0], numDocsInserted); - st.stop(); + // + // Tests for the count of migrations aborting from lock timeouts. + // + + // Lower migrationLockAcquisitionMaxWaitMS so migrations time out more quickly. + const donorConn = st.rs0.getPrimary(); + const lockParameterRes = assert.commandWorked( + donorConn.adminCommand({getParameter: 1, migrationLockAcquisitionMaxWaitMS: 1})); + const originalMigrationLockTimeout = lockParameterRes.migrationLockAcquisitionMaxWaitMS; + assert.commandWorked( + donorConn.adminCommand({setParameter: 1, migrationLockAcquisitionMaxWaitMS: 2 * 1000})); + + // Counter starts at 0. + checkServerStatusMigrationLockTimeoutCount(donorConn, 0); + // Pause a migration before entering the critical section. + pauseMoveChunkAtStep(donorConn, moveChunkStepNames.reachedSteadyState); + let moveChunkThread = new ScopedThread( + runConcurrentMoveChunk, st.s.host, dbName + "." + collName, st.shard1.shardName); + moveChunkThread.start(); + waitForMoveChunkStep(donorConn, moveChunkStepNames.reachedSteadyState); + + // Start a transaction and insert to the migrating chunk to block entering the critical section. + const session = mongos.startSession(); + session.startTransaction(); + session.getDatabase(dbName)[collName].insert({_id: 5}); + + // Unpause the migration and it should time out entering the critical section. + unpauseMoveChunkAtStep(donorConn, moveChunkStepNames.reachedSteadyState); + moveChunkThread.join(); + assert.commandFailedWithCode(moveChunkThread.returnData(), ErrorCodes.LockTimeout); + + // Clean up the transaction and verify the counter was incremented in serverStatus. + assert.commandWorked(session.abortTransaction_forTesting()); + + checkServerStatusMigrationLockTimeoutCount(donorConn, 1); + + // Writes are blocked during the critical section, so insert a document into the chunk to be + // moved before the migration begins that can be read later. + assert.commandWorked(st.s.getDB(dbName)[collName].insert({_id: 5})); + + // Pause a migration after entering the critical section, but before entering the commit phase. + pauseMoveChunkAtStep(donorConn, moveChunkStepNames.chunkDataCommitted); + moveChunkThread = new ScopedThread( + runConcurrentMoveChunk, st.s.host, dbName + "." + collName, st.shard1.shardName); + moveChunkThread.start(); + waitForMoveChunkStep(donorConn, moveChunkStepNames.chunkDataCommitted); + + // Pause a read while it's holding locks so the migration can't commit. + assert.commandWorked(donorConn.adminCommand( + {configureFailPoint: "waitInFindBeforeMakingBatch", mode: "alwaysOn"})); + const concurrentRead = new ScopedThread(runConcurrentRead, st.s.host, dbName, collName); + concurrentRead.start(); + assert.soon(function() { + const curOpResults = assert.commandWorked(donorConn.adminCommand({currentOp: 1})); + return curOpResults.inprog.some(op => op["command"]["comment"] === "concurrent read"); + }); + + // Unpause the migration and it should time out entering the commit phase. + unpauseMoveChunkAtStep(donorConn, moveChunkStepNames.chunkDataCommitted); + moveChunkThread.join(); + assert.commandFailedWithCode(moveChunkThread.returnData(), ErrorCodes.LockTimeout); + + // Let the read finish and verify the counter was incremented in serverStatus. + assert.commandWorked( + donorConn.adminCommand({configureFailPoint: "waitInFindBeforeMakingBatch", mode: "off"})); + concurrentRead.join(); + assert.eq(1, concurrentRead.returnData()); + + checkServerStatusMigrationLockTimeoutCount(donorConn, 2); + + assert.commandWorked(donorConn.adminCommand( + {setParameter: 1, migrationLockAcquisitionMaxWaitMS: originalMigrationLockTimeout})); + + st.stop(); })(); diff --git a/src/mongo/db/s/move_chunk_command.cpp b/src/mongo/db/s/move_chunk_command.cpp index 10ec850f4b7..dd62c984292 100644 --- a/src/mongo/db/s/move_chunk_command.cpp +++ b/src/mongo/db/s/move_chunk_command.cpp @@ -42,6 +42,7 @@ #include "mongo/db/s/migration_source_manager.h" #include "mongo/db/s/move_timing_helper.h" #include "mongo/db/s/sharding_state.h" +#include "mongo/db/s/sharding_statistics.h" #include "mongo/s/client/shard_registry.h" #include "mongo/s/grid.h" #include "mongo/s/request_types/migration_secondary_throttle_options.h" @@ -142,6 +143,9 @@ public: status = Status::OK(); } catch (const DBException& e) { status = e.toStatus(); + if (status.code() == ErrorCodes::LockTimeout) { + ShardingStatistics::get(opCtx).countDonorMoveChunkLockTimeout.addAndFetch(1); + } } catch (const std::exception& e) { scopedMigration.signalComplete( {ErrorCodes::InternalError, diff --git a/src/mongo/db/s/sharding_statistics.cpp b/src/mongo/db/s/sharding_statistics.cpp index 5589084357f..9ae2b0bc0ec 100644 --- a/src/mongo/db/s/sharding_statistics.cpp +++ b/src/mongo/db/s/sharding_statistics.cpp @@ -62,6 +62,7 @@ void ShardingStatistics::report(BSONObjBuilder* builder) const { builder->append("countDocsClonedOnDonor", countDocsClonedOnDonor.load()); builder->append("countRecipientMoveChunkStarted", countRecipientMoveChunkStarted.load()); builder->append("countDocsDeletedOnDonor", countDocsDeletedOnDonor.load()); + builder->append("countDonorMoveChunkLockTimeout", countDonorMoveChunkLockTimeout.load()); } } // namespace mongo diff --git a/src/mongo/db/s/sharding_statistics.h b/src/mongo/db/s/sharding_statistics.h index 3734f7a3042..aac4876ad92 100644 --- a/src/mongo/db/s/sharding_statistics.h +++ b/src/mongo/db/s/sharding_statistics.h @@ -87,6 +87,10 @@ struct ShardingStatistics { // from the donor to the recipient). AtomicWord<long long> totalCriticalSectionTimeMillis{0}; + // Cumulative, always-increasing counter of the number of migrations aborted on this node + // after timing out waiting to acquire a lock. + AtomicWord<long long> countDonorMoveChunkLockTimeout{0}; + /** * Obtains the per-process instance of the sharding statistics object. */ |