author     Jack Mulrow <jack.mulrow@mongodb.com>   2019-05-15 13:05:08 -0400
committer  Jack Mulrow <jack.mulrow@mongodb.com>   2019-05-20 11:21:01 -0400
commit     90dd9f8983183cb3cb6f7b4a9dfc16ce972655fe
tree       163abe804839d81bebe89936658b13fd1f753193
parent     bc456b822cc243e63d2e539f11beba9538f11267
download   mongo-90dd9f8983183cb3cb6f7b4a9dfc16ce972655fe.tar.gz
SERVER-40982 Add counters for aborted migrations from lock timeouts
 jstests/sharding/sharding_statistics_server_status.js | 103
 src/mongo/db/s/move_chunk_command.cpp                 |   4
 src/mongo/db/s/sharding_statistics.cpp                |   1
 src/mongo/db/s/sharding_statistics.h                  |   4
 4 files changed, 110 insertions(+), 2 deletions(-)
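
The user-visible effect of the change is a new counter, countDonorMoveChunkLockTimeout, in the shardingStatistics section of serverStatus on donor shards. A minimal shell sketch of reading it (the host name below is an illustrative assumption):

    // Read the new counter from a shard primary; the host is a placeholder.
    const shard = new Mongo("shard0-primary.example.net:27018");
    const shardingStats =
        assert.commandWorked(shard.adminCommand({serverStatus: 1})).shardingStatistics;
    // Counts migrations this donor aborted after timing out waiting to acquire a lock.
    print("countDonorMoveChunkLockTimeout: " + shardingStats.countDonorMoveChunkLockTimeout);
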
diff --git a/jstests/sharding/sharding_statistics_server_status.js b/jstests/sharding/sharding_statistics_server_status.js
index 396dd6d1f25..bd54eeb51cd 100644
--- a/jstests/sharding/sharding_statistics_server_status.js
+++ b/jstests/sharding/sharding_statistics_server_status.js
@@ -5,6 +5,10 @@
(function() {
'use strict';
+
+ load("jstests/libs/chunk_manipulation_util.js");
+ load("jstests/libs/parallelTester.js");
+
function ShardStat() {
this.countDonorMoveChunkStarted = 0;
this.countRecipientMoveChunkStarted = 0;
@@ -28,6 +32,7 @@
assert(statsFromServerStatus[i].totalCriticalSectionCommitTimeMillis);
assert(statsFromServerStatus[i].totalCriticalSectionTimeMillis);
assert(statsFromServerStatus[i].totalDonorChunkCloneTimeMillis);
+ assert(statsFromServerStatus[i].countDonorMoveChunkLockTimeout);
assert.eq(stats[i].countDonorMoveChunkStarted,
statsFromServerStatus[i].countDonorMoveChunkStarted);
assert.eq(stats[i].countDocsClonedOnRecipient,
@@ -40,10 +45,31 @@
statsFromServerStatus[i].countRecipientMoveChunkStarted);
}
}
+
+ function checkServerStatusMigrationLockTimeoutCount(shardConn, count) {
+ const shardStats =
+ assert.commandWorked(shardConn.adminCommand({serverStatus: 1})).shardingStatistics;
+ assert(shardStats.hasOwnProperty("countDonorMoveChunkLockTimeout"));
+ assert.eq(count, shardStats.countDonorMoveChunkLockTimeout);
+ }
+
+ function runConcurrentMoveChunk(host, ns, toShard) {
+ const mongos = new Mongo(host);
+ return mongos.adminCommand({moveChunk: ns, find: {_id: 1}, to: toShard});
+ }
+
+ function runConcurrentRead(host, dbName, collName) {
+ const mongos = new Mongo(host);
+ return mongos.getDB(dbName)[collName].find({_id: 5}).comment("concurrent read").itcount();
+ }
+
+ const dbName = "db";
+ const collName = "coll";
+
const st = new ShardingTest({shards: 2, mongos: 1});
const mongos = st.s0;
const admin = mongos.getDB("admin");
- const coll = mongos.getCollection("db.coll");
+ const coll = mongos.getCollection(dbName + "." + collName);
const numDocsToInsert = 3;
const shardArr = [st.shard0, st.shard1];
const stats = [new ShardStat(), new ShardStat()];
@@ -78,6 +104,79 @@
{moveChunk: coll + '', find: {_id: 1}, to: st.shard0.shardName, _waitForDelete: true}));
incrementStatsAndCheckServerShardStats(stats[1], stats[0], numDocsInserted);
- st.stop();
+ //
+ // Tests for the count of migrations aborting from lock timeouts.
+ //
+
+ // Lower migrationLockAcquisitionMaxWaitMS so migrations time out more quickly.
+ const donorConn = st.rs0.getPrimary();
+ const lockParameterRes = assert.commandWorked(
+ donorConn.adminCommand({getParameter: 1, migrationLockAcquisitionMaxWaitMS: 1}));
+ const originalMigrationLockTimeout = lockParameterRes.migrationLockAcquisitionMaxWaitMS;
+ assert.commandWorked(
+ donorConn.adminCommand({setParameter: 1, migrationLockAcquisitionMaxWaitMS: 2 * 1000}));
+
+ // Counter starts at 0.
+ checkServerStatusMigrationLockTimeoutCount(donorConn, 0);
+ // Pause a migration before entering the critical section.
+ pauseMoveChunkAtStep(donorConn, moveChunkStepNames.reachedSteadyState);
+ let moveChunkThread = new ScopedThread(
+ runConcurrentMoveChunk, st.s.host, dbName + "." + collName, st.shard1.shardName);
+ moveChunkThread.start();
+ waitForMoveChunkStep(donorConn, moveChunkStepNames.reachedSteadyState);
+
+ // Start a transaction and insert into the migrating chunk so the migration cannot enter the critical section.
+ const session = mongos.startSession();
+ session.startTransaction();
+ session.getDatabase(dbName)[collName].insert({_id: 5});
+
+ // Unpause the migration and it should time out entering the critical section.
+ unpauseMoveChunkAtStep(donorConn, moveChunkStepNames.reachedSteadyState);
+ moveChunkThread.join();
+ assert.commandFailedWithCode(moveChunkThread.returnData(), ErrorCodes.LockTimeout);
+
+ // Clean up the transaction and verify the counter was incremented in serverStatus.
+ assert.commandWorked(session.abortTransaction_forTesting());
+
+ checkServerStatusMigrationLockTimeoutCount(donorConn, 1);
+
+ // Writes are blocked during the critical section, so before the migration begins, insert a
+ // document that can be read later into the chunk to be moved.
+ assert.commandWorked(st.s.getDB(dbName)[collName].insert({_id: 5}));
+
+ // Pause a migration after entering the critical section, but before entering the commit phase.
+ pauseMoveChunkAtStep(donorConn, moveChunkStepNames.chunkDataCommitted);
+ moveChunkThread = new ScopedThread(
+ runConcurrentMoveChunk, st.s.host, dbName + "." + collName, st.shard1.shardName);
+ moveChunkThread.start();
+ waitForMoveChunkStep(donorConn, moveChunkStepNames.chunkDataCommitted);
+
+ // Pause a read while it's holding locks so the migration can't commit.
+ assert.commandWorked(donorConn.adminCommand(
+ {configureFailPoint: "waitInFindBeforeMakingBatch", mode: "alwaysOn"}));
+ const concurrentRead = new ScopedThread(runConcurrentRead, st.s.host, dbName, collName);
+ concurrentRead.start();
+ assert.soon(function() {
+ const curOpResults = assert.commandWorked(donorConn.adminCommand({currentOp: 1}));
+ return curOpResults.inprog.some(op => op["command"]["comment"] === "concurrent read");
+ });
+
+ // Unpause the migration and it should time out entering the commit phase.
+ unpauseMoveChunkAtStep(donorConn, moveChunkStepNames.chunkDataCommitted);
+ moveChunkThread.join();
+ assert.commandFailedWithCode(moveChunkThread.returnData(), ErrorCodes.LockTimeout);
+
+ // Let the read finish and verify the counter was incremented in serverStatus.
+ assert.commandWorked(
+ donorConn.adminCommand({configureFailPoint: "waitInFindBeforeMakingBatch", mode: "off"}));
+ concurrentRead.join();
+ assert.eq(1, concurrentRead.returnData());
+
+ checkServerStatusMigrationLockTimeoutCount(donorConn, 2);
+
+ assert.commandWorked(donorConn.adminCommand(
+ {setParameter: 1, migrationLockAcquisitionMaxWaitMS: originalMigrationLockTimeout}));
+
+ st.stop();
})();
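
Outside the test harness, the abort path this test drives with failpoints can be reproduced by hand: hold the collection's locks with an open transaction, then request a migration. A rough sketch, assuming a sharded collection db.coll whose chunk containing {_id: 5} lives on the connected donor shard and a second shard named "shard1" (hosts, names, and the document key are illustrative):

    // Hosts and shard names below are placeholders.
    const mongos = new Mongo("mongos.example.net:27017");
    const donor = new Mongo("shard0-primary.example.net:27018");

    // Cap how long the migration waits for lock acquisition so the timeout reproduces quickly.
    assert.commandWorked(
        donor.adminCommand({setParameter: 1, migrationLockAcquisitionMaxWaitMS: 2 * 1000}));

    // Leave a transaction open with an uncommitted write in the chunk; its locks are held
    // until the transaction commits or aborts.
    const session = mongos.startSession();
    session.startTransaction();
    assert.commandWorked(session.getDatabase("db")["coll"].insert({_id: 5}));

    // The migration cannot acquire the locks it needs for the critical section and should
    // abort with LockTimeout...
    assert.commandFailedWithCode(
        mongos.adminCommand({moveChunk: "db.coll", find: {_id: 5}, to: "shard1"}),
        ErrorCodes.LockTimeout);

    // ...which increments the new counter on the donor.
    const stats = assert.commandWorked(donor.adminCommand({serverStatus: 1})).shardingStatistics;
    assert.gte(stats.countDonorMoveChunkLockTimeout, 1);

    assert.commandWorked(session.abortTransaction_forTesting());
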
diff --git a/src/mongo/db/s/move_chunk_command.cpp b/src/mongo/db/s/move_chunk_command.cpp
index 10ec850f4b7..dd62c984292 100644
--- a/src/mongo/db/s/move_chunk_command.cpp
+++ b/src/mongo/db/s/move_chunk_command.cpp
@@ -42,6 +42,7 @@
#include "mongo/db/s/migration_source_manager.h"
#include "mongo/db/s/move_timing_helper.h"
#include "mongo/db/s/sharding_state.h"
+#include "mongo/db/s/sharding_statistics.h"
#include "mongo/s/client/shard_registry.h"
#include "mongo/s/grid.h"
#include "mongo/s/request_types/migration_secondary_throttle_options.h"
@@ -142,6 +143,9 @@ public:
status = Status::OK();
} catch (const DBException& e) {
status = e.toStatus();
+ if (status.code() == ErrorCodes::LockTimeout) {
+ ShardingStatistics::get(opCtx).countDonorMoveChunkLockTimeout.addAndFetch(1);
+ }
} catch (const std::exception& e) {
scopedMigration.signalComplete(
{ErrorCodes::InternalError,
diff --git a/src/mongo/db/s/sharding_statistics.cpp b/src/mongo/db/s/sharding_statistics.cpp
index 5589084357f..9ae2b0bc0ec 100644
--- a/src/mongo/db/s/sharding_statistics.cpp
+++ b/src/mongo/db/s/sharding_statistics.cpp
@@ -62,6 +62,7 @@ void ShardingStatistics::report(BSONObjBuilder* builder) const {
builder->append("countDocsClonedOnDonor", countDocsClonedOnDonor.load());
builder->append("countRecipientMoveChunkStarted", countRecipientMoveChunkStarted.load());
builder->append("countDocsDeletedOnDonor", countDocsDeletedOnDonor.load());
+ builder->append("countDonorMoveChunkLockTimeout", countDonorMoveChunkLockTimeout.load());
}
} // namespace mongo
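
After this addition, the shardingStatistics section of db.serverStatus() reports the new field alongside the existing counters. An illustrative shape, with made-up values and only the fields that appear in this diff:

    // Illustrative only: rough shape of db.serverStatus().shardingStatistics on a donor
    // shard after two lock-timeout aborts. Values are invented for the example.
    const exampleShardingStatistics = {
        countDonorMoveChunkStarted: NumberLong(4),
        totalDonorChunkCloneTimeMillis: NumberLong(2981),
        totalCriticalSectionCommitTimeMillis: NumberLong(37),
        totalCriticalSectionTimeMillis: NumberLong(58),
        countDocsClonedOnDonor: NumberLong(6),
        countDocsClonedOnRecipient: NumberLong(6),
        countDocsDeletedOnDonor: NumberLong(6),
        countRecipientMoveChunkStarted: NumberLong(4),
        countDonorMoveChunkLockTimeout: NumberLong(2)  // added by this commit
    };
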
diff --git a/src/mongo/db/s/sharding_statistics.h b/src/mongo/db/s/sharding_statistics.h
index 3734f7a3042..aac4876ad92 100644
--- a/src/mongo/db/s/sharding_statistics.h
+++ b/src/mongo/db/s/sharding_statistics.h
@@ -87,6 +87,10 @@ struct ShardingStatistics {
// from the donor to the recipient).
AtomicWord<long long> totalCriticalSectionTimeMillis{0};
+ // Cumulative, always-increasing counter of the number of migrations aborted on this node
+ // after timing out waiting to acquire a lock.
+ AtomicWord<long long> countDonorMoveChunkLockTimeout{0};
+
/**
* Obtains the per-process instance of the sharding statistics object.
*/
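
Because the header documents countDonorMoveChunkLockTimeout as a cumulative, always-increasing counter, monitoring can work off deltas between two serverStatus samples. A small sketch, with the host again a placeholder:

    // Report how many migrations aborted on lock timeouts during an observation window.
    const shard = new Mongo("shard0-primary.example.net:27018");  // placeholder host
    function lockTimeoutAborts() {
        const stats =
            assert.commandWorked(shard.adminCommand({serverStatus: 1})).shardingStatistics;
        // The counter is reported as a 64-bit integer (NumberLong in the shell).
        return stats.countDonorMoveChunkLockTimeout.toNumber();
    }
    const before = lockTimeoutAborts();
    sleep(60 * 1000);  // one-minute observation window
    const after = lockTimeoutAborts();
    // The counter never decreases, so the difference is the number of new aborts.
    print("lock-timeout aborts in the last minute: " + (after - before));
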