author    | Kevin Pulo <kevin.pulo@mongodb.com> | 2018-08-20 06:53:03 +0000
committer | Kevin Pulo <kevin.pulo@mongodb.com> | 2018-08-20 06:54:29 +0000
commit    | 581a10bcfc9b755a099a59f8489a5b0e6d5a1746 (patch)
tree      | 9e9492946785615edf292ad8eb0c796b3ddf5c4a
parent    | 0d18044aabe790f330e9a625175dbc071bd6fb64 (diff)
download  | mongo-581a10bcfc9b755a099a59f8489a5b0e6d5a1746.tar.gz
SERVER-33237 Range deleter avoid query planning for every document.
Custom backport to v3.4, based on 6ed473ed3a122bebc1e932c946fe1c991dbd7ecb.
In v3.4, the excessive query planning is only avoided when _secondaryThrottle
is off.
Also add the rangeDeleterBatchDelayMS server parameter (default 20ms), to give
greater control (when necessary) over range deleter throttling. This delay is
in addition to any _secondaryThrottle replication delays.
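
The new knob is registered with MONGO_EXPORT_SERVER_PARAMETER, which by default makes a parameter settable both at startup and at runtime. As an illustrative sketch (the 100ms value is arbitrary, chosen only for the example), tuning it might look like:

    # at startup
    mongod --setParameter rangeDeleterBatchDelayMS=100

    # at runtime, from the mongo shell
    db.adminCommand({ setParameter: 1, rangeDeleterBatchDelayMS: 100 })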
-rw-r--r-- | src/mongo/db/dbhelpers.cpp | 226
1 file changed, 145 insertions, 81 deletions
diff --git a/src/mongo/db/dbhelpers.cpp b/src/mongo/db/dbhelpers.cpp
index 96eb011018d..a372d42824b 100644
--- a/src/mongo/db/dbhelpers.cpp
+++ b/src/mongo/db/dbhelpers.cpp
@@ -60,6 +60,7 @@
 #include "mongo/db/s/collection_metadata.h"
 #include "mongo/db/s/collection_sharding_state.h"
 #include "mongo/db/s/sharding_state.h"
+#include "mongo/db/server_parameters.h"
 #include "mongo/db/service_context.h"
 #include "mongo/db/storage/data_protector.h"
 #include "mongo/db/storage/encryption_hooks.h"
@@ -266,6 +267,10 @@ BSONObj Helpers::inferKeyPattern(const BSONObj& o) {
     return kpBuilder.obj();
 }
 
+// After completing internalQueryExecYieldIterations document deletions, the time in millis to wait
+// before continuing deletions.
+MONGO_EXPORT_SERVER_PARAMETER(rangeDeleterBatchDelayMS, int, 20);
+
 long long Helpers::removeRange(OperationContext* txn,
                                const KeyRange& range,
                                BoundInclusion boundInclusion,
@@ -323,7 +328,7 @@ long long Helpers::removeRange(OperationContext* txn,
 
     MONGO_LOG_COMPONENT(1, LogComponent::kSharding)
-        << "begin removal of " << min << " to " << max << " in " << ns
+        << "begin removal of " << redact(min) << " to " << redact(max) << " in " << ns
         << " with write concern: " << writeConcern.toBSON() << endl;
 
     long long numDeleted = 0;
@@ -331,20 +336,25 @@ long long Helpers::removeRange(OperationContext* txn,
     replWaitDuration = Milliseconds::zero();
 
     while (1) {
+        long long numDeletedPreviously = numDeleted;
+        long long iterationsBetweenSleeps = internalQueryExecYieldIterations.load();
+        long long batchSize = writeConcern.shouldWaitForOtherNodes() ? 1 : iterationsBetweenSleeps;
+
         // Scoping for write lock.
         {
             ScopedTransaction scopedXact(txn, MODE_IX);
             AutoGetCollection ctx(txn, NamespaceString(ns), MODE_IX, MODE_IX);
             Collection* collection = ctx.getCollection();
-            if (!collection)
+            if (!collection) {
                 break;
+            }
 
             IndexDescriptor* desc =
                 collection->getIndexCatalog()->findIndexByName(txn, indexName);
 
             if (!desc) {
                 warning(LogComponent::kSharding) << "shard key index '" << indexName << "' on '"
                                                  << ns << "' was dropped";
-                return -1;
+                break;
             }
 
             unique_ptr<PlanExecutor> exec(
@@ -357,99 +367,153 @@ long long Helpers::removeRange(OperationContext* txn,
                                            PlanExecutor::YIELD_MANUAL,
                                            InternalPlanner::FORWARD,
                                            InternalPlanner::IXSCAN_FETCH));
-            exec->setYieldPolicy(PlanExecutor::YIELD_AUTO, collection);
-
-            RecordId rloc;
-            BSONObj obj;
-            PlanExecutor::ExecState state;
-            // This may yield so we cannot touch nsd after this.
-            state = exec->getNext(&obj, &rloc);
-            if (PlanExecutor::IS_EOF == state) {
-                break;
-            }
-            if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
-                warning(LogComponent::kSharding)
-                    << PlanExecutor::statestr(state) << " - cursor error while trying to delete "
-                    << min << " to " << max << " in " << ns << ": "
-                    << WorkingSetCommon::toStatusString(obj)
-                    << ", stats: " << Explain::getWinningPlanStats(exec.get()) << endl;
-                break;
-            }
-
-            verify(PlanExecutor::ADVANCED == state);
-
-            if (onlyRemoveOrphanedDocs) {
-                // Do a final check in the write lock to make absolutely sure that our
-                // collection hasn't been modified in a way that invalidates our migration
-                // cleanup.
+            bool errorOccurred = false;
 
-                // We should never be able to turn off the sharding state once enabled, but
-                // in the future we might want to.
-                verify(ShardingState::get(txn)->enabled());
+            while (numDeleted - numDeletedPreviously < batchSize) {
-                bool docIsOrphan;
-
-                // In write lock, so will be the most up-to-date version
-                auto metadataNow = CollectionShardingState::get(txn, ns)->getMetadata();
-                if (metadataNow) {
-                    ShardKeyPattern kp(metadataNow->getKeyPattern());
-                    BSONObj key = kp.extractShardKeyFromDoc(obj);
-                    docIsOrphan =
-                        !metadataNow->keyBelongsToMe(key) && !metadataNow->keyIsPending(key);
-                } else {
-                    docIsOrphan = false;
+                RecordId rloc;
+                BSONObj obj;
+                PlanExecutor::ExecState state;
+                // This may yield so we cannot touch nsd after this.
+                state = exec->getNext(&obj, &rloc);
+                if (PlanExecutor::IS_EOF == state) {
+                    errorOccurred = true;
+                    break;
                 }
 
-                if (!docIsOrphan) {
+                if (PlanExecutor::FAILURE == state || PlanExecutor::DEAD == state) {
                     warning(LogComponent::kSharding)
-                        << "aborting migration cleanup for chunk " << min << " to " << max
-                        << (metadataNow ? (string) " at document " + obj.toString() : "")
-                        << ", collection " << ns << " has changed " << endl;
+                        << PlanExecutor::statestr(state)
+                        << " - cursor error while trying to delete " << redact(min) << " to "
+                        << redact(max) << " in " << ns << ": "
+                        << redact(WorkingSetCommon::toStatusString(obj))
+                        << ", stats: " << Explain::getWinningPlanStats(exec.get()) << endl;
+                    errorOccurred = true;
                     break;
                 }
-            }
-            MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
-                WriteUnitOfWork wuow(txn);
-                NamespaceString nss(ns);
-                if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(nss)) {
-                    warning() << "stepped down from primary while deleting chunk; "
-                              << "orphaning data in " << ns << " in range [" << redact(min) << ", "
-                              << redact(max) << ")";
-                    return numDeleted;
+
+                verify(PlanExecutor::ADVANCED == state);
+
+                if (onlyRemoveOrphanedDocs) {
+                    // Do a final check in the write lock to make absolutely sure that our
+                    // collection hasn't been modified in a way that invalidates our migration
+                    // cleanup.
+
+                    // We should never be able to turn off the sharding state once enabled, but
+                    // in the future we might want to.
+                    verify(ShardingState::get(txn)->enabled());
+
+                    bool docIsOrphan;
+
+                    // In write lock, so will be the most up-to-date version
+                    auto metadataNow = CollectionShardingState::get(txn, ns)->getMetadata();
+                    if (metadataNow) {
+                        ShardKeyPattern kp(metadataNow->getKeyPattern());
+                        BSONObj key = kp.extractShardKeyFromDoc(obj);
+                        docIsOrphan =
+                            !metadataNow->keyBelongsToMe(key) && !metadataNow->keyIsPending(key);
+                    } else {
+                        docIsOrphan = false;
+                    }
+
+                    if (!docIsOrphan) {
+                        warning(LogComponent::kSharding)
+                            << "aborting migration cleanup for chunk " << redact(min) << " to "
+                            << redact(max)
+                            << (metadataNow ? (string) " at document " + redact(obj.toString()) : "")
+                            << ", collection " << ns << " has changed " << endl;
+                        // No chance of success with a new plan, so fully abort.
+                        errorOccurred = true;
+                        break;
+                    }
                 }
 
-                if (callback)
-                    callback->goingToDelete(obj);
+                exec->saveState();
+
+                MONGO_WRITE_CONFLICT_RETRY_LOOP_BEGIN {
+                    WriteUnitOfWork wuow(txn);
+                    NamespaceString nss(ns);
+                    if (!repl::getGlobalReplicationCoordinator()->canAcceptWritesFor(nss)) {
+                        warning() << "stepped down from primary while deleting chunk; "
+                                  << "orphaning data in " << ns << " in range [" << redact(min)
+                                  << ", " << redact(max) << ")";
+                        // No chance of success with a new plan, so fully abort.
+                        errorOccurred = true;
+                        break;
+                    }
+
+                    if (callback)
+                        callback->goingToDelete(obj);
+
+                    OpDebug* const nullOpDebug = nullptr;
+                    collection->deleteDocument(txn, rloc, nullOpDebug, fromMigrate);
+                    wuow.commit();
+                }
+                MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "delete range", ns);
+
+                if (!exec->restoreState()) {
+                    MONGO_LOG_COMPONENT(1, LogComponent::kSharding)
+                        << "unable to restore cursor state while trying to delete " << redact(min)
+                        << " to " << redact(max) << " in " << ns
+                        << ", stats: " << Explain::getWinningPlanStats(exec.get())
+                        << ", replanning";
+                    // Try again with a new plan.
+                    break;
+                }
 
-                OpDebug* const nullOpDebug = nullptr;
-                collection->deleteDocument(txn, rloc, nullOpDebug, fromMigrate);
-                wuow.commit();
+                numDeleted++;
             }
-            MONGO_WRITE_CONFLICT_RETRY_LOOP_END(txn, "delete range", ns);
 
-            numDeleted++;
-        }
+            if (errorOccurred) {
+                break;
+            }
 
-        // TODO remove once the yielding below that references this timer has been removed
-        Timer secondaryThrottleTime;
-
-        if (writeConcern.shouldWaitForOtherNodes() && numDeleted > 0) {
-            repl::ReplicationCoordinator::StatusAndDuration replStatus =
-                repl::getGlobalReplicationCoordinator()->awaitReplication(
-                    txn,
-                    repl::ReplClientInfo::forClient(txn->getClient()).getLastOp(),
-                    writeConcern);
-            if (replStatus.status.code() == ErrorCodes::ExceededTimeLimit ||
-                replStatus.status.code() == ErrorCodes::WriteConcernFailed) {
-                warning(LogComponent::kSharding) << "replication to secondaries for removeRange at "
-                                                    "least 60 seconds behind";
-            } else {
-                uassertStatusOK(replStatus.status);
+        }  // End scope for write lock.
+
+        if (numDeleted > 0) {
+            if (writeConcern.shouldWaitForOtherNodes()) {
+                repl::ReplicationCoordinator::StatusAndDuration replStatus =
+                    repl::getGlobalReplicationCoordinator()->awaitReplication(
+                        txn,
+                        repl::ReplClientInfo::forClient(txn->getClient()).getLastOp(),
+                        writeConcern);
+                if (replStatus.status.code() == ErrorCodes::ExceededTimeLimit ||
+                    replStatus.status.code() == ErrorCodes::WriteConcernFailed) {
+                    warning(LogComponent::kSharding)
+                        << "replication to secondaries for removeRange at "
+                           "least 60 seconds behind";
+                } else {
+                    uassertStatusOK(replStatus.status);
+                }
+                replWaitDuration += replStatus.duration;
+            }
+
+            // The `rangeDeleterBatchDelayMS` parameter is defined as a delay every
+            // `internalQueryExecYieldIterations` (aka `batchSize`) document deletions.
+            //
+            // In v3.6+, this applies regardless of waiting for replication (aka
+            // _secondaryThrottle), because in those versions the replication waits and query
+            // replanning also happen after each batch of `batchSize` deletions.
+            //
+            // However, here in v3.4, when _secondaryThrottle is on it's necessary to preserve the
+            // semantics of waiting for replication after each document deletion. But it's also
+            // necessary for `rangeDeleterBatchDelayMS` - which is the only wait when
+            // _secondaryThrottle is off - to behave as it does in other versions (despite query
+            // planning still occuring every document deletion in v3.4 when _secondaryThrottle is
+            // on).
+            //
+            // Therefore, we sleep for `rangeDeleterBatchDelayMS` here (every `batchSize`
+            // iterations), even if we have also waited for replication above. This approach also
+            // makes the RangeDeleter behavior more consistent when enabling/disabling
+            // _secondaryThrottle.
+            if (batchSize != 1 || numDeleted % iterationsBetweenSleeps == 0) {
+                sleepmillis(rangeDeleterBatchDelayMS.load());
             }
-            replWaitDuration += replStatus.duration;
         }
+
+        // Loop back to get a new plan and go again.
     }
 
     if (writeConcern.shouldWaitForOtherNodes())
@@ -457,8 +521,8 @@ long long Helpers::removeRange(OperationContext* txn,
             << "Helpers::removeRangeUnlocked time spent waiting for replication: "
             << durationCount<Milliseconds>(replWaitDuration) << "ms" << endl;
 
-    MONGO_LOG_COMPONENT(1, LogComponent::kSharding) << "end removal of " << min << " to " << max
-                                                    << " in " << ns << " (took "
+    MONGO_LOG_COMPONENT(1, LogComponent::kSharding) << "end removal of " << redact(min) << " to "
+                                                    << redact(max) << " in " << ns << " (took "
                                                     << rangeRemoveTimer.millis() << "ms)" << endl;
 
     return numDeleted;
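
For readers who prefer the shape of the change to the hunks themselves: the patch builds one query plan per batch of up to batchSize deletions instead of one per document, and throttles between batches. The standalone C++ sketch below mirrors that structure; it is not MongoDB code, and all names and constants are local stand-ins (iterationsBetweenSleeps for internalQueryExecYieldIterations, batchDelayMS for rangeDeleterBatchDelayMS).

// Standalone sketch of the batching pattern this patch introduces (not
// MongoDB code; names and values are local to this example). The per-batch
// setup comment stands in for query planning, and the sleep stands in for
// rangeDeleterBatchDelayMS.
#include <chrono>
#include <cstdio>
#include <thread>

int main() {
    const long long totalDocs = 1000;               // documents in the range
    const long long iterationsBetweenSleeps = 128;  // ~internalQueryExecYieldIterations
    const int batchDelayMS = 20;                    // ~rangeDeleterBatchDelayMS default
    const bool secondaryThrottle = false;           // wait-for-replication off

    long long numDeleted = 0;
    while (numDeleted < totalDocs) {
        long long numDeletedPreviously = numDeleted;
        // Throttled deletes keep the old one-document-per-plan semantics.
        long long batchSize = secondaryThrottle ? 1 : iterationsBetweenSleeps;

        // (The real code builds one index-scan plan here, once per batch
        // rather than once per document: the core win of SERVER-33237.)
        while (numDeleted - numDeletedPreviously < batchSize && numDeleted < totalDocs) {
            ++numDeleted;  // "delete" one document
        }

        // (The real code waits for replication here when secondaryThrottle is on.)

        // Sleep every iterationsBetweenSleeps deletions, using the same test
        // as the patch: batchSize != 1 || numDeleted % N == 0.
        if (batchSize != 1 || numDeleted % iterationsBetweenSleeps == 0) {
            std::this_thread::sleep_for(std::chrono::milliseconds(batchDelayMS));
        }
    }
    std::printf("deleted %lld documents\n", numDeleted);
    return 0;
}

Note the asymmetry the long comment in the patch explains: with _secondaryThrottle on, batchSize is 1, so the modulo test keeps the sleep at one per iterationsBetweenSleeps deletions rather than one per document.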