/**
 *    Copyright (C) 2018-present MongoDB, Inc.
 *
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the Server Side Public License, version 1,
 *    as published by MongoDB, Inc.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    Server Side Public License for more details.
 *
 *    You should have received a copy of the Server Side Public License
 *    along with this program. If not, see
 *    <http://www.mongodb.com/licensing/server-side-public-license>.
 *
 *    As a special exception, the copyright holders give permission to link the
 *    code of portions of this program with the OpenSSL library under certain
 *    conditions as described in each individual source file and distribute
 *    linked combinations including the program with the OpenSSL library. You
 *    must comply with the Server Side Public License in all respects for
 *    all of the code used other than as permitted herein. If you modify file(s)
 *    with this exception, you may extend this exception to your version of the
 *    file(s), but you are not obligated to do so. If you do not wish to do so,
 *    delete this exception statement from your version. If you delete this
 *    exception statement from all source files in the program, then also delete
 *    it in the license file.
*/ #include "mongo/db/ttl.h" #include "mongo/db/auth/authorization_session.h" #include "mongo/db/auth/user_name.h" #include "mongo/db/catalog/coll_mod.h" #include "mongo/db/catalog/collection.h" #include "mongo/db/catalog/collection_catalog.h" #include "mongo/db/catalog/database_holder.h" #include "mongo/db/catalog/index_catalog.h" #include "mongo/db/catalog/index_key_validate.h" #include "mongo/db/client.h" #include "mongo/db/commands/fsync_locked.h" #include "mongo/db/commands/server_status_metric.h" #include "mongo/db/db_raii.h" #include "mongo/db/exec/delete_stage.h" #include "mongo/db/index/index_descriptor.h" #include "mongo/db/namespace_string.h" #include "mongo/db/ops/insert.h" #include "mongo/db/query/internal_plans.h" #include "mongo/db/record_id_helpers.h" #include "mongo/db/repl/replica_set_aware_service.h" #include "mongo/db/repl/replication_coordinator.h" #include "mongo/db/repl/tenant_migration_access_blocker_registry.h" #include "mongo/db/s/operation_sharding_state.h" #include "mongo/db/s/shard_filtering_metadata_refresh.h" #include "mongo/db/service_context.h" #include "mongo/db/shard_role.h" #include "mongo/db/stats/resource_consumption_metrics.h" #include "mongo/db/storage/storage_parameters_gen.h" #include "mongo/db/ttl_collection_cache.h" #include "mongo/db/ttl_gen.h" #include "mongo/logv2/log.h" #include "mongo/s/grid.h" #include "mongo/s/shard_version_factory.h" #include "mongo/util/assert_util.h" #include "mongo/util/background.h" #include "mongo/util/concurrency/idle_thread_block.h" #include "mongo/util/log_with_sampling.h" #define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kIndex namespace mongo { namespace { const auto getTTLMonitor = ServiceContext::declareDecoration>(); bool isBatchingEnabled() { return feature_flags::gBatchMultiDeletes.isEnabled(serverGlobalParams.featureCompatibility) && ttlMonitorBatchDeletes.load(); } // When batching is enabled, returns BatchedDeleteStageParams that limit the amount of work done in 
// a delete such that it is possible not all expired documents will be removed. Returns nullptr // otherwise. // // When batching is disabled, all expired documents are removed by the delete operation. std::unique_ptr getBatchedDeleteStageParams(bool batchingEnabled) { if (!batchingEnabled) { return nullptr; } auto batchedDeleteParams = std::make_unique(); batchedDeleteParams->targetPassDocs = ttlIndexDeleteTargetDocs.load(); batchedDeleteParams->targetPassTimeMS = Milliseconds(ttlIndexDeleteTargetTimeMS.load()); return batchedDeleteParams; } AdmissionContext::Priority computeTTLPriority( const UUID& uuid, const stdx::unordered_map& collSubpassHistory) { if (auto it = collSubpassHistory.find(uuid); it != collSubpassHistory.end()) { if (it->second >= ttlCollLowPrioritySubpassLimit.load()) { return AdmissionContext::Priority::kNormal; } } return AdmissionContext::Priority::kLow; } // Given the set of current TTL collections via 'ttlCollectionInfo', populates the 'ttlPriorityMap' // with TTL delete priority for each collection based on the 'collSubpassHistory'. // // Returns the number of collections whose TTL deletes should be executed with non-default priority // 'AdmissionContext::Priority::kNormal'. long long populateTTLPriorityMap( const TTLCollectionCache::InfoMap& ttlCollectionInfo, const stdx::unordered_map& collSubpassHistory, stdx::unordered_map& ttlPriorityMap) { long long normalPriorityCount = 0; for (const auto& [uuid, _] : ttlCollectionInfo) { auto priority = computeTTLPriority(uuid, collSubpassHistory); ttlPriorityMap[uuid] = priority; if (priority == AdmissionContext::Priority::kNormal) { normalPriorityCount++; } } return normalPriorityCount; } AdmissionContext::Priority getTTLPriority( const UUID& uuid, const stdx::unordered_map& ttlPriorityMap) { auto it = ttlPriorityMap.find(uuid); // The 'ttlPriorityMap' should contain entries for every collection in the TTLCollectionCache at // the start of a subpass. 
If not, something went wrong during the population of the map. invariant(it != ttlPriorityMap.end()); return it->second; } // Given 'remainingWorkAfterSubpass', updates the 'collSubpassHistory' count for each collection // with more work. Removes collections from 'collSubpassHistory' with no work left after the // subpass. void updateCollSubpassHistory(stdx::unordered_map& collSubpassHistory, const TTLCollectionCache::InfoMap& remainingWorkAfterSubpass) { // Remove history for collections that are caught up on TTL deletes. stdx::erase_if(collSubpassHistory, [&](auto&& it) { auto uuid = it.first; return remainingWorkAfterSubpass.find(uuid) == remainingWorkAfterSubpass.end(); }); // Increment the subpass count for the unexhausted collections. for (const auto& [uuid, _] : remainingWorkAfterSubpass) { if (auto it = collSubpassHistory.find(uuid); it != collSubpassHistory.end()) { it->second++; } else { collSubpassHistory[uuid] = 1; } } } // Generates an expiration date based on the user-configured expireAfterSeconds. Includes special // 'safe' handling for time-series collections. Date_t safeExpirationDate(OperationContext* opCtx, const CollectionPtr& coll, std::int64_t expireAfterSeconds) { if (auto timeseries = coll->getTimeseriesOptions()) { const auto bucketMaxSpan = Seconds(*timeseries->getBucketMaxSpanSeconds()); // Don't delete data unless it is safely out of range of the bucket maximum time // range. On time-series collections, the _id (and thus RecordId) is the minimum // time value of a bucket. A bucket may have newer data, so we cannot safely delete // the entire bucket yet until the maximum bucket range has passed, even if the // minimum value can be expired. return Date_t::now() - Seconds(expireAfterSeconds) - bucketMaxSpan; } return Date_t::now() - Seconds(expireAfterSeconds); } // Computes and returns the start 'RecordIdBound' with the correct type for a bounded, clustered // collection scan. 
All time-series buckets collections delete entries of type 'ObjectId'. All // other collections must only delete entries of type 'Date'. RecordIdBound makeCollScanStartBound(const CollectionPtr& collection, const Date_t startDate) { if (collection->getTimeseriesOptions()) { auto startOID = OID(); startOID.init(startDate, false /* max */); return RecordIdBound(record_id_helpers::keyForOID(startOID)); } return RecordIdBound(record_id_helpers::keyForDate(startDate)); } // Computes and returns the end 'RecordIdBound' with the correct type for a bounded, clustered // collection scan. All time-series buckets collections delete entries of type 'ObjectId'. All // other collections must only delete entries of type 'Date'. RecordIdBound makeCollScanEndBound(const CollectionPtr& collection, Date_t expirationDate) { if (collection->getTimeseriesOptions()) { auto endOID = OID(); endOID.init(expirationDate, true /* max */); return RecordIdBound(record_id_helpers::keyForOID(endOID)); } return RecordIdBound(record_id_helpers::keyForDate(expirationDate)); } const IndexDescriptor* getValidTTLIndex(OperationContext* opCtx, TTLCollectionCache* ttlCollectionCache, const CollectionPtr& collection, const BSONObj& spec, std::string indexName) { if (!spec.hasField(IndexDescriptor::kExpireAfterSecondsFieldName)) { ttlCollectionCache->deregisterTTLIndexByName(collection->uuid(), indexName); return nullptr; } if (!collection->isIndexReady(indexName)) { return nullptr; } const BSONObj key = spec["key"].Obj(); if (key.nFields() != 1) { LOGV2_ERROR(22540, "key for ttl index can only have 1 field, skipping TTL job", "index"_attr = spec); return nullptr; } const IndexDescriptor* desc = collection->getIndexCatalog()->findIndexByName(opCtx, indexName); if (!desc) { LOGV2_DEBUG(22535, 1, "index not found; skipping ttl job", "index"_attr = spec); return nullptr; } if (IndexType::INDEX_BTREE != IndexNames::nameToType(desc->getAccessMethodName())) { LOGV2_ERROR(22541, "special index can't be used as a 
TTL index, skipping TTL job", "index"_attr = spec); return nullptr; } if (auto status = index_key_validate::validateIndexSpecTTL(spec); !status.isOK()) { LOGV2_ERROR(6909100, "Skipping TTL job due to invalid index spec", "reason"_attr = status.reason(), "ns"_attr = collection->ns(), "uuid"_attr = collection->uuid(), "index"_attr = spec); return nullptr; } return desc; } /** * Runs on primaries and secondaries. Forwards replica set events to the TTLMonitor. */ class TTLMonitorService : public ReplicaSetAwareService { public: static TTLMonitorService* get(ServiceContext* serviceContext); TTLMonitorService() = default; private: void onStartup(OperationContext* opCtx) override {} void onSetCurrentConfig(OperationContext* opCtx) override {} void onInitialDataAvailable(OperationContext* opCtx, bool isMajorityDataAvailable) override {} void onShutdown() override {} void onStepUpBegin(OperationContext* opCtx, long long term) override {} void onStepUpComplete(OperationContext* opCtx, long long term) override { auto ttlMonitor = TTLMonitor::get(opCtx->getServiceContext()); if (!ttlMonitor) { // Some test fixtures might not install the TTLMonitor. return; } ttlMonitor->onStepUp(opCtx); } void onStepDown() override {} void onBecomeArbiter() override {} inline std::string getServiceName() const override final { return "TTLMonitorService"; } }; const auto _ttlMonitorService = ServiceContext::declareDecoration(); const ReplicaSetAwareServiceRegistry::Registerer _ttlMonitorServiceRegisterer( "TTLMonitorService"); // static TTLMonitorService* TTLMonitorService::get(ServiceContext* serviceContext) { return &_ttlMonitorService(serviceContext); } } // namespace MONGO_FAIL_POINT_DEFINE(hangTTLMonitorWithLock); MONGO_FAIL_POINT_DEFINE(hangTTLMonitorBetweenPasses); // A TTL pass completes when there are no more expired documents to remove. A single TTL pass may // consist of multiple sub-passes. Each sub-pass deletes all the expired documents it can up to // 'ttlSubPassTargetSecs'. 
// It is possible for a sub-pass to complete before all expired documents
// have been removed.
//
// 'ttlPasses' is incremented when _doTTLPass() exits, 'ttlSubPasses' when a sub-pass that got past
// the readable-state check exits, and 'ttlDeletedDocuments' by the number of documents each TTL
// delete removed.
CounterMetric ttlPasses("ttl.passes");
CounterMetric ttlSubPasses("ttl.subPasses");
CounterMetric ttlDeletedDocuments("ttl.deletedDocuments");

// Counts the subpasses over TTL collections where the deletes on a collection are increased from
// 'low' to 'normal' priority.
CounterMetric ttlCollSubpassesIncreasedPriority("ttl.collSubpassesIncreasedPriority");

using MtabType = TenantMigrationAccessBlocker::BlockerType;

// Creates a monitor that is not self-deleting and whose initial sleep interval is the current
// value of 'ttlMonitorSleepSecs'.
TTLMonitor::TTLMonitor()
    : BackgroundJob(false /* selfDelete */),
      _ttlMonitorSleepSecs(Seconds{ttlMonitorSleepSecs.load()}) {}

// Returns the TTLMonitor stored on 'serviceCtx'. Callers in this file treat a null result as "no
// monitor installed".
TTLMonitor* TTLMonitor::get(ServiceContext* serviceCtx) {
    return getTTLMonitor(serviceCtx).get();
}

// Installs 'monitor' on 'serviceCtx', replacing any previous instance. 'monitor' must be non-null,
// and a previously installed monitor must not be running.
//
// NOTE(review): template argument lists appear to be missing throughout this copy of the file
// (e.g. the bare 'std::unique_ptr' parameter below, 'std::make_unique()', 'ExceptionForCat&',
// 'std::numeric_limits::min()'); restore them from version control before building.
void TTLMonitor::set(ServiceContext* serviceCtx, std::unique_ptr monitor) {
    auto& ttlMonitor = getTTLMonitor(serviceCtx);
    if (ttlMonitor) {
        invariant(!ttlMonitor->running(),
                  "Tried to reset the TTLMonitor without shutting down the original instance.");
    }

    invariant(monitor);
    ttlMonitor = std::move(monitor);
}

// Forwards a new sleep interval to the TTLMonitor of the current client's service context. Does
// nothing when there is no current client or no monitor installed. Always returns Status::OK().
Status TTLMonitor::onUpdateTTLMonitorSleepSeconds(int newSleepSeconds) {
    if (auto client = Client::getCurrent()) {
        if (auto ttlMonitor = TTLMonitor::get(client->getServiceContext())) {
            ttlMonitor->updateSleepSeconds(Seconds{newSleepSeconds});
        }
    }
    return Status::OK();
}

// Stores the new sleep interval under '_stateMutex', then notifies '_notificationCV' so that the
// wait loop in run() recomputes its deadline.
void TTLMonitor::updateSleepSeconds(Seconds newSeconds) {
    {
        stdx::lock_guard lk(_stateMutex);
        _ttlMonitorSleepSecs = newSeconds;
    }
    _notificationCV.notify_all();
}

// Main loop of the monitor. Each iteration sleeps for '_ttlMonitorSleepSecs' and then runs one
// TTL pass, unless 'ttlMonitorEnabled' is false or lockedForWriting() returns true. Returns once
// '_shuttingDown' is observed after the sleep.
void TTLMonitor::run() {
    ThreadClient tc(name(), getGlobalServiceContext());
    AuthorizationSession::get(cc())->grantInternalAuthorization(&cc());

    while (true) {
        {
            auto startTime = Date_t::now();
            // Wait until either ttlMonitorSleepSecs passes, a shutdown is requested, or the
            // sleeping time has changed.
            stdx::unique_lock lk(_stateMutex);
            auto deadline = startTime + _ttlMonitorSleepSecs;

            MONGO_IDLE_THREAD_BLOCK;
            while (Date_t::now() <= deadline && !_shuttingDown) {
                _notificationCV.wait_until(lk, deadline.toSystemTimePoint());
                // Recompute the deadline in case the sleep time has changed since we started.
                // The new deadline is still measured from 'startTime', not from now.
                auto newDeadline = startTime + _ttlMonitorSleepSecs;
                if (deadline != newDeadline) {
                    LOGV2_INFO(7005501,
                               "TTL sleep deadline has changed",
                               "oldDeadline"_attr = deadline,
                               "newDeadline"_attr = newDeadline);
                    deadline = newDeadline;
                }
            }

            if (_shuttingDown) {
                return;
            }
        }

        LOGV2_DEBUG(22528, 3, "thread awake");

        if (!ttlMonitorEnabled.load()) {
            LOGV2_DEBUG(22529, 1, "disabled");
            continue;
        }

        if (lockedForWriting()) {
            // Note: this is not perfect as you can go into fsync+lock between this and actually
            // doing the delete later.
            LOGV2_DEBUG(22530, 3, "locked for writing");
            continue;
        }

        // Neither kind of exception ends the loop: the pass is abandoned and the monitor goes
        // back to sleep.
        try {
            _doTTLPass();
        } catch (const WriteConflictException&) {
            LOGV2_DEBUG(22531, 1, "got WriteConflictException");
        } catch (const DBException& ex) {
            LOGV2_WARNING(22537,
                          "TTLMonitor was interrupted, waiting before doing another pass",
                          "interruption"_attr = ex,
                          "wait"_attr = Milliseconds(Seconds(ttlMonitorSleepSecs.load())));
        }
    }
}

// Sets '_shuttingDown' under '_stateMutex', wakes the monitor and then calls wait().
// NOTE(review): wait() is not defined in this file; presumably it blocks until run() has
// returned -- confirm against BackgroundJob.
void TTLMonitor::shutdown() {
    LOGV2(3684100, "Shutting down TTL collection monitor thread");
    {
        stdx::lock_guard lk(_stateMutex);
        _shuttingDown = true;
        _notificationCV.notify_all();
    }
    wait();
    LOGV2(3684101, "Finished shutting down TTL collection monitor thread");
}

// Runs one full TTL pass on a fresh OperationContext: sub-passes are repeated until one of them
// reports that no work is left.
void TTLMonitor::_doTTLPass() {
    const ServiceContext::UniqueOperationContext opCtxPtr = cc().makeOperationContext();
    OperationContext* opCtx = opCtxPtr.get();

    hangTTLMonitorBetweenPasses.pauseWhileSet(opCtx);

    // Increment the metric after the TTL work has been finished.
    ON_BLOCK_EXIT([&] { ttlPasses.increment(); });

    // Tracks the number of consecutive subpasses that have failed to exhaust a collection of TTL
    // deletes. If a collection incurs 'ttlCollLowPrioritySubpassLimit', then all TTL deletes on the
    // collection are executed at 'normal' priority until there are no TTL deletes remaining on the
    // collection.
    stdx::unordered_map collSubpassHistory;

    bool moreToDelete = true;
    while (moreToDelete) {
        // Sub-passes may not delete all documents in the interest of fairness. If a sub-pass
        // indicates that it did not delete everything possible, we continue performing sub-passes.
        // This maintains the semantic that a full TTL pass deletes everything it possibly can
        // before sleeping periodically.
        moreToDelete = _doTTLSubPass(opCtx, collSubpassHistory);
    }
}

// Runs one sub-pass over all TTL indexes known to the TTLCollectionCache and updates
// 'collSubpassHistory' with the collections that still have work left.
//
// Returns true if at least one index reported that more expired documents may remain, false
// otherwise (including when the sub-pass is skipped because the node is not readable).
bool TTLMonitor::_doTTLSubPass(
    OperationContext* opCtx, stdx::unordered_map& collSubpassHistory) {
    // If part of replSet but not in a readable state (e.g. during initial sync), skip.
    // This return happens before the ON_BLOCK_EXIT below is registered, so a skipped sub-pass is
    // not counted in 'ttlSubPasses'.
    if (repl::ReplicationCoordinator::get(opCtx)->getReplicationMode() ==
            repl::ReplicationCoordinator::modeReplSet &&
        !repl::ReplicationCoordinator::get(opCtx)->getMemberState().readable())
        return false;

    ON_BLOCK_EXIT([&] { ttlSubPasses.increment(); });

    TTLCollectionCache& ttlCollectionCache = TTLCollectionCache::get(getGlobalServiceContext());

    // Refresh view of current TTL indexes - prevents starvation if a new TTL index is introduced
    // during a long running pass.
    TTLCollectionCache::InfoMap work = ttlCollectionCache.getTTLInfos();

    // Before the subpass begins work, compute the priority at which TTL deletes should be executed
    // on each collection. By default, TTL deletes are 'low' priority. Only collections where TTL
    // deletes have fallen behind over several subpasses are promoted to 'normal' priority TTL
    // deletes.
    stdx::unordered_map ttlPriorityMap;
    auto numNormalPriorityCollections =
        populateTTLPriorityMap(work, collSubpassHistory, ttlPriorityMap);
    ttlCollSubpassesIncreasedPriority.increment(numNormalPriorityCollections);

    // When batching is enabled, _doTTLIndexDelete will limit the amount of work it
    // performs in both time and the number of documents it deletes. If it reaches one
    // of these limits on an index, it will return moreToDelete as true, and we will
    // re-visit it, but only after passing through every other TTL index. We repeat this
    // process until we hit the ttlMonitorSubPassTargetSecs time limit.
    //
    // When batching is disabled, _doTTLIndexDelete will delete as many documents as
    // possible without limit.
    Timer timer;
    do {
        // Indexes that report remaining work in this round; they become the next round's 'work'.
        TTLCollectionCache::InfoMap moreWork;
        for (const auto& [uuid, infos] : work) {
            // If there are multiple TTL indexes on a TTL collection, and any of those have fallen
            // behind TTL inserts over consecutive subpasses, raising the priority to
            // 'AdmissionContext::Priority::kNormal' for one index means the priority will be
            // 'normal' for all indexes.
            AdmissionContext::Priority priority = getTTLPriority(uuid, ttlPriorityMap);
            // The guard lives for the duration of this collection's index loop only.
            ScopedAdmissionPriorityForLock priorityGuard(opCtx->lockState(), priority);

            for (const auto& info : infos) {
                bool moreToDelete = _doTTLIndexDelete(opCtx, &ttlCollectionCache, uuid, info);
                if (moreToDelete) {
                    moreWork[uuid].push_back(info);
                }
            }
        }

        work = moreWork;
    } while (!work.empty() &&
             Seconds(timer.seconds()) < Seconds(ttlMonitorSubPassTargetSecs.load()));

    updateCollSubpassHistory(collSubpassHistory, work);

    // More work signals there may be more expired documents to visit.
    return !work.empty();
}

// Deletes expired documents for a single TTL index ('info') of the collection identified by
// 'uuid'.
//
// Returns true if more expired documents may remain for this index, false otherwise. False is
// also returned whenever the index is skipped (collection not visible yet, dropped, resharding or
// drop-pending namespace, node cannot accept writes, tenant migration blocking TTL) and when an
// error was logged and swallowed. Exceptions are rethrown only when the operation context itself
// has been interrupted.
bool TTLMonitor::_doTTLIndexDelete(OperationContext* opCtx,
                                   TTLCollectionCache* ttlCollectionCache,
                                   const UUID& uuid,
                                   const TTLCollectionCache::Info& info) {
    // Skip collections that have not been made visible yet. The TTLCollectionCache
    // already has the index information available, so we want to avoid removing it
    // until the collection is visible.
    auto collectionCatalog = CollectionCatalog::get(opCtx);
    if (collectionCatalog->isCollectionAwaitingVisibility(uuid)) {
        return false;
    }

    // No namespace for this UUID means the collection was dropped: remove the index from the
    // cache so it is not visited again.
    auto nss = collectionCatalog->lookupNSSByUUID(opCtx, uuid);
    if (!nss) {
        if (info.isClustered()) {
            ttlCollectionCache->deregisterTTLClusteredIndex(uuid);
        } else {
            ttlCollectionCache->deregisterTTLIndexByName(uuid, info.getIndexName());
        }
        return false;
    }

    if (nss->isTemporaryReshardingCollection() || nss->isDropPendingNamespace()) {
        // For resharding, the donor shard primary is responsible for performing the TTL
        // deletions.
        return false;
    }

    try {
        uassertStatusOK(userAllowedWriteNS(opCtx, *nss));

        // The routing info is only consulted when a catalog cache is available.
        auto catalogCache = Grid::get(opCtx)->catalogCache();
        auto sii = catalogCache
            ? uassertStatusOK(catalogCache->getCollectionRoutingInfo(opCtx, *nss)).sii
            : boost::none;
        // Attach IGNORED placement version to skip orphans (the range deleter will clear them up)
        const auto shardVersion = ShardVersionFactory::make(
            ChunkVersion::IGNORED(),
            sii ? boost::make_optional(sii->getCollectionIndexes()) : boost::none);
        // The shard role is set before the collection is acquired and stays in scope for the
        // rest of the try block.
        auto scopedRole = ScopedSetShardRole(opCtx, *nss, shardVersion, boost::none);
        const auto coll =
            acquireCollection(opCtx,
                              CollectionAcquisitionRequest(*nss,
                                                           {boost::none, shardVersion},
                                                           repl::ReadConcernArgs::get(opCtx),
                                                           AcquisitionPrerequisites::kWrite),
                              MODE_IX);

        // The collection with `uuid` might be renamed before the lock and the wrong namespace
        // would be locked and looked up so we double check here.
        if (!coll.exists() || coll.uuid() != uuid)
            return false;

        // Allow TTL deletion on non-capped collections, and on capped clustered collections.
        const auto& collectionPtr = coll.getCollectionPtr();
        invariant(!collectionPtr->isCapped() ||
                  (collectionPtr->isCapped() && collectionPtr->isClustered()));

        if (MONGO_unlikely(hangTTLMonitorWithLock.shouldFail())) {
            LOGV2(22534,
                  "Hanging due to hangTTLMonitorWithLock fail point",
                  "ttlPasses"_attr = ttlPasses.get());
            hangTTLMonitorWithLock.pauseWhileSet(opCtx);
        }

        if (!repl::ReplicationCoordinator::get(opCtx)->canAcceptWritesFor(opCtx, *nss)) {
            return false;
        }

        // Postpone TTL deletes while a recipient-side tenant migration access blocker for this
        // database asks for TTL to be blocked.
        std::shared_ptr mtab;
        if (nullptr !=
                (mtab = TenantMigrationAccessBlockerRegistry::get(opCtx->getServiceContext())
                            .getTenantMigrationAccessBlockerForDbName(coll.nss().dbName(),
                                                                      MtabType::kRecipient)) &&
            mtab->checkIfShouldBlockTTL()) {
            LOGV2_DEBUG(53768,
                        1,
                        "Postpone TTL of DB because of active tenant migration",
                        "tenantMigrationAccessBlocker"_attr = mtab->getDebugInfo().jsonString(),
                        "database"_attr = coll.nss().dbName());
            return false;
        }

        ResourceConsumption::ScopedMetricsCollector scopedMetrics(opCtx, nss->db().toString());

        // Clustered TTL uses a bounded collection scan; every other TTL index uses an index scan.
        if (info.isClustered()) {
            return _deleteExpiredWithCollscan(opCtx, ttlCollectionCache, coll);
        } else {
            return _deleteExpiredWithIndex(opCtx, ttlCollectionCache, coll, info.getIndexName());
        }
    } catch (const ExceptionForCat& ex) {
        // The TTL index tried to delete some information from a sharded collection
        // through a direct operation against the shard but the filtering metadata was
        // not available or the index version in the cache was stale.
        //
        // The current TTL task cannot be completed. However, if the critical section is
        // not held the code below will fire an asynchronous refresh, hoping that the
        // next time this task is re-executed the filtering information is already
        // present. It will also invalidate the cache, causing the index information to be refreshed
        // on the next attempt.
        if (auto staleInfo = ex.extraInfo();
            staleInfo && !staleInfo->getCriticalSectionSignal()) {
            auto executor = Grid::get(opCtx)->getExecutorPool()->getFixedExecutor();
            // The continuation captures everything it needs by value and creates its own client
            // and operation context; the 'opCtx' inside the lambda is that new context, not the
            // TTL monitor's.
            ExecutorFuture(executor)
                .then([serviceContext = opCtx->getServiceContext(), nss, staleInfo] {
                    ThreadClient tc("TTLShardVersionRecovery", serviceContext);
                    auto uniqueOpCtx = tc->makeOperationContext();
                    auto opCtx = uniqueOpCtx.get();

                    // Invalidate cache in case index version is stale
                    if (staleInfo->getVersionWanted()) {
                        Grid::get(opCtx)
                            ->catalogCache()
                            ->invalidateShardOrEntireCollectionEntryForShardedCollection(
                                *nss, staleInfo->getVersionWanted(), staleInfo->getShardId());
                    }

                    // The outcome of the refresh is deliberately ignored.
                    onCollectionPlacementVersionMismatchNoExcept(
                        opCtx,
                        *nss,
                        staleInfo->getVersionWanted()
                            ? boost::make_optional(
                                  staleInfo->getVersionWanted()->placementVersion())
                            : boost::none)
                        .ignore();
                })
                .getAsync([](auto) {});
        }
        LOGV2_WARNING(6353000,
                      "Error running TTL job on collection: the shard should refresh "
                      "before being able to complete this task",
                      logAttrs(*nss),
                      "error"_attr = ex);
        return false;
    } catch (const DBException& ex) {
        if (!opCtx->checkForInterruptNoAssert().isOK()) {
            // The exception is relevant to the entire TTL monitoring process, not just the specific
            // TTL index. Let the exception escape so it can be addressed at the higher monitoring
            // layer.
            throw;
        }

        LOGV2_ERROR(
            5400703, "Error running TTL job on collection", logAttrs(*nss), "error"_attr = ex);
        return false;
    }
}

// Deletes expired documents of 'collection' through an index scan over TTL index 'indexName'.
//
// Returns true only when batching is enabled and the batched delete reports that its pass target
// was met, i.e. more expired documents may remain. Returns false in every other case, including
// when the index is missing or not usable for TTL, and when the plan was killed.
bool TTLMonitor::_deleteExpiredWithIndex(OperationContext* opCtx,
                                         TTLCollectionCache* ttlCollectionCache,
                                         const ScopedCollectionAcquisition& collection,
                                         std::string indexName) {
    const auto& collectionPtr = collection.getCollectionPtr();
    // An index that no longer exists on the collection is removed from the cache.
    if (!collectionPtr->isIndexPresent(indexName)) {
        ttlCollectionCache->deregisterTTLIndexByName(collection.uuid(), indexName);
        return false;
    }

    BSONObj spec = collectionPtr->getIndexSpec(indexName);
    const IndexDescriptor* desc =
        getValidTTLIndex(opCtx, ttlCollectionCache, collectionPtr, spec, indexName);

    if (!desc) {
        return false;
    }

    LOGV2_DEBUG(22533,
                1,
                "running TTL job for index",
                logAttrs(collection.nss()),
                "key"_attr = desc->keyPattern(),
                "name"_attr = indexName);

    // The scan covers index keys from 'kDawnOfTime' up to the expiration date, both inclusive.
    auto expireAfterSeconds = spec[IndexDescriptor::kExpireAfterSecondsFieldName].safeNumberLong();
    const Date_t kDawnOfTime = Date_t::fromMillisSinceEpoch(std::numeric_limits::min());
    const auto expirationDate = safeExpirationDate(opCtx, collectionPtr, expireAfterSeconds);
    const BSONObj startKey = BSON("" << kDawnOfTime);
    const BSONObj endKey = BSON("" << expirationDate);

    auto key = desc->keyPattern();
    // The canonical check as to whether a key pattern element is "ascending" or
    // "descending" is (elt.number() >= 0). This is defined by the Ordering class.
    const InternalPlanner::Direction direction = (key.firstElement().number() >= 0)
        ? InternalPlanner::Direction::FORWARD
        : InternalPlanner::Direction::BACKWARD;

    // We need to pass into the DeleteStageParams (below) a CanonicalQuery with a BSONObj that
    // queries for the expired documents correctly so that we do not delete documents that are
    // not actually expired when our snapshot changes during deletion.
    const char* keyFieldName = key.firstElement().fieldName();
    BSONObj query =
        BSON(keyFieldName << BSON("$gte" << kDawnOfTime << "$lte" << expirationDate));
    auto findCommand = std::make_unique(collection.nss());
    findCommand->setFilter(query);
    auto canonicalQuery = CanonicalQuery::canonicalize(opCtx, std::move(findCommand));
    invariant(canonicalQuery.getStatus());

    // 'params' only receives a raw pointer to the canonical query; 'canonicalQuery' itself stays
    // in scope until the end of this function.
    auto params = std::make_unique();
    params->isMulti = true;
    params->canonicalQuery = canonicalQuery.getValue().get();

    // Maintain a consistent view of whether batching is enabled - batching depends on
    // parameters that can be set at runtime, and it is illegal to try to get
    // BatchedDeleteStageStats from a non-batched delete.
    bool batchingEnabled = isBatchingEnabled();

    Timer timer;
    auto exec = InternalPlanner::deleteWithIndexScan(opCtx,
                                                     collection,
                                                     std::move(params),
                                                     desc,
                                                     startKey,
                                                     endKey,
                                                     BoundInclusion::kIncludeBothStartAndEndKeys,
                                                     PlanYieldPolicy::YieldPolicy::YIELD_AUTO,
                                                     direction,
                                                     getBatchedDeleteStageParams(batchingEnabled));

    try {
        const auto numDeleted = exec->executeDelete();
        ttlDeletedDocuments.increment(numDeleted);

        // Log the delete only when the slow-operation check (threshold 'slowMS') selects it.
        const auto duration = Milliseconds(timer.millis());
        if (shouldLogSlowOpWithSampling(opCtx,
                                        logv2::LogComponent::kIndex,
                                        duration,
                                        Milliseconds(serverGlobalParams.slowMS.load()))
                .first) {
            LOGV2(5479200,
                  "Deleted expired documents using index",
                  logAttrs(collection.nss()),
                  "index"_attr = indexName,
                  "numDeleted"_attr = numDeleted,
                  "duration"_attr = duration);
        }

        if (batchingEnabled) {
            auto batchedDeleteStats = exec->getBatchedDeleteStats();
            // A pass target met implies there may be more to delete.
            return batchedDeleteStats.passTargetMet;
        }
    } catch (const ExceptionFor&) {
        // It is expected that a collection drop can kill a query plan while the TTL monitor
        // is deleting an old document, so ignore this error.
    }
    return false;
}

// Deletes expired documents of a clustered 'collection' through a bounded collection scan.
//
// Throws (uassert 5400701) if the collection options carry no clustered index. If the collection
// has no 'expireAfterSeconds', its clustered TTL entry is removed from the cache and false is
// returned.
//
// Returns true only when batching is enabled and the batched delete reports that its pass target
// was met, i.e. more expired documents may remain. Returns false otherwise, including when the
// plan was killed.
bool TTLMonitor::_deleteExpiredWithCollscan(OperationContext* opCtx,
                                            TTLCollectionCache* ttlCollectionCache,
                                            const ScopedCollectionAcquisition& collection) {
    const auto& collectionPtr = collection.getCollectionPtr();
    const auto& collOptions = collectionPtr->getCollectionOptions();
    uassert(5400701,
            "collection is not clustered but is described as being TTL",
            collOptions.clusteredIndex);
    invariant(collectionPtr->isClustered());

    auto expireAfterSeconds = collOptions.expireAfterSeconds;
    if (!expireAfterSeconds) {
        ttlCollectionCache->deregisterTTLClusteredIndex(collection.uuid());
        return false;
    }

    LOGV2_DEBUG(5400704, 1, "running TTL job for clustered collection", logAttrs(collection.nss()));

    // Scan bounds: from the minimum representable date up to the expiration date.
    const auto startId = makeCollScanStartBound(collectionPtr, Date_t::min());

    const auto expirationDate = safeExpirationDate(opCtx, collectionPtr, *expireAfterSeconds);
    const auto endId = makeCollScanEndBound(collectionPtr, expirationDate);

    auto params = std::make_unique();
    params->isMulti = true;

    // Maintain a consistent view of whether batching is enabled - batching depends on
    // parameters that can be set at runtime, and it is illegal to try to get
    // BatchedDeleteStageStats from a non-batched delete.
    bool batchingEnabled = isBatchingEnabled();

    // Deletes records using a bounded collection scan from the beginning of time to the
    // expiration time (inclusive).
    Timer timer;
    auto exec = InternalPlanner::deleteWithCollectionScan(
        opCtx,
        collection,
        std::move(params),
        PlanYieldPolicy::YieldPolicy::YIELD_AUTO,
        InternalPlanner::Direction::FORWARD,
        startId,
        endId,
        CollectionScanParams::ScanBoundInclusion::kIncludeBothStartAndEndRecords,
        getBatchedDeleteStageParams(batchingEnabled));

    try {
        const auto numDeleted = exec->executeDelete();
        ttlDeletedDocuments.increment(numDeleted);

        // Log the delete only when the slow-operation check (threshold 'slowMS') selects it.
        const auto duration = Milliseconds(timer.millis());
        if (shouldLogSlowOpWithSampling(opCtx,
                                        logv2::LogComponent::kIndex,
                                        duration,
                                        Milliseconds(serverGlobalParams.slowMS.load()))
                .first) {
            LOGV2(5400702,
                  "Deleted expired documents using collection scan",
                  logAttrs(collection.nss()),
                  "numDeleted"_attr = numDeleted,
                  "duration"_attr = duration);
        }
        if (batchingEnabled) {
            auto batchedDeleteStats = exec->getBatchedDeleteStats();
            // A pass target met implies there may be more work to be done on the index.
            return batchedDeleteStats.passTargetMet;
        }
    } catch (const ExceptionFor&) {
        // It is expected that a collection drop can kill a query plan while the TTL monitor
        // is deleting an old document, so ignore this error.
    }

    return false;
}

// Creates a TTLMonitor, starts it with go() and only then installs it on 'serviceContext'.
void startTTLMonitor(ServiceContext* serviceContext) {
    std::unique_ptr ttlMonitor = std::make_unique();
    ttlMonitor->go();
    TTLMonitor::set(serviceContext, std::move(ttlMonitor));
}

// Shuts down the TTLMonitor installed on 'serviceContext', if there is one.
void shutdownTTLMonitor(ServiceContext* serviceContext) {
    TTLMonitor* ttlMonitor = TTLMonitor::get(serviceContext);
    // We allow the TTLMonitor not to be set in case shutdown occurs before the thread has been
    // initialized.
    if (ttlMonitor) {
        ttlMonitor->shutdown();
    }
}

// Called when a step-up completes. Walks every TTL index in the TTLCollectionCache and, for each
// non-clustered index whose cached info reports an invalid 'expireAfterSeconds', runs a collMod
// that sets 'expireAfterSeconds' to 'kExpireAfterSecondsForInactiveTTLIndex'.
//
// Collections that are not yet visible, were dropped, or live in a temporary resharding or
// drop-pending namespace are skipped. Interruption exceptions are rethrown; any other DBException
// is logged and processing continues with the next collection.
void TTLMonitor::onStepUp(OperationContext* opCtx) {
    auto&& ttlCollectionCache = TTLCollectionCache::get(opCtx->getServiceContext());
    auto ttlInfos = ttlCollectionCache.getTTLInfos();
    for (const auto& [uuid, infos] : ttlInfos) {
        auto collectionCatalog = CollectionCatalog::get(opCtx);
        if (collectionCatalog->isCollectionAwaitingVisibility(uuid)) {
            continue;
        }

        // No namespace for this UUID means the collection was dropped.
        auto nss = collectionCatalog->lookupNSSByUUID(opCtx, uuid);
        if (!nss) {
            continue;
        }

        if (nss->isTemporaryReshardingCollection() || nss->isDropPendingNamespace()) {
            continue;
        }

        try {
            uassertStatusOK(userAllowedWriteNS(opCtx, *nss));

            for (const auto& info : infos) {
                // Skip clustered indexes with TTL. This includes time-series collections.
                if (info.isClustered()) {
                    continue;
                }

                // Only indexes flagged as having an invalid 'expireAfterSeconds' need fixing.
                if (!info.isExpireAfterSecondsInvalid()) {
                    continue;
                }

                auto indexName = info.getIndexName();
                LOGV2(6847700,
                      "Running collMod to fix TTL index with invalid 'expireAfterSeconds'.",
                      "ns"_attr = *nss,
                      "uuid"_attr = uuid,
                      "name"_attr = indexName,
                      "expireAfterSecondsNew"_attr =
                          index_key_validate::kExpireAfterSecondsForInactiveTTLIndex);

                // Compose collMod command to amend 'expireAfterSeconds' to same value that
                // would be used by listIndexes() to convert a NaN value in the catalog.
                CollModIndex collModIndex;
                collModIndex.setName(StringData{indexName});
                collModIndex.setExpireAfterSeconds(mongo::durationCount(
                    index_key_validate::kExpireAfterSecondsForInactiveTTLIndex));
                CollMod collModCmd{*nss};
                collModCmd.getCollModRequest().setIndex(collModIndex);

                // processCollModCommand() will acquire MODE_X access to the collection.
                BSONObjBuilder builder;
                uassertStatusOK(
                    processCollModCommand(opCtx, {nss->dbName(), uuid}, collModCmd, &builder));
                auto result = builder.obj();
                LOGV2(
                    6847701,
                    "Successfully fixed TTL index with invalid 'expireAfterSeconds' using collMod",
                    "ns"_attr = *nss,
                    "uuid"_attr = uuid,
                    "name"_attr = indexName,
                    "result"_attr = result);
            }
        } catch (const ExceptionForCat&) {
            // The exception is relevant to the entire TTL monitoring process, not just the specific
            // TTL index. Let the exception escape so it can be addressed at the higher monitoring
            // layer.
            throw;
        } catch (const DBException& ex) {
            LOGV2_ERROR(6835901,
                        "Error checking TTL job on collection during step up",
                        logAttrs(*nss),
                        "error"_attr = ex);
            continue;
        }
    }
}

// Test-only accessor for the current value of the 'ttlPasses' counter.
long long TTLMonitor::getTTLPasses_forTest() {
    return ttlPasses.get();
}

// Test-only accessor for the current value of the 'ttlSubPasses' counter.
long long TTLMonitor::getTTLSubPasses_forTest() {
    return ttlSubPasses.get();
}

}  // namespace mongo