diff options
author | Yuhong Zhang <yuhong.zhang@mongodb.com> | 2022-09-06 18:56:16 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-09-06 20:33:38 +0000 |
commit | 1a8f5ae47b2cd941b96cf8795fa45a167ac4fbeb (patch) | |
tree | 4fcbf5c206427d070d918f3376cd4572cc7a801d /src/mongo/db/catalog/validate_state.cpp | |
parent | d7926ae6e17f1a6b5075ef53b0daf6a562576499 (diff) | |
download | mongo-1a8f5ae47b2cd941b96cf8795fa45a167ac4fbeb.tar.gz |
SERVER-68689 Switch background validation to use checkpoint cursors again
Diffstat (limited to 'src/mongo/db/catalog/validate_state.cpp')
-rw-r--r-- | src/mongo/db/catalog/validate_state.cpp | 149 |
1 files changed, 104 insertions, 45 deletions
diff --git a/src/mongo/db/catalog/validate_state.cpp b/src/mongo/db/catalog/validate_state.cpp index 2fb6505ce1b..0f9301c31f8 100644 --- a/src/mongo/db/catalog/validate_state.cpp +++ b/src/mongo/db/catalog/validate_state.cpp @@ -40,6 +40,7 @@ #include "mongo/db/db_raii.h" #include "mongo/db/index/index_access_method.h" #include "mongo/db/operation_context.h" +#include "mongo/db/storage/durable_catalog.h" #include "mongo/logv2/log.h" #include "mongo/util/fail_point.h" @@ -69,7 +70,6 @@ ValidateState::ValidateState(OperationContext* opCtx, // being validated. _noPBWM.emplace(opCtx->lockState()); - _globalLock.emplace(opCtx, MODE_IS); _databaseLock.emplace(opCtx, _nss.db(), MODE_IS); _collectionLock.emplace(opCtx, _nss, MODE_IS); } else { @@ -196,11 +196,6 @@ void ValidateState::_yieldCursors(OperationContext* opCtx) { _traverseRecordStoreCursor->save(); _seekRecordStoreCursor->save(); - if (isBackground() && _validateTs) { - // Reset snapshot to help ameliorate WiredTiger cache pressure. - opCtx->recoveryUnit()->refreshSnapshot(); - } - // Restore all the cursors. for (const auto& indexCursor : _indexCursors) { indexCursor.second->restore(); @@ -218,25 +213,12 @@ void ValidateState::initializeCursors(OperationContext* opCtx) { invariant(!_traverseRecordStoreCursor && !_seekRecordStoreCursor && _indexCursors.size() == 0 && _indexes.size() == 0); - // Background validation (on replica sets) will read from a snapshot opened on the kNoOverlap - // read source, which is the minimum of the last applied and all durable timestamps, instead of - // the latest data. Using the kNoOverlap read source prevents us from having to take the PBWM - // lock, which blocks replication. We cannot solely rely on the all durable timestamp as it can - // be set while we're in the middle of applying a batch on secondary nodes. - // Background validation on standalones uses the kNoTimestamp read source because standalones - // have no timestamps to use for maintaining a consistent snapshot. + // Background validation reads from the last stable checkpoint instead of the latest data. This + // allows concurrent writes to go ahead without interfering with validation's view of the data. RecoveryUnit::ReadSource rs = RecoveryUnit::ReadSource::kNoTimestamp; if (isBackground()) { opCtx->recoveryUnit()->abandonSnapshot(); - // Background validation is expecting to read from the no overlap timestamp, but - // standalones do not support timestamps. Therefore, if this process is currently running as - // a standalone, don't use a timestamp. - - if (repl::ReplicationCoordinator::get(opCtx)->isReplEnabled()) { - rs = RecoveryUnit::ReadSource::kNoOverlap; - } else { - rs = RecoveryUnit::ReadSource::kNoTimestamp; - } + rs = RecoveryUnit::ReadSource::kCheckpoint; opCtx->recoveryUnit()->setTimestampReadSource(rs); } @@ -247,34 +229,111 @@ void ValidateState::initializeCursors(OperationContext* opCtx) { _dataThrottle.turnThrottlingOff(); } - _traverseRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( - opCtx, _collection->getRecordStore(), &_dataThrottle); - _seekRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( - opCtx, _collection->getRecordStore(), &_dataThrottle); + // Capture the checkpointTimestamp before and after opening the cursors. If it has moved, the + // cursors are out of sync. + auto storageEngine = opCtx->getServiceContext()->getStorageEngine(); + boost::optional<Timestamp> checkpointTimestamp = boost::none; + boost::optional<Timestamp> currCheckpointTimestamp = boost::none; + do { + _indexCursors.clear(); + _indexes.clear(); + checkpointTimestamp = storageEngine->getLastStableRecoveryTimestamp(); + StringSet readyDurableIndexes; + try { + _traverseRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( + opCtx, _collection->getRecordStore(), &_dataThrottle); + _seekRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( + opCtx, _collection->getRecordStore(), &_dataThrottle); + DurableCatalog::get(opCtx)->getReadyIndexes( + opCtx, _collection->getCatalogId(), &readyDurableIndexes); + } catch (const ExceptionFor<ErrorCodes::CursorNotFound>& ex) { + invariant(isBackground()); + // End the validation if we can't open a checkpoint cursor on the collection. + LOGV2( + 6868900, + "Skipping background validation because the collection is not yet in a checkpoint", + "nss"_attr = _nss, + "ex"_attr = ex); + throw; + } - if (rs != RecoveryUnit::ReadSource::kNoTimestamp) { - invariant(rs == RecoveryUnit::ReadSource::kNoOverlap); - invariant(isBackground()); - _validateTs = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(opCtx); - } + const IndexCatalog* indexCatalog = _collection->getIndexCatalog(); + // The index iterator for ready indexes is timestamp-aware and will only return indexes that + // are visible at our read time. + const auto it = + indexCatalog->getIndexIterator(opCtx, IndexCatalog::InclusionPolicy::kReady); + while (it->more()) { + const IndexCatalogEntry* entry = it->next(); + const IndexDescriptor* desc = entry->descriptor(); + + // Filter out any in-memory index in the collection that is not in our PIT view of the + // MDB catalog. This is only important when background:true because we are then reading + // from the checkpoint's view of the MDB catalog and data. + if (isBackground() && + readyDurableIndexes.find(desc->indexName()) == readyDurableIndexes.end()) { + LOGV2( + 6868901, + "Skipping background validation on the index because the index is not yet in a " + "checkpoint.", + "desc_indexName"_attr = desc->indexName(), + "nss"_attr = _nss); + continue; + } - const IndexCatalog* indexCatalog = _collection->getIndexCatalog(); - // The index iterator for ready indexes is timestamp-aware and will only return indexes that - // are visible at our read time. - const auto it = indexCatalog->getIndexIterator(opCtx, IndexCatalog::InclusionPolicy::kReady); - while (it->more()) { - const IndexCatalogEntry* entry = it->next(); - const IndexDescriptor* desc = entry->descriptor(); + // Read the index's ident from disk (the checkpoint if background:true). If it does not + // match the in-memory ident saved in the IndexCatalogEntry, then our PIT view of the + // index is old and the index has been dropped and recreated. In this case we will skip + // it since there is no utility in checking a dropped index (we also cannot currently + // access it because its in-memory representation is gone). + auto diskIndexIdent = + opCtx->getServiceContext()->getStorageEngine()->getCatalog()->getIndexIdent( + opCtx, _collection->getCatalogId(), desc->indexName()); + if (entry->getIdent() != diskIndexIdent) { + LOGV2(6868902, + "Skipping validation on the index because the index was recreated and is not " + "yet in a checkpoint.", + "desc_indexName"_attr = desc->indexName(), + "nss"_attr = _nss); + continue; + } - auto iam = entry->accessMethod()->asSortedData(); - if (!iam) - continue; + auto iam = entry->accessMethod()->asSortedData(); + if (!iam) + continue; + + _indexCursors.emplace( + desc->indexName(), + std::make_unique<SortedDataInterfaceThrottleCursor>(opCtx, iam, &_dataThrottle)); + + // Skip any newly created indexes that, because they were built with a WT bulk loader, + // are checkpoint'ed but not yet consistent with the rest of checkpoint's PIT view of + // the data. + if (isBackground() && + opCtx->getServiceContext()->getStorageEngine()->isInIndividuallyCheckpointedIndexes( + diskIndexIdent)) { + _indexCursors.erase(desc->indexName()); + LOGV2( + 6868903, + "Skipping background validation on the index because the index data is not yet " + "consistent in the checkpoint.", + "desc_indexName"_attr = desc->indexName(), + "nss"_attr = _nss); + continue; + } - _indexCursors.emplace( - desc->indexName(), - std::make_unique<SortedDataInterfaceThrottleCursor>(opCtx, iam, &_dataThrottle)); + _indexes.push_back(indexCatalog->getEntryShared(desc)); + } + currCheckpointTimestamp = + isBackground() ? storageEngine->getLastStableRecoveryTimestamp() : checkpointTimestamp; + // We will retry if a checkpoint happens during opening the cursors or break out of the loop + // for foreground validation. Due to the limited number of indexes a collection can have, it + // is expected to have at most one retry. + } while (currCheckpointTimestamp != checkpointTimestamp); - _indexes.push_back(indexCatalog->getEntryShared(desc)); + if (rs != RecoveryUnit::ReadSource::kNoTimestamp) { + invariant(rs == RecoveryUnit::ReadSource::kCheckpoint); + invariant(isBackground()); + _validateTs = checkpointTimestamp; } // Because SeekableRecordCursors don't have a method to reset to the start, we save and then |