diff options
Diffstat (limited to 'src/mongo/db/catalog')
-rw-r--r-- | src/mongo/db/catalog/collection_validation.cpp | 12 | ||||
-rw-r--r-- | src/mongo/db/catalog/collection_validation_test.cpp | 17 | ||||
-rw-r--r-- | src/mongo/db/catalog/validate_state.cpp | 149 | ||||
-rw-r--r-- | src/mongo/db/catalog/validate_state.h | 6 | ||||
-rw-r--r-- | src/mongo/db/catalog/validate_state_test.cpp | 77 |
5 files changed, 196 insertions, 65 deletions
diff --git a/src/mongo/db/catalog/collection_validation.cpp b/src/mongo/db/catalog/collection_validation.cpp index 7b39f823443..4645e18aece 100644 --- a/src/mongo/db/catalog/collection_validation.cpp +++ b/src/mongo/db/catalog/collection_validation.cpp @@ -625,6 +625,18 @@ Status validate(OperationContext* opCtx, "corruption found", logAttrs(validateState.nss()), logAttrs(validateState.uuid())); + } catch (ExceptionFor<ErrorCodes::CursorNotFound>&) { + invariant(validateState.isBackground()); + string warning = str::stream() + << "Collection validation with {background: true} validates" + << " the latest checkpoint (data in a snapshot written to disk in a consistent" + << " way across all data files). During this validation, some tables have not yet been" + << " checkpointed."; + results->warnings.push_back(warning); + + // Nothing to validate, so it must be valid. + results->valid = true; + return Status::OK(); } catch (const DBException& e) { if (opCtx->isKillPending() || e.code() == ErrorCodes::Interrupted) { LOGV2_OPTIONS(5160301, diff --git a/src/mongo/db/catalog/collection_validation_test.cpp b/src/mongo/db/catalog/collection_validation_test.cpp index f0cfa7842c8..15f8f475aec 100644 --- a/src/mongo/db/catalog/collection_validation_test.cpp +++ b/src/mongo/db/catalog/collection_validation_test.cpp @@ -45,6 +45,9 @@ namespace { const NamespaceString kNss = NamespaceString("test.t"); class CollectionValidationTest : public CatalogTestFixture { +protected: + CollectionValidationTest(Options options = {}) : CatalogTestFixture(std::move(options)) {} + private: void setUp() override { CatalogTestFixture::setUp(); @@ -56,6 +59,12 @@ private: }; }; +// Background validation opens checkpoint cursors which requires reading from the disk. +class CollectionValidationDiskTest : public CollectionValidationTest { +protected: + CollectionValidationDiskTest() : CollectionValidationTest(Options{}.ephemeral(false)) {} +}; + /** * Calls validate on collection kNss with both kValidateFull and kValidateNormal validation levels * and verifies the results. @@ -209,7 +218,7 @@ TEST_F(CollectionValidationTest, ValidateEmpty) { /*numInvalidDocuments*/ 0, /*numErrors*/ 0); } -TEST_F(CollectionValidationTest, BackgroundValidateEmpty) { +TEST_F(CollectionValidationDiskTest, BackgroundValidateEmpty) { backgroundValidate(operationContext(), /*valid*/ true, /*numRecords*/ 0, @@ -227,7 +236,7 @@ TEST_F(CollectionValidationTest, Validate) { /*numInvalidDocuments*/ 0, /*numErrors*/ 0); } -TEST_F(CollectionValidationTest, BackgroundValidate) { +TEST_F(CollectionValidationDiskTest, BackgroundValidate) { auto opCtx = operationContext(); backgroundValidate(opCtx, /*valid*/ true, @@ -246,7 +255,7 @@ TEST_F(CollectionValidationTest, ValidateError) { /*numInvalidDocuments*/ 1, /*numErrors*/ 1); } -TEST_F(CollectionValidationTest, BackgroundValidateError) { +TEST_F(CollectionValidationDiskTest, BackgroundValidateError) { auto opCtx = operationContext(); backgroundValidate(opCtx, /*valid*/ false, @@ -280,7 +289,7 @@ void waitUntilValidateFailpointHasBeenReached() { ASSERT(CollectionValidation::getIsValidationPausedForTest()); } -TEST_F(CollectionValidationTest, BackgroundValidateRunsConcurrentlyWithWrites) { +TEST_F(CollectionValidationDiskTest, BackgroundValidateRunsConcurrentlyWithWrites) { auto opCtx = operationContext(); auto serviceContext = opCtx->getServiceContext(); diff --git a/src/mongo/db/catalog/validate_state.cpp b/src/mongo/db/catalog/validate_state.cpp index 2fb6505ce1b..0f9301c31f8 100644 --- a/src/mongo/db/catalog/validate_state.cpp +++ b/src/mongo/db/catalog/validate_state.cpp @@ -40,6 +40,7 @@ #include "mongo/db/db_raii.h" #include "mongo/db/index/index_access_method.h" #include "mongo/db/operation_context.h" +#include "mongo/db/storage/durable_catalog.h" #include "mongo/logv2/log.h" #include "mongo/util/fail_point.h" @@ -69,7 +70,6 @@ ValidateState::ValidateState(OperationContext* opCtx, // being validated. _noPBWM.emplace(opCtx->lockState()); - _globalLock.emplace(opCtx, MODE_IS); _databaseLock.emplace(opCtx, _nss.db(), MODE_IS); _collectionLock.emplace(opCtx, _nss, MODE_IS); } else { @@ -196,11 +196,6 @@ void ValidateState::_yieldCursors(OperationContext* opCtx) { _traverseRecordStoreCursor->save(); _seekRecordStoreCursor->save(); - if (isBackground() && _validateTs) { - // Reset snapshot to help ameliorate WiredTiger cache pressure. - opCtx->recoveryUnit()->refreshSnapshot(); - } - // Restore all the cursors. for (const auto& indexCursor : _indexCursors) { indexCursor.second->restore(); @@ -218,25 +213,12 @@ void ValidateState::initializeCursors(OperationContext* opCtx) { invariant(!_traverseRecordStoreCursor && !_seekRecordStoreCursor && _indexCursors.size() == 0 && _indexes.size() == 0); - // Background validation (on replica sets) will read from a snapshot opened on the kNoOverlap - // read source, which is the minimum of the last applied and all durable timestamps, instead of - // the latest data. Using the kNoOverlap read source prevents us from having to take the PBWM - // lock, which blocks replication. We cannot solely rely on the all durable timestamp as it can - // be set while we're in the middle of applying a batch on secondary nodes. - // Background validation on standalones uses the kNoTimestamp read source because standalones - // have no timestamps to use for maintaining a consistent snapshot. + // Background validation reads from the last stable checkpoint instead of the latest data. This + // allows concurrent writes to go ahead without interfering with validation's view of the data. RecoveryUnit::ReadSource rs = RecoveryUnit::ReadSource::kNoTimestamp; if (isBackground()) { opCtx->recoveryUnit()->abandonSnapshot(); - // Background validation is expecting to read from the no overlap timestamp, but - // standalones do not support timestamps. Therefore, if this process is currently running as - // a standalone, don't use a timestamp. - - if (repl::ReplicationCoordinator::get(opCtx)->isReplEnabled()) { - rs = RecoveryUnit::ReadSource::kNoOverlap; - } else { - rs = RecoveryUnit::ReadSource::kNoTimestamp; - } + rs = RecoveryUnit::ReadSource::kCheckpoint; opCtx->recoveryUnit()->setTimestampReadSource(rs); } @@ -247,34 +229,111 @@ void ValidateState::initializeCursors(OperationContext* opCtx) { _dataThrottle.turnThrottlingOff(); } - _traverseRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( - opCtx, _collection->getRecordStore(), &_dataThrottle); - _seekRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( - opCtx, _collection->getRecordStore(), &_dataThrottle); + // Capture the checkpointTimestamp before and after opening the cursors. If it has moved, the + // cursors are out of sync. + auto storageEngine = opCtx->getServiceContext()->getStorageEngine(); + boost::optional<Timestamp> checkpointTimestamp = boost::none; + boost::optional<Timestamp> currCheckpointTimestamp = boost::none; + do { + _indexCursors.clear(); + _indexes.clear(); + checkpointTimestamp = storageEngine->getLastStableRecoveryTimestamp(); + StringSet readyDurableIndexes; + try { + _traverseRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( + opCtx, _collection->getRecordStore(), &_dataThrottle); + _seekRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( + opCtx, _collection->getRecordStore(), &_dataThrottle); + DurableCatalog::get(opCtx)->getReadyIndexes( + opCtx, _collection->getCatalogId(), &readyDurableIndexes); + } catch (const ExceptionFor<ErrorCodes::CursorNotFound>& ex) { + invariant(isBackground()); + // End the validation if we can't open a checkpoint cursor on the collection. + LOGV2( + 6868900, + "Skipping background validation because the collection is not yet in a checkpoint", + "nss"_attr = _nss, + "ex"_attr = ex); + throw; + } - if (rs != RecoveryUnit::ReadSource::kNoTimestamp) { - invariant(rs == RecoveryUnit::ReadSource::kNoOverlap); - invariant(isBackground()); - _validateTs = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(opCtx); - } + const IndexCatalog* indexCatalog = _collection->getIndexCatalog(); + // The index iterator for ready indexes is timestamp-aware and will only return indexes that + // are visible at our read time. + const auto it = + indexCatalog->getIndexIterator(opCtx, IndexCatalog::InclusionPolicy::kReady); + while (it->more()) { + const IndexCatalogEntry* entry = it->next(); + const IndexDescriptor* desc = entry->descriptor(); + + // Filter out any in-memory index in the collection that is not in our PIT view of the + // MDB catalog. This is only important when background:true because we are then reading + // from the checkpoint's view of the MDB catalog and data. + if (isBackground() && + readyDurableIndexes.find(desc->indexName()) == readyDurableIndexes.end()) { + LOGV2( + 6868901, + "Skipping background validation on the index because the index is not yet in a " + "checkpoint.", + "desc_indexName"_attr = desc->indexName(), + "nss"_attr = _nss); + continue; + } - const IndexCatalog* indexCatalog = _collection->getIndexCatalog(); - // The index iterator for ready indexes is timestamp-aware and will only return indexes that - // are visible at our read time. - const auto it = indexCatalog->getIndexIterator(opCtx, IndexCatalog::InclusionPolicy::kReady); - while (it->more()) { - const IndexCatalogEntry* entry = it->next(); - const IndexDescriptor* desc = entry->descriptor(); + // Read the index's ident from disk (the checkpoint if background:true). If it does not + // match the in-memory ident saved in the IndexCatalogEntry, then our PIT view of the + // index is old and the index has been dropped and recreated. In this case we will skip + // it since there is no utility in checking a dropped index (we also cannot currently + // access it because its in-memory representation is gone). + auto diskIndexIdent = + opCtx->getServiceContext()->getStorageEngine()->getCatalog()->getIndexIdent( + opCtx, _collection->getCatalogId(), desc->indexName()); + if (entry->getIdent() != diskIndexIdent) { + LOGV2(6868902, + "Skipping validation on the index because the index was recreated and is not " + "yet in a checkpoint.", + "desc_indexName"_attr = desc->indexName(), + "nss"_attr = _nss); + continue; + } - auto iam = entry->accessMethod()->asSortedData(); - if (!iam) - continue; + auto iam = entry->accessMethod()->asSortedData(); + if (!iam) + continue; + + _indexCursors.emplace( + desc->indexName(), + std::make_unique<SortedDataInterfaceThrottleCursor>(opCtx, iam, &_dataThrottle)); + + // Skip any newly created indexes that, because they were built with a WT bulk loader, + // are checkpoint'ed but not yet consistent with the rest of checkpoint's PIT view of + // the data. + if (isBackground() && + opCtx->getServiceContext()->getStorageEngine()->isInIndividuallyCheckpointedIndexes( + diskIndexIdent)) { + _indexCursors.erase(desc->indexName()); + LOGV2( + 6868903, + "Skipping background validation on the index because the index data is not yet " + "consistent in the checkpoint.", + "desc_indexName"_attr = desc->indexName(), + "nss"_attr = _nss); + continue; + } - _indexCursors.emplace( - desc->indexName(), - std::make_unique<SortedDataInterfaceThrottleCursor>(opCtx, iam, &_dataThrottle)); + _indexes.push_back(indexCatalog->getEntryShared(desc)); + } + currCheckpointTimestamp = + isBackground() ? storageEngine->getLastStableRecoveryTimestamp() : checkpointTimestamp; + // We will retry if a checkpoint happens during opening the cursors or break out of the loop + // for foreground validation. Due to the limited number of indexes a collection can have, it + // is expected to have at most one retry. + } while (currCheckpointTimestamp != checkpointTimestamp); - _indexes.push_back(indexCatalog->getEntryShared(desc)); + if (rs != RecoveryUnit::ReadSource::kNoTimestamp) { + invariant(rs == RecoveryUnit::ReadSource::kCheckpoint); + invariant(isBackground()); + _validateTs = checkpointTimestamp; } // Because SeekableRecordCursors don't have a method to reset to the start, we save and then diff --git a/src/mongo/db/catalog/validate_state.h b/src/mongo/db/catalog/validate_state.h index fb4d85eb9a3..e28161a22a3 100644 --- a/src/mongo/db/catalog/validate_state.h +++ b/src/mongo/db/catalog/validate_state.h @@ -182,8 +182,7 @@ public: /** * Initializes all the cursors to be used during validation and moves the traversal record - * store cursor to the first record. For background validation, this should be called while - * holding the checkpoint lock when performing a background validation. + * store cursor to the first record. */ void initializeCursors(OperationContext* opCtx); @@ -229,8 +228,7 @@ private: /** * Saves and restores the open cursors to release snapshots and minimize cache pressure for - * validation. For background validation, also refreshes the snapshot by starting a new storage - * transaction. + * validation. */ void _yieldCursors(OperationContext* opCtx); diff --git a/src/mongo/db/catalog/validate_state_test.cpp b/src/mongo/db/catalog/validate_state_test.cpp index a3ccd77ff58..caababd402a 100644 --- a/src/mongo/db/catalog/validate_state_test.cpp +++ b/src/mongo/db/catalog/validate_state_test.cpp @@ -36,6 +36,7 @@ #include "mongo/db/op_observer/op_observer_registry.h" #include "mongo/db/op_observer/oplog_writer_mock.h" #include "mongo/db/repl/replication_coordinator.h" +#include "mongo/db/storage/durable_catalog.h" #include "mongo/db/storage/snapshot_manager.h" #include "mongo/db/storage/wiredtiger/wiredtiger_global_options.h" #include "mongo/util/fail_point.h" @@ -50,7 +51,9 @@ namespace { const NamespaceString kNss = NamespaceString("fooDB.fooColl"); class ValidateStateTest : public CatalogTestFixture { -public: +protected: + ValidateStateTest(Options options = {}) : CatalogTestFixture(std::move(options)) {} + /** * Create collection 'nss'. It will possess a default _id index. */ @@ -65,6 +68,12 @@ private: void setUp() override; }; +// Background validation opens checkpoint cursors which requires reading from the disk. +class ValidateStateDiskTest : public ValidateStateTest { +protected: + ValidateStateDiskTest() : ValidateStateTest(Options{}.ephemeral(false)) {} +}; + void ValidateStateTest::createCollection(OperationContext* opCtx, const NamespaceString& nss) { // Create collection. CollectionOptions defaultCollectionOptions; @@ -170,7 +179,8 @@ TEST_F(ValidateStateTest, NonExistentCollectionShouldThrowNamespaceNotFoundError ErrorCodes::NamespaceNotFound); } -TEST_F(ValidateStateTest, UncheckpointedCollectionShouldBeAbleToInitializeCursors) { +// Validate with {background:true} should fail to find an uncheckpoint'ed collection. +TEST_F(ValidateStateDiskTest, UncheckpointedCollectionShouldThrowCursorNotFoundError) { auto opCtx = operationContext(); // Disable periodic checkpoint'ing thread so we can control when checkpoints occur. @@ -179,16 +189,16 @@ TEST_F(ValidateStateTest, UncheckpointedCollectionShouldBeAbleToInitializeCursor // Checkpoint of all of the data. opCtx->recoveryUnit()->waitUntilUnjournaledWritesDurable(opCtx, /*stableCheckpoint*/ false); + // Create the collection, which will not be in the checkpoint, and check that a CursorNotFound + // error is thrown when attempting to open cursors. createCollectionAndPopulateIt(opCtx, kNss); CollectionValidation::ValidateState validateState( opCtx, kNss, CollectionValidation::ValidateMode::kBackground, CollectionValidation::RepairMode::kNone); - // Assert that cursors are able to created on the new collection. - validateState.initializeCursors(opCtx); - // There should only be a first record id if cursors were initialized successfully. - ASSERT(!validateState.getFirstRecordId().isNull()); + ASSERT_THROWS_CODE( + validateState.initializeCursors(opCtx), AssertionException, ErrorCodes::CursorNotFound); } // Basic test with {background:false} to open cursors against all collection indexes. @@ -234,8 +244,8 @@ TEST_F(ValidateStateTest, OpenCursorsOnAllIndexes) { ASSERT_EQ(validateState.getIndexes().size(), 5); } -// Open cursors against all indexes with {background:true}. -TEST_F(ValidateStateTest, OpenCursorsOnAllIndexesWithBackground) { +// Open cursors against checkpoint'ed indexes with {background:true}. +TEST_F(ValidateStateDiskTest, OpenCursorsOnCheckpointedIndexes) { auto opCtx = operationContext(); createCollectionAndPopulateIt(opCtx, kNss); @@ -259,14 +269,57 @@ TEST_F(ValidateStateTest, OpenCursorsOnAllIndexesWithBackground) { CollectionValidation::RepairMode::kNone); validateState.initializeCursors(opCtx); - // We should be able to open a cursor on each index. - // (Note the _id index was create with collection creation, so we have 5 indexes.) - ASSERT_EQ(validateState.getIndexes().size(), 5); + // Make sure the uncheckpoint'ed indexes are not found. + // (Note the _id index was create with collection creation, so we have 3 indexes.) + ASSERT_EQ(validateState.getIndexes().size(), 3); +} + +// Only open cursors against indexes that are consistent with the rest of the checkpoint'ed data. +TEST_F(ValidateStateDiskTest, OpenCursorsOnConsistentlyCheckpointedIndexes) { + auto opCtx = operationContext(); + createCollectionAndPopulateIt(opCtx, kNss); + + // Disable periodic checkpoint'ing thread so we can control when checkpoints occur. + FailPointEnableBlock failPoint("pauseCheckpointThread"); + + // Create several indexes. + createIndex(opCtx, kNss, BSON("a" << 1)); + createIndex(opCtx, kNss, BSON("b" << 1)); + createIndex(opCtx, kNss, BSON("c" << 1)); + createIndex(opCtx, kNss, BSON("d" << 1)); + + // Checkpoint the indexes. + opCtx->recoveryUnit()->waitUntilUnjournaledWritesDurable(opCtx, /*stableCheckpoint*/ false); + + { + // Artificially set two indexes as inconsistent with the checkpoint. + AutoGetCollection autoColl(opCtx, kNss, MODE_IS); + auto indexIdentA = + opCtx->getServiceContext()->getStorageEngine()->getCatalog()->getIndexIdent( + opCtx, autoColl.getCollection()->getCatalogId(), "a_1"); + auto indexIdentB = + opCtx->getServiceContext()->getStorageEngine()->getCatalog()->getIndexIdent( + opCtx, autoColl.getCollection()->getCatalogId(), "b_1"); + opCtx->getServiceContext()->getStorageEngine()->addIndividuallyCheckpointedIndex( + indexIdentA); + opCtx->getServiceContext()->getStorageEngine()->addIndividuallyCheckpointedIndex( + indexIdentB); + } + + // The two inconsistent indexes should not be found. + // (Note the _id index was create with collection creation, so we have 3 indexes.) + CollectionValidation::ValidateState validateState( + opCtx, + kNss, + CollectionValidation::ValidateMode::kBackground, + CollectionValidation::RepairMode::kNone); + validateState.initializeCursors(opCtx); + ASSERT_EQ(validateState.getIndexes().size(), 3); } // Indexes in the checkpoint that were dropped in the present should not have cursors opened against // them. -TEST_F(ValidateStateTest, CursorsAreNotOpenedAgainstCheckpointedIndexesThatWereLaterDropped) { +TEST_F(ValidateStateDiskTest, CursorsAreNotOpenedAgainstCheckpointedIndexesThatWereLaterDropped) { auto opCtx = operationContext(); createCollectionAndPopulateIt(opCtx, kNss); |