diff options
Diffstat (limited to 'src')
17 files changed, 293 insertions, 101 deletions
diff --git a/src/mongo/db/catalog/collection_validation.cpp b/src/mongo/db/catalog/collection_validation.cpp index 7b39f823443..4645e18aece 100644 --- a/src/mongo/db/catalog/collection_validation.cpp +++ b/src/mongo/db/catalog/collection_validation.cpp @@ -625,6 +625,18 @@ Status validate(OperationContext* opCtx, "corruption found", logAttrs(validateState.nss()), logAttrs(validateState.uuid())); + } catch (ExceptionFor<ErrorCodes::CursorNotFound>&) { + invariant(validateState.isBackground()); + string warning = str::stream() + << "Collection validation with {background: true} validates" + << " the latest checkpoint (data in a snapshot written to disk in a consistent" + << " way across all data files). During this validation, some tables have not yet been" + << " checkpointed."; + results->warnings.push_back(warning); + + // Nothing to validate, so it must be valid. + results->valid = true; + return Status::OK(); } catch (const DBException& e) { if (opCtx->isKillPending() || e.code() == ErrorCodes::Interrupted) { LOGV2_OPTIONS(5160301, diff --git a/src/mongo/db/catalog/collection_validation_test.cpp b/src/mongo/db/catalog/collection_validation_test.cpp index f0cfa7842c8..15f8f475aec 100644 --- a/src/mongo/db/catalog/collection_validation_test.cpp +++ b/src/mongo/db/catalog/collection_validation_test.cpp @@ -45,6 +45,9 @@ namespace { const NamespaceString kNss = NamespaceString("test.t"); class CollectionValidationTest : public CatalogTestFixture { +protected: + CollectionValidationTest(Options options = {}) : CatalogTestFixture(std::move(options)) {} + private: void setUp() override { CatalogTestFixture::setUp(); @@ -56,6 +59,12 @@ private: }; }; +// Background validation opens checkpoint cursors which requires reading from the disk. +class CollectionValidationDiskTest : public CollectionValidationTest { +protected: + CollectionValidationDiskTest() : CollectionValidationTest(Options{}.ephemeral(false)) {} +}; + /** * Calls validate on collection kNss with both kValidateFull and kValidateNormal validation levels * and verifies the results. @@ -209,7 +218,7 @@ TEST_F(CollectionValidationTest, ValidateEmpty) { /*numInvalidDocuments*/ 0, /*numErrors*/ 0); } -TEST_F(CollectionValidationTest, BackgroundValidateEmpty) { +TEST_F(CollectionValidationDiskTest, BackgroundValidateEmpty) { backgroundValidate(operationContext(), /*valid*/ true, /*numRecords*/ 0, @@ -227,7 +236,7 @@ TEST_F(CollectionValidationTest, Validate) { /*numInvalidDocuments*/ 0, /*numErrors*/ 0); } -TEST_F(CollectionValidationTest, BackgroundValidate) { +TEST_F(CollectionValidationDiskTest, BackgroundValidate) { auto opCtx = operationContext(); backgroundValidate(opCtx, /*valid*/ true, @@ -246,7 +255,7 @@ TEST_F(CollectionValidationTest, ValidateError) { /*numInvalidDocuments*/ 1, /*numErrors*/ 1); } -TEST_F(CollectionValidationTest, BackgroundValidateError) { +TEST_F(CollectionValidationDiskTest, BackgroundValidateError) { auto opCtx = operationContext(); backgroundValidate(opCtx, /*valid*/ false, @@ -280,7 +289,7 @@ void waitUntilValidateFailpointHasBeenReached() { ASSERT(CollectionValidation::getIsValidationPausedForTest()); } -TEST_F(CollectionValidationTest, BackgroundValidateRunsConcurrentlyWithWrites) { +TEST_F(CollectionValidationDiskTest, BackgroundValidateRunsConcurrentlyWithWrites) { auto opCtx = operationContext(); auto serviceContext = opCtx->getServiceContext(); diff --git a/src/mongo/db/catalog/validate_state.cpp b/src/mongo/db/catalog/validate_state.cpp index 2fb6505ce1b..0f9301c31f8 100644 --- a/src/mongo/db/catalog/validate_state.cpp +++ b/src/mongo/db/catalog/validate_state.cpp @@ -40,6 +40,7 @@ #include "mongo/db/db_raii.h" #include "mongo/db/index/index_access_method.h" #include "mongo/db/operation_context.h" +#include "mongo/db/storage/durable_catalog.h" #include "mongo/logv2/log.h" #include "mongo/util/fail_point.h" @@ -69,7 +70,6 @@ ValidateState::ValidateState(OperationContext* opCtx, // being validated. _noPBWM.emplace(opCtx->lockState()); - _globalLock.emplace(opCtx, MODE_IS); _databaseLock.emplace(opCtx, _nss.db(), MODE_IS); _collectionLock.emplace(opCtx, _nss, MODE_IS); } else { @@ -196,11 +196,6 @@ void ValidateState::_yieldCursors(OperationContext* opCtx) { _traverseRecordStoreCursor->save(); _seekRecordStoreCursor->save(); - if (isBackground() && _validateTs) { - // Reset snapshot to help ameliorate WiredTiger cache pressure. - opCtx->recoveryUnit()->refreshSnapshot(); - } - // Restore all the cursors. for (const auto& indexCursor : _indexCursors) { indexCursor.second->restore(); @@ -218,25 +213,12 @@ void ValidateState::initializeCursors(OperationContext* opCtx) { invariant(!_traverseRecordStoreCursor && !_seekRecordStoreCursor && _indexCursors.size() == 0 && _indexes.size() == 0); - // Background validation (on replica sets) will read from a snapshot opened on the kNoOverlap - // read source, which is the minimum of the last applied and all durable timestamps, instead of - // the latest data. Using the kNoOverlap read source prevents us from having to take the PBWM - // lock, which blocks replication. We cannot solely rely on the all durable timestamp as it can - // be set while we're in the middle of applying a batch on secondary nodes. - // Background validation on standalones uses the kNoTimestamp read source because standalones - // have no timestamps to use for maintaining a consistent snapshot. + // Background validation reads from the last stable checkpoint instead of the latest data. This + // allows concurrent writes to go ahead without interfering with validation's view of the data. RecoveryUnit::ReadSource rs = RecoveryUnit::ReadSource::kNoTimestamp; if (isBackground()) { opCtx->recoveryUnit()->abandonSnapshot(); - // Background validation is expecting to read from the no overlap timestamp, but - // standalones do not support timestamps. Therefore, if this process is currently running as - // a standalone, don't use a timestamp. - - if (repl::ReplicationCoordinator::get(opCtx)->isReplEnabled()) { - rs = RecoveryUnit::ReadSource::kNoOverlap; - } else { - rs = RecoveryUnit::ReadSource::kNoTimestamp; - } + rs = RecoveryUnit::ReadSource::kCheckpoint; opCtx->recoveryUnit()->setTimestampReadSource(rs); } @@ -247,34 +229,111 @@ void ValidateState::initializeCursors(OperationContext* opCtx) { _dataThrottle.turnThrottlingOff(); } - _traverseRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( - opCtx, _collection->getRecordStore(), &_dataThrottle); - _seekRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( - opCtx, _collection->getRecordStore(), &_dataThrottle); + // Capture the checkpointTimestamp before and after opening the cursors. If it has moved, the + // cursors are out of sync. + auto storageEngine = opCtx->getServiceContext()->getStorageEngine(); + boost::optional<Timestamp> checkpointTimestamp = boost::none; + boost::optional<Timestamp> currCheckpointTimestamp = boost::none; + do { + _indexCursors.clear(); + _indexes.clear(); + checkpointTimestamp = storageEngine->getLastStableRecoveryTimestamp(); + StringSet readyDurableIndexes; + try { + _traverseRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( + opCtx, _collection->getRecordStore(), &_dataThrottle); + _seekRecordStoreCursor = std::make_unique<SeekableRecordThrottleCursor>( + opCtx, _collection->getRecordStore(), &_dataThrottle); + DurableCatalog::get(opCtx)->getReadyIndexes( + opCtx, _collection->getCatalogId(), &readyDurableIndexes); + } catch (const ExceptionFor<ErrorCodes::CursorNotFound>& ex) { + invariant(isBackground()); + // End the validation if we can't open a checkpoint cursor on the collection. + LOGV2( + 6868900, + "Skipping background validation because the collection is not yet in a checkpoint", + "nss"_attr = _nss, + "ex"_attr = ex); + throw; + } - if (rs != RecoveryUnit::ReadSource::kNoTimestamp) { - invariant(rs == RecoveryUnit::ReadSource::kNoOverlap); - invariant(isBackground()); - _validateTs = opCtx->recoveryUnit()->getPointInTimeReadTimestamp(opCtx); - } + const IndexCatalog* indexCatalog = _collection->getIndexCatalog(); + // The index iterator for ready indexes is timestamp-aware and will only return indexes that + // are visible at our read time. + const auto it = + indexCatalog->getIndexIterator(opCtx, IndexCatalog::InclusionPolicy::kReady); + while (it->more()) { + const IndexCatalogEntry* entry = it->next(); + const IndexDescriptor* desc = entry->descriptor(); + + // Filter out any in-memory index in the collection that is not in our PIT view of the + // MDB catalog. This is only important when background:true because we are then reading + // from the checkpoint's view of the MDB catalog and data. + if (isBackground() && + readyDurableIndexes.find(desc->indexName()) == readyDurableIndexes.end()) { + LOGV2( + 6868901, + "Skipping background validation on the index because the index is not yet in a " + "checkpoint.", + "desc_indexName"_attr = desc->indexName(), + "nss"_attr = _nss); + continue; + } - const IndexCatalog* indexCatalog = _collection->getIndexCatalog(); - // The index iterator for ready indexes is timestamp-aware and will only return indexes that - // are visible at our read time. - const auto it = indexCatalog->getIndexIterator(opCtx, IndexCatalog::InclusionPolicy::kReady); - while (it->more()) { - const IndexCatalogEntry* entry = it->next(); - const IndexDescriptor* desc = entry->descriptor(); + // Read the index's ident from disk (the checkpoint if background:true). If it does not + // match the in-memory ident saved in the IndexCatalogEntry, then our PIT view of the + // index is old and the index has been dropped and recreated. In this case we will skip + // it since there is no utility in checking a dropped index (we also cannot currently + // access it because its in-memory representation is gone). + auto diskIndexIdent = + opCtx->getServiceContext()->getStorageEngine()->getCatalog()->getIndexIdent( + opCtx, _collection->getCatalogId(), desc->indexName()); + if (entry->getIdent() != diskIndexIdent) { + LOGV2(6868902, + "Skipping validation on the index because the index was recreated and is not " + "yet in a checkpoint.", + "desc_indexName"_attr = desc->indexName(), + "nss"_attr = _nss); + continue; + } - auto iam = entry->accessMethod()->asSortedData(); - if (!iam) - continue; + auto iam = entry->accessMethod()->asSortedData(); + if (!iam) + continue; + + _indexCursors.emplace( + desc->indexName(), + std::make_unique<SortedDataInterfaceThrottleCursor>(opCtx, iam, &_dataThrottle)); + + // Skip any newly created indexes that, because they were built with a WT bulk loader, + // are checkpoint'ed but not yet consistent with the rest of checkpoint's PIT view of + // the data. + if (isBackground() && + opCtx->getServiceContext()->getStorageEngine()->isInIndividuallyCheckpointedIndexes( + diskIndexIdent)) { + _indexCursors.erase(desc->indexName()); + LOGV2( + 6868903, + "Skipping background validation on the index because the index data is not yet " + "consistent in the checkpoint.", + "desc_indexName"_attr = desc->indexName(), + "nss"_attr = _nss); + continue; + } - _indexCursors.emplace( - desc->indexName(), - std::make_unique<SortedDataInterfaceThrottleCursor>(opCtx, iam, &_dataThrottle)); + _indexes.push_back(indexCatalog->getEntryShared(desc)); + } + currCheckpointTimestamp = + isBackground() ? storageEngine->getLastStableRecoveryTimestamp() : checkpointTimestamp; + // We will retry if a checkpoint happens during opening the cursors or break out of the loop + // for foreground validation. Due to the limited number of indexes a collection can have, it + // is expected to have at most one retry. + } while (currCheckpointTimestamp != checkpointTimestamp); - _indexes.push_back(indexCatalog->getEntryShared(desc)); + if (rs != RecoveryUnit::ReadSource::kNoTimestamp) { + invariant(rs == RecoveryUnit::ReadSource::kCheckpoint); + invariant(isBackground()); + _validateTs = checkpointTimestamp; } // Because SeekableRecordCursors don't have a method to reset to the start, we save and then diff --git a/src/mongo/db/catalog/validate_state.h b/src/mongo/db/catalog/validate_state.h index fb4d85eb9a3..e28161a22a3 100644 --- a/src/mongo/db/catalog/validate_state.h +++ b/src/mongo/db/catalog/validate_state.h @@ -182,8 +182,7 @@ public: /** * Initializes all the cursors to be used during validation and moves the traversal record - * store cursor to the first record. For background validation, this should be called while - * holding the checkpoint lock when performing a background validation. + * store cursor to the first record. */ void initializeCursors(OperationContext* opCtx); @@ -229,8 +228,7 @@ private: /** * Saves and restores the open cursors to release snapshots and minimize cache pressure for - * validation. For background validation, also refreshes the snapshot by starting a new storage - * transaction. + * validation. */ void _yieldCursors(OperationContext* opCtx); diff --git a/src/mongo/db/catalog/validate_state_test.cpp b/src/mongo/db/catalog/validate_state_test.cpp index a3ccd77ff58..caababd402a 100644 --- a/src/mongo/db/catalog/validate_state_test.cpp +++ b/src/mongo/db/catalog/validate_state_test.cpp @@ -36,6 +36,7 @@ #include "mongo/db/op_observer/op_observer_registry.h" #include "mongo/db/op_observer/oplog_writer_mock.h" #include "mongo/db/repl/replication_coordinator.h" +#include "mongo/db/storage/durable_catalog.h" #include "mongo/db/storage/snapshot_manager.h" #include "mongo/db/storage/wiredtiger/wiredtiger_global_options.h" #include "mongo/util/fail_point.h" @@ -50,7 +51,9 @@ namespace { const NamespaceString kNss = NamespaceString("fooDB.fooColl"); class ValidateStateTest : public CatalogTestFixture { -public: +protected: + ValidateStateTest(Options options = {}) : CatalogTestFixture(std::move(options)) {} + /** * Create collection 'nss'. It will possess a default _id index. */ @@ -65,6 +68,12 @@ private: void setUp() override; }; +// Background validation opens checkpoint cursors which requires reading from the disk. +class ValidateStateDiskTest : public ValidateStateTest { +protected: + ValidateStateDiskTest() : ValidateStateTest(Options{}.ephemeral(false)) {} +}; + void ValidateStateTest::createCollection(OperationContext* opCtx, const NamespaceString& nss) { // Create collection. CollectionOptions defaultCollectionOptions; @@ -170,7 +179,8 @@ TEST_F(ValidateStateTest, NonExistentCollectionShouldThrowNamespaceNotFoundError ErrorCodes::NamespaceNotFound); } -TEST_F(ValidateStateTest, UncheckpointedCollectionShouldBeAbleToInitializeCursors) { +// Validate with {background:true} should fail to find an uncheckpoint'ed collection. +TEST_F(ValidateStateDiskTest, UncheckpointedCollectionShouldThrowCursorNotFoundError) { auto opCtx = operationContext(); // Disable periodic checkpoint'ing thread so we can control when checkpoints occur. @@ -179,16 +189,16 @@ TEST_F(ValidateStateTest, UncheckpointedCollectionShouldBeAbleToInitializeCursor // Checkpoint of all of the data. opCtx->recoveryUnit()->waitUntilUnjournaledWritesDurable(opCtx, /*stableCheckpoint*/ false); + // Create the collection, which will not be in the checkpoint, and check that a CursorNotFound + // error is thrown when attempting to open cursors. createCollectionAndPopulateIt(opCtx, kNss); CollectionValidation::ValidateState validateState( opCtx, kNss, CollectionValidation::ValidateMode::kBackground, CollectionValidation::RepairMode::kNone); - // Assert that cursors are able to created on the new collection. - validateState.initializeCursors(opCtx); - // There should only be a first record id if cursors were initialized successfully. - ASSERT(!validateState.getFirstRecordId().isNull()); + ASSERT_THROWS_CODE( + validateState.initializeCursors(opCtx), AssertionException, ErrorCodes::CursorNotFound); } // Basic test with {background:false} to open cursors against all collection indexes. @@ -234,8 +244,8 @@ TEST_F(ValidateStateTest, OpenCursorsOnAllIndexes) { ASSERT_EQ(validateState.getIndexes().size(), 5); } -// Open cursors against all indexes with {background:true}. -TEST_F(ValidateStateTest, OpenCursorsOnAllIndexesWithBackground) { +// Open cursors against checkpoint'ed indexes with {background:true}. +TEST_F(ValidateStateDiskTest, OpenCursorsOnCheckpointedIndexes) { auto opCtx = operationContext(); createCollectionAndPopulateIt(opCtx, kNss); @@ -259,14 +269,57 @@ TEST_F(ValidateStateTest, OpenCursorsOnAllIndexesWithBackground) { CollectionValidation::RepairMode::kNone); validateState.initializeCursors(opCtx); - // We should be able to open a cursor on each index. - // (Note the _id index was create with collection creation, so we have 5 indexes.) - ASSERT_EQ(validateState.getIndexes().size(), 5); + // Make sure the uncheckpoint'ed indexes are not found. + // (Note the _id index was create with collection creation, so we have 3 indexes.) + ASSERT_EQ(validateState.getIndexes().size(), 3); +} + +// Only open cursors against indexes that are consistent with the rest of the checkpoint'ed data. +TEST_F(ValidateStateDiskTest, OpenCursorsOnConsistentlyCheckpointedIndexes) { + auto opCtx = operationContext(); + createCollectionAndPopulateIt(opCtx, kNss); + + // Disable periodic checkpoint'ing thread so we can control when checkpoints occur. + FailPointEnableBlock failPoint("pauseCheckpointThread"); + + // Create several indexes. + createIndex(opCtx, kNss, BSON("a" << 1)); + createIndex(opCtx, kNss, BSON("b" << 1)); + createIndex(opCtx, kNss, BSON("c" << 1)); + createIndex(opCtx, kNss, BSON("d" << 1)); + + // Checkpoint the indexes. + opCtx->recoveryUnit()->waitUntilUnjournaledWritesDurable(opCtx, /*stableCheckpoint*/ false); + + { + // Artificially set two indexes as inconsistent with the checkpoint. + AutoGetCollection autoColl(opCtx, kNss, MODE_IS); + auto indexIdentA = + opCtx->getServiceContext()->getStorageEngine()->getCatalog()->getIndexIdent( + opCtx, autoColl.getCollection()->getCatalogId(), "a_1"); + auto indexIdentB = + opCtx->getServiceContext()->getStorageEngine()->getCatalog()->getIndexIdent( + opCtx, autoColl.getCollection()->getCatalogId(), "b_1"); + opCtx->getServiceContext()->getStorageEngine()->addIndividuallyCheckpointedIndex( + indexIdentA); + opCtx->getServiceContext()->getStorageEngine()->addIndividuallyCheckpointedIndex( + indexIdentB); + } + + // The two inconsistent indexes should not be found. + // (Note the _id index was create with collection creation, so we have 3 indexes.) + CollectionValidation::ValidateState validateState( + opCtx, + kNss, + CollectionValidation::ValidateMode::kBackground, + CollectionValidation::RepairMode::kNone); + validateState.initializeCursors(opCtx); + ASSERT_EQ(validateState.getIndexes().size(), 3); } // Indexes in the checkpoint that were dropped in the present should not have cursors opened against // them. -TEST_F(ValidateStateTest, CursorsAreNotOpenedAgainstCheckpointedIndexesThatWereLaterDropped) { +TEST_F(ValidateStateDiskTest, CursorsAreNotOpenedAgainstCheckpointedIndexesThatWereLaterDropped) { auto opCtx = operationContext(); createCollectionAndPopulateIt(opCtx, kNss); diff --git a/src/mongo/db/storage/durable_catalog.h b/src/mongo/db/storage/durable_catalog.h index 1a622c9284a..bb538107aa3 100644 --- a/src/mongo/db/storage/durable_catalog.h +++ b/src/mongo/db/storage/durable_catalog.h @@ -211,6 +211,10 @@ public: virtual int getTotalIndexCount(OperationContext* opCtx, const RecordId& catalogId) const = 0; + virtual void getReadyIndexes(OperationContext* opCtx, + RecordId catalogId, + StringSet* names) const = 0; + virtual bool isIndexPresent(OperationContext* opCtx, const RecordId& catalogId, StringData indexName) const = 0; diff --git a/src/mongo/db/storage/durable_catalog_impl.cpp b/src/mongo/db/storage/durable_catalog_impl.cpp index 44459abe6b1..dbf6758d920 100644 --- a/src/mongo/db/storage/durable_catalog_impl.cpp +++ b/src/mongo/db/storage/durable_catalog_impl.cpp @@ -824,6 +824,21 @@ int DurableCatalogImpl::getTotalIndexCount(OperationContext* opCtx, return md->getTotalIndexCount(); } +void DurableCatalogImpl::getReadyIndexes(OperationContext* opCtx, + RecordId catalogId, + StringSet* names) const { + auto md = getMetaData(opCtx, catalogId); + + if (!md) { + return; + } + + for (const auto& index : md->indexes) { + if (index.ready) + names->insert(index.spec["name"].String()); + } +} + bool DurableCatalogImpl::isIndexPresent(OperationContext* opCtx, const RecordId& catalogId, StringData indexName) const { diff --git a/src/mongo/db/storage/durable_catalog_impl.h b/src/mongo/db/storage/durable_catalog_impl.h index ecd29d74307..97819879bb6 100644 --- a/src/mongo/db/storage/durable_catalog_impl.h +++ b/src/mongo/db/storage/durable_catalog_impl.h @@ -144,6 +144,8 @@ public: int getTotalIndexCount(OperationContext* opCtx, const RecordId& catalogId) const; + void getReadyIndexes(OperationContext* opCtx, RecordId catalogId, StringSet* names) const; + bool isIndexPresent(OperationContext* opCtx, const RecordId& catalogId, StringData indexName) const; diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h index 8135aea2d22..9464bab4fb1 100644 --- a/src/mongo/db/storage/recovery_unit.h +++ b/src/mongo/db/storage/recovery_unit.h @@ -425,11 +425,6 @@ public: } /** - * Refreshes a read transaction by resetting the snapshot in use - */ - virtual void refreshSnapshot() {} - - /** * The ReadSource indicates which external or provided timestamp to read from for future * transactions. */ diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_cursor.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_cursor.cpp index a1c6924a4cd..49346bfc671 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_cursor.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_cursor.cpp @@ -93,7 +93,12 @@ WiredTigerCursor::WiredTigerCursor(const std::string& uri, } WiredTigerCursor::~WiredTigerCursor() { - _session->releaseCursor(_tableID, _cursor, _config); + if (_isCheckpoint) { + // Closes the checkpoint cursor to avoid outdated data view when opening a new one. + _session->closeCursor(_cursor); + } else { + _session->releaseCursor(_tableID, _cursor, _config); + } } void WiredTigerCursor::reset() { diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp index 8eed8b14701..ece7c56bd4f 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.cpp @@ -2013,12 +2013,12 @@ void WiredTigerKVEngine::_checkpoint(WT_SESSION* session) { // Third, stableTimestamp >= initialDataTimestamp: Take stable checkpoint. Steady state // case. if (initialDataTimestamp.asULL() <= 1) { - clearIndividuallyCheckpointedIndexes(); invariantWTOK(session->checkpoint(session, "use_timestamp=false"), session); LOGV2_FOR_RECOVERY(5576602, 2, "Completed unstable checkpoint.", "initialDataTimestamp"_attr = initialDataTimestamp.toString()); + clearIndividuallyCheckpointedIndexes(); } else if (stableTimestamp < initialDataTimestamp) { LOGV2_FOR_RECOVERY( 23985, @@ -2036,8 +2036,8 @@ void WiredTigerKVEngine::_checkpoint(WT_SESSION* session) { "oplogNeededForRollback"_attr = toString(oplogNeededForRollback)); { - clearIndividuallyCheckpointedIndexes(); invariantWTOK(session->checkpoint(session, "use_timestamp=true"), session); + clearIndividuallyCheckpointedIndexes(); } if (oplogNeededForRollback.isOK()) { diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h index 10672661629..11d79930fd3 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_kv_engine.h @@ -379,14 +379,17 @@ public: } void addIndividuallyCheckpointedIndex(const std::string& ident) override { + stdx::lock_guard lk(_checkpointedIndexesMutex); _checkpointedIndexes.insert(ident); } void clearIndividuallyCheckpointedIndexes() override { + stdx::lock_guard lk(_checkpointedIndexesMutex); _checkpointedIndexes.clear(); } bool isInIndividuallyCheckpointedIndexes(const std::string& ident) const override { + stdx::lock_guard lk(_checkpointedIndexesMutex); return _checkpointedIndexes.find(ident) != _checkpointedIndexes.end(); } @@ -533,14 +536,14 @@ private: // timestamp. Provided by replication layer because WT does not persist timestamps. AtomicWord<std::uint64_t> _initialDataTimestamp; - // Required for taking a checkpoint; and can be used to ensure multiple checkpoint cursors - // target the same checkpoint. - Lock::ResourceMutex _checkpointCursorMutex = Lock::ResourceMutex("checkpointCursorMutex"); + // Required for accessing '_checkpointedIndexes'. + mutable Mutex _checkpointedIndexesMutex = + MONGO_MAKE_LATCH("WiredTigerKVEngine::_checkpointedIndexesMutex"); // A set of indexes that were individually checkpoint'ed and are not consistent with the rest // of the checkpoint's PIT view of the storage data. This set is reset when a storage-wide WT // checkpoint is taken that makes the PIT view consistent again. - std::set<std::string> _checkpointedIndexes; + StringSet _checkpointedIndexes; AtomicWord<std::uint64_t> _oplogNeededForCrashRecovery; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp index 43b88dec64c..1513acbe023 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.cpp @@ -377,21 +377,6 @@ void WiredTigerRecoveryUnit::preallocateSnapshotForOplogRead() { preallocateSnapshot(); } -void WiredTigerRecoveryUnit::refreshSnapshot() { - // Currently, this code only works for kNoOverlap or kNoTimestamp. - invariant(_timestampReadSource == ReadSource::kNoOverlap || - _timestampReadSource == ReadSource::kNoTimestamp); - invariant(_isActive()); - invariant(!_inUnitOfWork()); - invariant(!_noEvictionAfterRollback); - invariant(_abandonSnapshotMode == AbandonSnapshotMode::kAbort); - - auto session = _session->getSession(); - invariantWTOK(session->reset_snapshot(session), session); - LOGV2_DEBUG( - 6235000, 3, "WT refreshed snapshot", "snapshotId"_attr = getSnapshotId().toNumber()); -} - void WiredTigerRecoveryUnit::_txnClose(bool commit) { invariant(_isActive(), toString(_getState())); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h index 5397b91fb91..26102686dbd 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h @@ -181,8 +181,6 @@ public: std::shared_ptr<StorageStats> getOperationStatistics() const override; - void refreshSnapshot() override; - void ignoreAllMultiTimestampConstraints() { _multiTimestampConstraintTracker.ignoreAllMultiTimestampConstraints = true; } diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp index d6bf0e62721..9757ca278a2 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit_test.cpp @@ -623,7 +623,7 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, CommitTimestampAfterSetTimestampOnAbor ASSERT(!commitTs); } -TEST_F(WiredTigerRecoveryUnitTestFixture, CheckpointCursorsCached) { +TEST_F(WiredTigerRecoveryUnitTestFixture, CheckpointCursorsAreNotCached) { auto opCtx = clientAndCtx1.second.get(); // Hold the global lock throughout the test to avoid having the global lock destructor @@ -663,7 +663,8 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, CheckpointCursorsCached) { // Force a checkpoint. engine->flushAllFiles(opCtx, /*callerHoldsReadLock*/ false); - // Test 2: Checkpoint cursors will be released into the cache. + // Test 2: Checkpoint cursors are not expected to be cached, they + // should be immediately closed when destructed. ru->setTimestampReadSource(WiredTigerRecoveryUnit::ReadSource::kCheckpoint); // Close any cached cursors to establish a new 'before' state. @@ -673,9 +674,9 @@ TEST_F(WiredTigerRecoveryUnitTestFixture, CheckpointCursorsCached) { // Will search the checkpoint cursor for the record, then release the checkpoint cursor. ASSERT_TRUE(rs->findRecord(opCtx, s.getValue(), &rd)); - // A new cursor should have been released into the cache, along with a metadata cursor that is - // opened to determine if the table is LSM. Metadata cursors are cached. - ASSERT_GT(ru->getSession()->cachedCursors(), cachedCursorsBefore + 1); + // No new cursors should have been released into the cache, with the exception of a metadata + // cursor that is opened to determine if the table is LSM. Metadata cursors are cached. + ASSERT_EQ(ru->getSession()->cachedCursors(), cachedCursorsBefore + 1); // All opened cursors are closed. ASSERT_EQ(0, ru->getSession()->cursorsOut()); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp index a79e5494cdc..a8eb73f355b 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_session_cache.cpp @@ -311,8 +311,8 @@ void WiredTigerSessionCache::waitUntilDurable(OperationContext* opCtx, auto config = syncType == Fsync::kCheckpointStableTimestamp ? "use_timestamp=true" : "use_timestamp=false"; { - _engine->clearIndividuallyCheckpointedIndexes(); invariantWTOK(s->checkpoint(s, config), s); + _engine->clearIndividuallyCheckpointedIndexes(); } if (token) { @@ -365,10 +365,10 @@ void WiredTigerSessionCache::waitUntilDurable(OperationContext* opCtx, _waitUntilDurableSession); LOGV2_DEBUG(22419, 4, "flushed journal"); } else { - _engine->clearIndividuallyCheckpointedIndexes(); invariantWTOK(_waitUntilDurableSession->checkpoint(_waitUntilDurableSession, nullptr), _waitUntilDurableSession); LOGV2_DEBUG(22420, 4, "created checkpoint"); + _engine->clearIndividuallyCheckpointedIndexes(); } if (token) { diff --git a/src/mongo/dbtests/validate_tests.cpp b/src/mongo/dbtests/validate_tests.cpp index 4674a21de29..20cbdae9fa7 100644 --- a/src/mongo/dbtests/validate_tests.cpp +++ b/src/mongo/dbtests/validate_tests.cpp @@ -122,6 +122,13 @@ public: } protected: + void forceCheckpoint(bool background) { + if (background) { + // Checkpoint all of the data. + _opCtx.getServiceContext()->getStorageEngine()->checkpoint(); + } + } + ValidateResults runValidate() { // validate() will set a kCheckpoint read source. Callers continue to do operations after // running validate, so we must reset the read source back to normal before returning. @@ -141,6 +148,7 @@ protected: ValidateResults results; BSONObjBuilder output; + forceCheckpoint(_background); ASSERT_OK(CollectionValidation::validate( &_opCtx, _nss, mode, repairMode, &results, &output, kTurnOnExtraLoggingForTest)); @@ -3357,6 +3365,15 @@ public: ValidateResults results; BSONObjBuilder output; + // validate() will set a kCheckpoint read source. Callers continue to do operations + // after running validate, so we must reset the read source back to normal before + // returning. + auto originalReadSource = _opCtx.recoveryUnit()->getTimestampReadSource(); + ON_BLOCK_EXIT([&] { + _opCtx.recoveryUnit()->abandonSnapshot(); + _opCtx.recoveryUnit()->setTimestampReadSource(originalReadSource); + }); + forceCheckpoint(_background); ASSERT_OK(CollectionValidation::validate(&_opCtx, _nss, mode, @@ -4159,6 +4176,15 @@ public: ValidateResults results; BSONObjBuilder output; + // validate() will set a kCheckpoint read source. Callers continue to do operations + // after running validate, so we must reset the read source back to normal before + // returning. + auto originalReadSource = _opCtx.recoveryUnit()->getTimestampReadSource(); + ON_BLOCK_EXIT([&] { + _opCtx.recoveryUnit()->abandonSnapshot(); + _opCtx.recoveryUnit()->setTimestampReadSource(originalReadSource); + }); + forceCheckpoint(_background); ASSERT_OK(CollectionValidation::validate(&_opCtx, _nss, mode, @@ -4264,6 +4290,15 @@ public: ValidateResults results; BSONObjBuilder output; + // validate() will set a kCheckpoint read source. Callers continue to do operations + // after running validate, so we must reset the read source back to normal before + // returning. + auto originalReadSource = _opCtx.recoveryUnit()->getTimestampReadSource(); + ON_BLOCK_EXIT([&] { + _opCtx.recoveryUnit()->abandonSnapshot(); + _opCtx.recoveryUnit()->setTimestampReadSource(originalReadSource); + }); + forceCheckpoint(_background); ASSERT_OK(CollectionValidation::validate(&_opCtx, _nss, mode, @@ -4366,6 +4401,15 @@ public: ValidateResults results; BSONObjBuilder output; + // validate() will set a kCheckpoint read source. Callers continue to do operations + // after running validate, so we must reset the read source back to normal before + // returning. + auto originalReadSource = _opCtx.recoveryUnit()->getTimestampReadSource(); + ON_BLOCK_EXIT([&] { + _opCtx.recoveryUnit()->abandonSnapshot(); + _opCtx.recoveryUnit()->setTimestampReadSource(originalReadSource); + }); + forceCheckpoint(_background); ASSERT_OK(CollectionValidation::validate(&_opCtx, _nss, mode, @@ -4397,6 +4441,15 @@ public: ValidateResults results; BSONObjBuilder output; + // validate() will set a kCheckpoint read source. Callers continue to do operations + // after running validate, so we must reset the read source back to normal before + // returning. + auto originalReadSource = _opCtx.recoveryUnit()->getTimestampReadSource(); + ON_BLOCK_EXIT([&] { + _opCtx.recoveryUnit()->abandonSnapshot(); + _opCtx.recoveryUnit()->setTimestampReadSource(originalReadSource); + }); + forceCheckpoint(_background); ASSERT_OK(CollectionValidation::validate(&_opCtx, _nss, mode, |