diff options
author | Daniel Gottlieb <daniel.gottlieb@mongodb.com> | 2022-03-07 15:47:49 -0500 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-03-09 22:29:05 +0000 |
commit | bc8142405dec99dd24cb87b2f61570c600d6f0d4 (patch) | |
tree | ca55b37a1228b248a4412f6fd791a9bedb90a6f9 /src/mongo/db | |
parent | 44ad2b3406535b927e0f968a1a4a0a022a6dbcb1 (diff) | |
download | mongo-bc8142405dec99dd24cb87b2f61570c600d6f0d4.tar.gz |
SERVER-60754: Ensure primaries write multikey paths in timestamp order.
Diffstat (limited to 'src/mongo/db')
-rw-r--r-- | src/mongo/db/catalog/index_catalog_impl.cpp | 62 | ||||
-rw-r--r-- | src/mongo/db/repl/storage_timestamp_test.cpp | 201 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp | 11 |
3 files changed, 157 insertions, 117 deletions
diff --git a/src/mongo/db/catalog/index_catalog_impl.cpp b/src/mongo/db/catalog/index_catalog_impl.cpp index 4315c583234..115c664d0fc 100644 --- a/src/mongo/db/catalog/index_catalog_impl.cpp +++ b/src/mongo/db/catalog/index_catalog_impl.cpp @@ -59,6 +59,7 @@ #include "mongo/db/jsobj.h" #include "mongo/db/keypattern.h" #include "mongo/db/matcher/expression.h" +#include "mongo/db/multi_key_path_tracker.h" #include "mongo/db/operation_context.h" #include "mongo/db/ops/delete.h" #include "mongo/db/query/collation/collation_spec.h" @@ -1601,16 +1602,61 @@ Status IndexCatalogImpl::indexRecords(OperationContext* opCtx, *keysInsertedOut = 0; } - for (auto&& it : _readyIndexes) { - Status s = _indexRecords(opCtx, coll, it.get(), bsonRecords, keysInsertedOut); - if (!s.isOK()) - return s; + // For vectored inserts, we insert index keys and flip multikey in "index order". However + // because multikey state for different indexes both live on the same _mdb_catalog document, + // index order isn't necessarily timestamp order. We track multikey paths here to ensure we make + // changes to the _mdb_catalog document with in timestamp order updates. + MultikeyPathTracker& tracker = MultikeyPathTracker::get(opCtx); + + // Take care when choosing to aggregate multikey writes. This code will only* track multikey + // when: + // * No parent is tracking multikey and* + // * There are timestamps associated with the input `bsonRecords`. + // + // If we are not responsible for tracking multikey: + // * Leave the multikey tracker in its original "tracking" state. + // * Not write any accumulated multikey paths to the _mdb_catalog document. + const bool manageMultikeyWrite = + !tracker.isTrackingMultikeyPathInfo() && !bsonRecords[0].ts.isNull(); + { + ScopeGuard stopTrackingMultikeyChanges( + [&tracker] { tracker.stopTrackingMultikeyPathInfo(); }); + if (manageMultikeyWrite) { + tracker.startTrackingMultikeyPathInfo(); + } else { + stopTrackingMultikeyChanges.dismiss(); + } + for (auto&& it : _readyIndexes) { + Status s = _indexRecords(opCtx, coll, it.get(), bsonRecords, keysInsertedOut); + if (!s.isOK()) + return s; + } + + for (auto&& it : _buildingIndexes) { + Status s = _indexRecords(opCtx, coll, it.get(), bsonRecords, keysInsertedOut); + if (!s.isOK()) + return s; + } } - for (auto&& it : _buildingIndexes) { - Status s = _indexRecords(opCtx, coll, it.get(), bsonRecords, keysInsertedOut); - if (!s.isOK()) - return s; + const WorkerMultikeyPathInfo& newPaths = tracker.getMultikeyPathInfo(); + if (newPaths.size() == 0 || !manageMultikeyWrite) { + return Status::OK(); + } + + if (Status status = opCtx->recoveryUnit()->setTimestamp(bsonRecords[0].ts); !status.isOK()) { + return status; + } + + for (const MultikeyPathInfo& newPath : newPaths) { + auto idx = findIndexByName(opCtx, newPath.indexName, /*includeUnfinishedIndexes=*/true); + if (!idx) { + return Status(ErrorCodes::IndexNotFound, + str::stream() + << "Could not find index " << newPath.indexName << " in " + << coll->ns() << " (" << coll->uuid() << ") to set to multikey."); + } + setMultikeyPaths(opCtx, coll, idx, newPath.multikeyMetadataKeys, newPath.multikeyPaths); } return Status::OK(); diff --git a/src/mongo/db/repl/storage_timestamp_test.cpp b/src/mongo/db/repl/storage_timestamp_test.cpp index f5f0abafa5a..b128b2bad20 100644 --- a/src/mongo/db/repl/storage_timestamp_test.cpp +++ b/src/mongo/db/repl/storage_timestamp_test.cpp @@ -102,7 +102,10 @@ namespace mongo { namespace { -Status createIndexFromSpec(OperationContext* opCtx, StringData ns, const BSONObj& spec) { +Status createIndexFromSpec(OperationContext* opCtx, + VectorClockMutable* clock, + StringData ns, + const BSONObj& spec) { AutoGetDb autoDb(opCtx, nsToDatabaseSubstring(ns), MODE_X); Collection* coll; { @@ -125,9 +128,10 @@ Status createIndexFromSpec(OperationContext* opCtx, StringData ns, const BSONObj .init(opCtx, collection, spec, - [opCtx](const std::vector<BSONObj>& specs) -> Status { + [opCtx, clock](const std::vector<BSONObj>& specs) -> Status { if (opCtx->recoveryUnit()->getCommitTimestamp().isNull()) { - return opCtx->recoveryUnit()->setTimestamp(Timestamp(1, 1)); + return opCtx->recoveryUnit()->setTimestamp( + clock->tickClusterTime(1).asTimestamp()); } return Status::OK(); }) @@ -153,7 +157,8 @@ Status createIndexFromSpec(OperationContext* opCtx, StringData ns, const BSONObj WriteUnitOfWork wunit(opCtx); ASSERT_OK(indexer.commit( opCtx, coll, MultiIndexBlock::kNoopOnCreateEachFn, MultiIndexBlock::kNoopOnCommitFn)); - ASSERT_OK(opCtx->recoveryUnit()->setTimestamp(Timestamp(1, 1))); + LogicalTime indexTs = clock->tickClusterTime(1); + ASSERT_OK(opCtx->recoveryUnit()->setTimestamp(indexTs.asTimestamp())); wunit.commit(); abortOnExit.dismiss(); return Status::OK(); @@ -473,6 +478,13 @@ public: return queryCollection(NamespaceString::kRsOplogNamespace, query); } + Timestamp getTopOfOplog() { + OneOffRead oor(_opCtx, Timestamp::min()); + BSONObj ret; + ASSERT_TRUE(Helpers::getLast(_opCtx, NamespaceString::kRsOplogNamespace.ns().c_str(), ret)); + return ret["ts"].timestamp(); + } + void assertMinValidDocumentAtTimestamp(const NamespaceString& nss, const Timestamp& ts, const repl::MinValidDocument& expectedDoc) { @@ -1348,7 +1360,7 @@ TEST_F(StorageTimestampTest, SecondarySetIndexMultikeyOnInsert) { auto indexName = "a_1"; auto indexSpec = BSON("name" << indexName << "key" << BSON("a" << 1) << "v" << static_cast<int>(kIndexVersion)); - ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec)); + ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec)); _coordinatorMock->alwaysAllowWrites(false); @@ -1420,7 +1432,7 @@ TEST_F(StorageTimestampTest, SecondarySetWildcardIndexMultikeyOnInsert) { auto indexName = "a_1"; auto indexSpec = BSON("name" << indexName << "key" << BSON("$**" << 1) << "v" << static_cast<int>(kIndexVersion)); - ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec)); + ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec)); _coordinatorMock->alwaysAllowWrites(false); @@ -1516,7 +1528,7 @@ TEST_F(StorageTimestampTest, SecondarySetWildcardIndexMultikeyOnUpdate) { auto indexName = "a_1"; auto indexSpec = BSON("name" << indexName << "key" << BSON("$**" << 1) << "v" << static_cast<int>(kIndexVersion)); - ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec)); + ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec)); _coordinatorMock->alwaysAllowWrites(false); @@ -1600,99 +1612,6 @@ TEST_F(StorageTimestampTest, SecondarySetWildcardIndexMultikeyOnUpdate) { } } -TEST_F(StorageTimestampTest, InitialSyncSetIndexMultikeyOnInsert) { - // Pretend to be a secondary. - repl::UnreplicatedWritesBlock uwb(_opCtx); - - NamespaceString nss("unittests.InitialSyncSetIndexMultikeyOnInsert"); - create(nss); - UUID uuid = UUID::gen(); - { - AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX); - uuid = autoColl.getCollection()->uuid(); - } - auto indexName = "a_1"; - auto indexSpec = BSON("name" << indexName << "key" << BSON("a" << 1) << "v" - << static_cast<int>(kIndexVersion)); - ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec)); - - _coordinatorMock->alwaysAllowWrites(false); - ASSERT_OK(_coordinatorMock->setFollowerMode({repl::MemberState::MS::RS_STARTUP2})); - - const LogicalTime pastTime = _clock->tickClusterTime(1); - const LogicalTime insertTime0 = _clock->tickClusterTime(1); - const LogicalTime indexBuildTime = _clock->tickClusterTime(1); - const LogicalTime insertTime1 = _clock->tickClusterTime(1); - const LogicalTime insertTime2 = _clock->tickClusterTime(1); - - BSONObj doc0 = BSON("_id" << 0 << "a" << 3); - BSONObj doc1 = BSON("_id" << 1 << "a" << BSON_ARRAY(1 << 2)); - BSONObj doc2 = BSON("_id" << 2 << "a" << BSON_ARRAY(1 << 2)); - auto op0 = repl::OplogEntry( - BSON("ts" << insertTime0.asTimestamp() << "t" << 1LL << "v" << 2 << "op" - << "i" - << "ns" << nss.ns() << "ui" << uuid << "wall" << Date_t() << "o" << doc0)); - auto op1 = repl::OplogEntry( - BSON("ts" << insertTime1.asTimestamp() << "t" << 1LL << "v" << 2 << "op" - << "i" - << "ns" << nss.ns() << "ui" << uuid << "wall" << Date_t() << "o" << doc1)); - auto op2 = repl::OplogEntry( - BSON("ts" << insertTime2.asTimestamp() << "t" << 1LL << "v" << 2 << "op" - << "i" - << "ns" << nss.ns() << "ui" << uuid << "wall" << Date_t() << "o" << doc2)); - auto indexSpec2 = BSON("createIndexes" << nss.coll() << "v" << static_cast<int>(kIndexVersion) - << "key" << BSON("b" << 1) << "name" - << "b_1"); - auto createIndexOp = - repl::OplogEntry(BSON("ts" << indexBuildTime.asTimestamp() << "t" << 1LL << "v" << 2 << "op" - << "c" - << "ns" << nss.getCommandNS().ns() << "ui" << uuid << "wall" - << Date_t() << "o" << indexSpec2)); - - // We add in an index creation op to test that we restart tracking multikey path info - // after bulk index builds. - std::vector<repl::OplogEntry> ops = {op0, createIndexOp, op1, op2}; - - DoNothingOplogApplierObserver observer; - auto storageInterface = repl::StorageInterface::get(_opCtx); - auto writerPool = repl::makeReplWriterPool(); - - repl::OplogApplierImpl oplogApplier( - nullptr, // task executor. not required for applyOplogBatch(). - nullptr, // oplog buffer. not required for applyOplogBatch(). - &observer, - _coordinatorMock, - _consistencyMarkers, - storageInterface, - repl::OplogApplier::Options(repl::OplogApplication::Mode::kInitialSync), - writerPool.get()); - auto lastTime = unittest::assertGet(oplogApplier.applyOplogBatch(_opCtx, ops)); - ASSERT_EQ(lastTime.getTimestamp(), insertTime2.asTimestamp()); - - // Wait for the index build to finish before making any assertions. - IndexBuildsCoordinator::get(_opCtx)->awaitNoIndexBuildInProgressForCollection(_opCtx, uuid); - - AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX); - - // Ensure minimumVisible has not been updated due to the index creation. - ASSERT_LT(autoColl.getCollection()->getMinimumVisibleSnapshot().get(), pastTime.asTimestamp()); - - // Reading the multikey state before 'insertTime0' is not valid or reliable to test. If the - // background index build intercepts and drains writes during inital sync, the index write - // and the write to the multikey path state will not be timestamped. This write is not - // timestamped because the lastApplied timestamp, which would normally be used on a primary - // or secondary, is not always available during initial sync. - // Additionally, it is not valid to read at a timestamp before inital sync completes, so - // these assertions below only make sense in the context of this unit test, but would - // otherwise not be exercised in any normal scenario. - assertMultikeyPaths( - _opCtx, autoColl.getCollection(), indexName, insertTime0.asTimestamp(), true, {{0}}); - assertMultikeyPaths( - _opCtx, autoColl.getCollection(), indexName, insertTime1.asTimestamp(), true, {{0}}); - assertMultikeyPaths( - _opCtx, autoColl.getCollection(), indexName, insertTime2.asTimestamp(), true, {{0}}); -} - TEST_F(StorageTimestampTest, PrimarySetIndexMultikeyOnInsert) { NamespaceString nss("unittests.PrimarySetIndexMultikeyOnInsert"); create(nss); @@ -1701,7 +1620,7 @@ TEST_F(StorageTimestampTest, PrimarySetIndexMultikeyOnInsert) { auto indexName = "a_1"; auto indexSpec = BSON("name" << indexName << "key" << BSON("a" << 1) << "v" << static_cast<int>(kIndexVersion)); - ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec)); + ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec)); const LogicalTime pastTime = _clock->tickClusterTime(1); const LogicalTime insertTime = pastTime.addTicks(1); @@ -1726,7 +1645,7 @@ TEST_F(StorageTimestampTest, PrimarySetIndexMultikeyOnInsertUnreplicated) { auto indexName = "a_1"; auto indexSpec = BSON("name" << indexName << "key" << BSON("a" << 1) << "v" << static_cast<int>(kIndexVersion)); - ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec)); + ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec)); const LogicalTime pastTime = _clock->tickClusterTime(1); const LogicalTime insertTime = pastTime.addTicks(1); @@ -1758,7 +1677,7 @@ TEST_F(StorageTimestampTest, PrimarySetsMultikeyInsideMultiDocumentTransaction) { AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX); - ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec)); + ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec)); } const auto currentTime = _clock->getTime(); @@ -3185,6 +3104,82 @@ TEST_F(StorageTimestampTest, CreateCollectionWithSystemIndex) { assertIdentsExistAtTimestamp(durableCatalog, "", indexIdent, _nullTs); } +TEST_F(StorageTimestampTest, MultipleTimestampsForMultikeyWrites) { + auto storageEngine = _opCtx->getServiceContext()->getStorageEngine(); + auto durableCatalog = storageEngine->getCatalog(); + RecordId catalogId; + + // Create config.system.indexBuilds collection to store commit quorum value during index + // building. + ASSERT_OK(repl::StorageInterface::get(_opCtx)->dropCollection( + _opCtx, NamespaceString::kIndexBuildEntryNamespace)); + ASSERT_OK( + createCollection(_opCtx, + NamespaceString::kIndexBuildEntryNamespace.db().toString(), + BSON("create" << NamespaceString::kIndexBuildEntryNamespace.coll()))); + + NamespaceString nss("unittests.timestampVectoredInsertMultikey"); + create(nss); + + { + AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX); + + WriteUnitOfWork wuow(_opCtx); + insertDocument( + autoColl.getCollection(), + InsertStatement(BSON("_id" << 0 << "a" << 1 << "b" << 2), Timestamp(), _presentTerm)); + wuow.commit(); + ASSERT_EQ(1, itCount(autoColl.getCollection())); + catalogId = autoColl.getCollection()->getCatalogId(); + } + + DBDirectClient client(_opCtx); + { + auto index1 = BSON("v" << kIndexVersion << "key" << BSON("a" << 1) << "name" + << "a_1"); + auto index2 = BSON("v" << kIndexVersion << "key" << BSON("b" << 1) << "name" + << "b_1"); + // Disable index build commit quorum as we don't have support of replication subsystem for + // voting. + auto createIndexesCmdObj = + BSON("createIndexes" << nss.coll() << "indexes" << BSON_ARRAY(index1 << index2) + << "commitQuorum" << 0); + BSONObj result; + ASSERT(client.runCommand(nss.db().toString(), createIndexesCmdObj, result)) << result; + } + + AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX); + Timestamp tsBeforeMultikeyWrites = getTopOfOplog(); + { + const LogicalTime firstInsertTime = _clock->tickClusterTime(2); + + std::vector<InsertStatement> vectoredInsert; + vectoredInsert.emplace_back(BSON("_id" << 1 << "b" << BSON_ARRAY(1 << 2)), + firstInsertTime.asTimestamp(), + _presentTerm); + vectoredInsert.emplace_back(BSON("_id" << 2 << "a" << BSON_ARRAY(1 << 2)), + firstInsertTime.addTicks(1).asTimestamp(), + _presentTerm); + + WriteUnitOfWork wuow(_opCtx); + ASSERT_OK(autoColl.getCollection()->insertDocuments( + _opCtx, vectoredInsert.begin(), vectoredInsert.end(), nullptr, false)); + wuow.commit(); + } + + // By virtue of not crashing due to WT observing an out of order write, this test has + // succeeded. For completeness, check the index metadata. + std::shared_ptr<BSONCollectionCatalogEntry::MetaData> mdBeforeInserts = + getMetaDataAtTime(durableCatalog, catalogId, tsBeforeMultikeyWrites); + ASSERT_FALSE(getIndexMetaData(mdBeforeInserts, "a_1").multikey); + ASSERT_FALSE(getIndexMetaData(mdBeforeInserts, "b_1").multikey); + + std::shared_ptr<BSONCollectionCatalogEntry::MetaData> mdAfterInserts = + getMetaDataAtTime(durableCatalog, catalogId, getTopOfOplog()); + ASSERT(getIndexMetaData(mdAfterInserts, "a_1").multikey); + ASSERT(getIndexMetaData(mdAfterInserts, "b_1").multikey); +} + class RetryableFindAndModifyTest : public StorageTimestampTest { public: const StringData dbName = "unittest"_sd; diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp index 0f4806c32a4..8cb5ce919ad 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp @@ -791,18 +791,17 @@ StatusWith<std::string> WiredTigerRecordStore::generateCreateString( ss << "split_pct=90,"; ss << "leaf_value_max=64MB,"; if (TestingProctor::instance().isEnabled()) { - if (NamespaceString(ns).isOplog() || - // TODO (SERVER-60754): Remove special handling for setting multikey. - ident.startsWith("_mdb_catalog")) { - + if (NamespaceString(ns).isOplog()) { // For the above clauses we do not assert any particular `write_timestamp_usage`. In // particular for the oplog, WT removes all timestamp information. There's nothing in // MDB's control to assert against. } else if ( // Side table drains are not timestamped. ident.startsWith("internal-") || - // TODO (SERVER-60753): Remove special handling for index build during recovery. - ns == NamespaceString::kIndexBuildEntryNamespace.ns()) { + // TODO (SERVER-60753): Remove special handling for index build during recovery. This + // includes the following _mdb_catalog ident. + ns == NamespaceString::kIndexBuildEntryNamespace.ns() || + ident.startsWith("_mdb_catalog")) { ss << "write_timestamp_usage=mixed_mode,"; } else { ss << "write_timestamp_usage=ordered,"; |