diff options
author | Daniel Gottlieb <daniel.gottlieb@mongodb.com> | 2018-05-14 11:09:50 -0400 |
---|---|---|
committer | Daniel Gottlieb <daniel.gottlieb@mongodb.com> | 2018-05-14 11:09:50 -0400 |
commit | 8e27c2b49ba17fe9b02695efb29a6322b56c2f23 (patch) | |
tree | d8b0682eee38a4655bf8d61b00143df6a1153a4d /src/mongo | |
parent | 074c9725f1e6ad9b005ab08acc1aedf34e537d69 (diff) | |
download | mongo-8e27c2b49ba17fe9b02695efb29a6322b56c2f23.tar.gz |
SERVER-34896: Move timestamping responsibility on index completion to callers.
Previously, `MultiIndexBlockImpl` would use the node's replication member state as
a proxy for whether an index build originated from a `createIndexes` command. This
information was used to determine if the completion of the index needed to be
explicitly timestamped, or whether an oplog entry would timestamp the write.
However, nodes in a primary state can build indexes during primary "catch-up"
or "drain". Those index builds originate from an oplog entry and thus will not
generate a new oplog entry on completion. These index builds need to be
explicitly timestamped.
Diffstat (limited to 'src/mongo')
-rw-r--r-- | src/mongo/db/SConscript | 3 | ||||
-rw-r--r-- | src/mongo/db/catalog/index_create_impl.cpp | 62 | ||||
-rw-r--r-- | src/mongo/db/index_builder.cpp | 44 | ||||
-rw-r--r-- | src/mongo/dbtests/storage_timestamp_tests.cpp | 285 |
4 files changed, 266 insertions, 128 deletions
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript index 046fac61168..23a79c2bb71 100644 --- a/src/mongo/db/SConscript +++ b/src/mongo/db/SConscript @@ -821,6 +821,9 @@ env.Library( '$BUILD_DIR/mongo/db/catalog/index_catalog', '$BUILD_DIR/mongo/db/catalog/index_create', ], + LIBDEPS_PRIVATE=[ + "logical_clock", + ], ) env.Library( diff --git a/src/mongo/db/catalog/index_create_impl.cpp b/src/mongo/db/catalog/index_create_impl.cpp index e3b7bfc4b64..bbdf9c1df16 100644 --- a/src/mongo/db/catalog/index_create_impl.cpp +++ b/src/mongo/db/catalog/index_create_impl.cpp @@ -44,8 +44,8 @@ #include "mongo/db/curop.h" #include "mongo/db/exec/working_set_common.h" #include "mongo/db/index/multikey_paths.h" -#include "mongo/db/logical_clock.h" #include "mongo/db/multi_key_path_tracker.h" +#include "mongo/db/op_observer.h" #include "mongo/db/operation_context.h" #include "mongo/db/query/internal_plans.h" #include "mongo/db/repl/replication_coordinator.h" @@ -60,42 +60,8 @@ #include "mongo/util/quick_exit.h" #include "mongo/util/scopeguard.h" -#include "mongo/db/op_observer.h" - namespace mongo { -namespace { - -/** - * Returns true if writes to the catalog entry for the input namespace require being - * timestamped. A ghost write is when the operation is not committed with an oplog entry and - * implies the caller will look at the logical clock to choose a time to use. - */ -bool requiresGhostCommitTimestamp(OperationContext* opCtx, NamespaceString nss) { - if (!nss.isReplicated() || nss.coll().startsWith("tmp.mr")) { - return false; - } - - auto replCoord = repl::ReplicationCoordinator::get(opCtx); - if (!replCoord->getSettings().usingReplSets()) { - return false; - } - - // Nodes in `startup` may not have yet initialized the `LogicalClock`. Primaries do not need - // ghost writes. Nodes in the `applyOps` (`startup2`) phase of initial sync must not timestamp - // index builds before the `initialDataTimestamp`. 
Nodes doing replication recovery (also - // `startup2`) must timestamp index builds. All replication recovery index builds are - // foregrounded and have a "commit timestamp" set. - const auto memberState = replCoord->getMemberState(); - if (memberState.primary() || memberState.startup() || memberState.startup2()) { - return false; - } - - return opCtx->recoveryUnit()->getCommitTimestamp().isNull(); -} - -} // namespace - using std::unique_ptr; using std::string; using std::endl; @@ -193,9 +159,15 @@ MultiIndexBlockImpl::~MultiIndexBlockImpl() { } auto replCoord = repl::ReplicationCoordinator::get(_opCtx); - if (replCoord->canAcceptWritesForDatabase(_opCtx, "admin")) { - // Primaries must timestamp the failure of an index build (via an op message). - // Secondaries may not fail index builds. + // Nodes building an index on behalf of a user (e.g: `createIndexes`, `applyOps`) may + // fail, removing the existence of the index from the catalog. This update must be + // timestamped. A failure from `createIndexes` should not have a commit timestamp and + // instead write a noop entry. A foreground `applyOps` index build may have a commit + // timestamp already set. + if (_opCtx->recoveryUnit()->getCommitTimestamp().isNull() && + replCoord->canAcceptWritesForDatabase(_opCtx, "admin")) { + + // Make lock acquisition uninterruptible because writing an op message takes a lock. _opCtx->getServiceContext()->getOpObserver()->onOpMessage( _opCtx, BSON("msg" << std::string(str::stream() << "Failing index builds. Coll: " @@ -326,9 +298,11 @@ StatusWith<std::vector<BSONObj>> MultiIndexBlockImpl::init(const std::vector<BSO _backgroundOperation.reset(new BackgroundOperation(ns)); auto replCoord = repl::ReplicationCoordinator::get(_opCtx); - if (replCoord->canAcceptWritesForDatabase(_opCtx, "admin")) { + if (_opCtx->recoveryUnit()->getCommitTimestamp().isNull() && + replCoord->canAcceptWritesForDatabase(_opCtx, "admin")) { // Only primaries must timestamp this write. 
Secondaries run this from within a - // `TimestampBlock` + // `TimestampBlock`. Primaries performing an index build via `applyOps` may have a + // wrapping commit timestamp that will be used instead. _opCtx->getServiceContext()->getOpObserver()->onOpMessage( _opCtx, BSON("msg" << std::string(str::stream() << "Creating indexes. Coll: " << ns))); } @@ -565,14 +539,6 @@ void MultiIndexBlockImpl::commit() { } } - if (requiresGhostCommitTimestamp(_opCtx, _collection->ns())) { - // Only secondaries must explicitly timestamp index completion. Primaries perform this - // write along with an oplog entry. - fassert(50701, - _opCtx->recoveryUnit()->setTimestamp( - LogicalClock::get(_opCtx)->getClusterTime().asTimestamp())); - } - _opCtx->recoveryUnit()->registerChange(new SetNeedToCleanupOnRollback(this)); _needToCleanup = false; } diff --git a/src/mongo/db/index_builder.cpp b/src/mongo/db/index_builder.cpp index d915fc6590b..8dbef6caa25 100644 --- a/src/mongo/db/index_builder.cpp +++ b/src/mongo/db/index_builder.cpp @@ -40,6 +40,7 @@ #include "mongo/db/concurrency/write_conflict_exception.h" #include "mongo/db/curop.h" #include "mongo/db/db_raii.h" +#include "mongo/db/logical_clock.h" #include "mongo/db/repl/timestamp_block.h" #include "mongo/util/assert_util.h" #include "mongo/util/log.h" @@ -52,6 +53,42 @@ using std::endl; AtomicUInt32 IndexBuilder::_indexBuildCount; namespace { + +/** + * Returns true if writes to the catalog entry for the input namespace require being + * timestamped. A ghost write is when the operation is not committed with an oplog entry and + * implies the caller will look at the logical clock to choose a time to use. 
+ */ +bool requiresGhostCommitTimestamp(OperationContext* opCtx, NamespaceString nss) { + if (!nss.isReplicated() || nss.coll().startsWith("tmp.mr")) { + return false; + } + + auto replCoord = repl::ReplicationCoordinator::get(opCtx); + if (!replCoord->getSettings().usingReplSets()) { + return false; + } + + // If there is a commit timestamp already assigned, there's no need to explicitly assign a + // timestamp. This case covers foreground index builds. + if (!opCtx->recoveryUnit()->getCommitTimestamp().isNull()) { + return false; + } + + // Only oplog entries (including a user's `applyOps` command) construct indexes via + // `IndexBuilder`. Nodes in `startup` may not yet have initialized the `LogicalClock`, however + // index builds during startup replication recovery must be timestamped. These index builds + // are foregrounded and timestamp their catalog writes with a "commit timestamp". Nodes in the + // oplog application phase of initial sync (`startup2`) must not timestamp index builds before + // the `initialDataTimestamp`. + const auto memberState = replCoord->getMemberState(); + if (memberState.startup() || memberState.startup2()) { + return false; + } + + return true; +} + // Synchronization tools when replication spawns a background index in a new thread. // The bool is 'true' when a new background index has started in a new thread but the // parent thread has not yet synchronized with it. 
@@ -202,9 +239,14 @@ Status IndexBuilder::_build(OperationContext* opCtx, if (allowBackgroundBuilding) { dbLock->relockWithMode(MODE_X); } - writeConflictRetry(opCtx, "Commit index build", ns.ns(), [opCtx, &indexer] { + writeConflictRetry(opCtx, "Commit index build", ns.ns(), [opCtx, &indexer, &ns] { WriteUnitOfWork wunit(opCtx); indexer.commit(); + if (requiresGhostCommitTimestamp(opCtx, ns)) { + fassert(50701, + opCtx->recoveryUnit()->setTimestamp( + LogicalClock::get(opCtx)->getClusterTime().asTimestamp())); + } wunit.commit(); }); diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp index 1c05ac712e9..bc26bab34a2 100644 --- a/src/mongo/dbtests/storage_timestamp_tests.cpp +++ b/src/mongo/dbtests/storage_timestamp_tests.cpp @@ -73,8 +73,36 @@ namespace mongo { +namespace { +/** + * RAII type for operating at a timestamp. Will remove any timestamping when the object destructs. + */ +class OneOffRead { +public: + OneOffRead(OperationContext* opCtx, const Timestamp& ts) : _opCtx(opCtx) { + _opCtx->recoveryUnit()->abandonSnapshot(); + ASSERT_OK(_opCtx->recoveryUnit()->setPointInTimeReadTimestamp(ts)); + } + + ~OneOffRead() { + _opCtx->recoveryUnit()->abandonSnapshot(); + ASSERT_OK(_opCtx->recoveryUnit()->setPointInTimeReadTimestamp(Timestamp::min())); + } + +private: + OperationContext* _opCtx; +}; +} + const auto kIndexVersion = IndexDescriptor::IndexVersion::kV2; +BSONCollectionCatalogEntry::IndexMetaData getIndexMetaData( + const BSONCollectionCatalogEntry::MetaData& collMetaData, StringData indexName) { + const auto idxOffset = collMetaData.findIndexOffset(indexName); + invariant(idxOffset > -1); + return collMetaData.indexes[idxOffset]; +} + class StorageTimestampTest { public: ServiceContext::UniqueOperationContext _opCtxRaii = cc().makeOperationContext(); @@ -206,8 +234,17 @@ public: return optRecord.get().data.toBson(); } + BSONCollectionCatalogEntry::MetaData getMetaDataAtTime(KVCatalog* kvCatalog, + 
NamespaceString ns, + const Timestamp& ts) { + OneOffRead oor(_opCtx, ts); + return kvCatalog->getMetaData(_opCtx, ns.ns()); + } + StatusWith<BSONObj> doAtomicApplyOps(const std::string& dbName, const std::list<BSONObj>& applyOpsList) { + OneOffRead oor(_opCtx, Timestamp::min()); + BSONObjBuilder result; Status status = applyOps(_opCtx, dbName, @@ -224,6 +261,8 @@ public: // Creates a dummy command operation to persuade `applyOps` to be non-atomic. StatusWith<BSONObj> doNonAtomicApplyOps(const std::string& dbName, const std::list<BSONObj>& applyOpsList) { + OneOffRead oor(_opCtx, Timestamp::min()); + BSONObjBuilder result; Status status = applyOps(_opCtx, dbName, @@ -240,9 +279,8 @@ public: void assertMinValidDocumentAtTimestamp(Collection* coll, const Timestamp& ts, const repl::MinValidDocument& expectedDoc) { - auto recoveryUnit = _opCtx->recoveryUnit(); - recoveryUnit->abandonSnapshot(); - ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(ts)); + OneOffRead oor(_opCtx, ts); + auto doc = repl::MinValidDocument::parse(IDLParserErrorContext("MinValidDocument"), findOne(coll)); ASSERT_EQ(expectedDoc.getMinValidTimestamp(), doc.getMinValidTimestamp()) @@ -265,10 +303,7 @@ public: void assertDocumentAtTimestamp(Collection* coll, const Timestamp& ts, const BSONObj& expectedDoc) { - - auto recoveryUnit = _opCtx->recoveryUnit(); - recoveryUnit->abandonSnapshot(); - ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(ts)); + OneOffRead oor(_opCtx, ts); if (expectedDoc.isEmpty()) { ASSERT_EQ(0, itCount(coll)) << "Should not find any documents in " << coll->ns() << " at ts: " << ts; @@ -290,24 +325,18 @@ public: * Asserts that the given collection is in (or not in) the KVCatalog's list of idents at the * provided timestamp. 
*/ - void assertNamespaceInIdents(OperationContext* opCtx, - NamespaceString nss, - Timestamp ts, - bool shouldExpect) { + void assertNamespaceInIdents(NamespaceString nss, Timestamp ts, bool shouldExpect) { + OneOffRead oor(_opCtx, ts); KVCatalog* kvCatalog = static_cast<KVStorageEngine*>(_opCtx->getServiceContext()->getStorageEngine()) ->getCatalog(); - auto recoveryUnit = _opCtx->recoveryUnit(); - recoveryUnit->abandonSnapshot(); - ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(ts)); - // getCollectionIdent() returns the ident for the given namespace in the KVCatalog. // getAllIdents() actually looks in the RecordStore for a list of all idents, and is thus // versioned by timestamp. These tests do not do any renames, so we can expect the // namespace to have a consistent ident across timestamps, if it exists. auto expectedIdent = kvCatalog->getCollectionIdent(nss.ns()); - auto idents = kvCatalog->getAllIdents(opCtx); + auto idents = kvCatalog->getAllIdents(_opCtx); auto found = std::find(idents.begin(), idents.end(), expectedIdent); if (shouldExpect) { @@ -319,6 +348,8 @@ public: } std::string getNewIndexIdent(KVCatalog* kvCatalog, std::vector<std::string>& origIdents) { + OneOffRead oor(_opCtx, Timestamp::min()); + // Find the collection and index ident by performing a set difference on the original // idents and the current idents. 
std::vector<std::string> identsWithColl = kvCatalog->getAllIdents(_opCtx); @@ -331,7 +362,7 @@ public: origIdents.end(), std::back_inserter(collAndIdxIdents)); - ASSERT(collAndIdxIdents.size() == 1); + ASSERT(collAndIdxIdents.size() == 1) << "Num idents: " << collAndIdxIdents.size(); return collAndIdxIdents[0]; } @@ -366,9 +397,8 @@ public: const std::string& collIdent, const std::string& indexIdent, Timestamp timestamp) { - auto recoveryUnit = _opCtx->recoveryUnit(); - recoveryUnit->abandonSnapshot(); - ASSERT_OK(_opCtx->recoveryUnit()->setPointInTimeReadTimestamp(timestamp)); + OneOffRead oor(_opCtx, timestamp); + auto allIdents = kvCatalog->getAllIdents(_opCtx); if (collIdent.size() > 0) { // Index build test does not pass in a collection ident. @@ -385,9 +415,7 @@ public: const std::string& collIdent, const std::string& indexIdent, Timestamp timestamp) { - auto recoveryUnit = _opCtx->recoveryUnit(); - recoveryUnit->abandonSnapshot(); - ASSERT_OK(_opCtx->recoveryUnit()->setPointInTimeReadTimestamp(timestamp)); + OneOffRead oor(_opCtx, timestamp); auto allIdents = kvCatalog->getAllIdents(_opCtx); if (collIdent.size() > 0) { // Index build test does not pass in a collection ident. 
@@ -422,9 +450,7 @@ public: const MultikeyPaths& expectedMultikeyPaths) { auto catalog = collection->getCatalogEntry(); - auto recoveryUnit = _opCtx->recoveryUnit(); - recoveryUnit->abandonSnapshot(); - ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(ts)); + OneOffRead oor(_opCtx, ts); MultikeyPaths actualMultikeyPaths; if (!shouldBeMultikey) { @@ -495,10 +521,8 @@ public: } for (std::uint32_t idx = 0; idx < docsToInsert; ++idx) { - auto recoveryUnit = _opCtx->recoveryUnit(); - recoveryUnit->abandonSnapshot(); - ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp( - firstInsertTime.addTicks(idx).asTimestamp())); + OneOffRead oor(_opCtx, firstInsertTime.addTicks(idx).asTimestamp()); + BSONObj result; ASSERT(Helpers::getLast(_opCtx, nss.ns().c_str(), result)) << " idx is " << idx; ASSERT_EQ(0, SimpleBSONObjComparator::kInstance.compare(result, BSON("_id" << idx))) @@ -578,10 +602,8 @@ public: for (std::uint32_t idx = 0; idx < docsToInsert; ++idx) { - auto recoveryUnit = _opCtx->recoveryUnit(); - recoveryUnit->abandonSnapshot(); - ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp( - firstInsertTime.addTicks(idx).asTimestamp())); + OneOffRead oor(_opCtx, firstInsertTime.addTicks(idx).asTimestamp()); + BSONObj result; ASSERT(Helpers::getLast(_opCtx, nss.ns().c_str(), result)) << " idx is " << idx; ASSERT_EQ(0, SimpleBSONObjComparator::kInstance.compare(result, BSON("_id" << idx))) @@ -645,10 +667,7 @@ public: for (std::int32_t num = 0; num <= docsToInsert; ++num) { // The first loop queries at `lastInsertTime` and should count all documents. Querying // at each successive tick counts one less document. 
- auto recoveryUnit = _opCtx->recoveryUnit(); - recoveryUnit->abandonSnapshot(); - ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp( - lastInsertTime.addTicks(num).asTimestamp())); + OneOffRead oor(_opCtx, lastInsertTime.addTicks(num).asTimestamp()); ASSERT_EQ(docsToInsert - num, itCount(autoColl.getCollection())); } } @@ -722,10 +741,7 @@ public: for (std::size_t idx = 0; idx < updates.size(); ++idx) { // Querying at each successive ticks after `insertTime` sees the document transform in // the series. - auto recoveryUnit = _opCtx->recoveryUnit(); - recoveryUnit->abandonSnapshot(); - ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp( - insertTime.addTicks(idx + 1).asTimestamp())); + OneOffRead oor(_opCtx, insertTime.addTicks(idx + 1).asTimestamp()); auto doc = findOne(autoColl.getCollection()); ASSERT_EQ(0, SimpleBSONObjComparator::kInstance.compare(doc, updates[idx].second)) @@ -951,10 +967,10 @@ public: { ASSERT(AutoGetCollectionForReadCommand(_opCtx, nss).getCollection()); } - assertNamespaceInIdents(_opCtx, nss, pastTs, false); - assertNamespaceInIdents(_opCtx, nss, presentTs, true); - assertNamespaceInIdents(_opCtx, nss, futureTs, true); - assertNamespaceInIdents(_opCtx, nss, nullTs, true); + assertNamespaceInIdents(nss, pastTs, false); + assertNamespaceInIdents(nss, presentTs, true); + assertNamespaceInIdents(nss, futureTs, true); + assertNamespaceInIdents(nss, nullTs, true); } }; @@ -1006,17 +1022,17 @@ public: { ASSERT(AutoGetCollectionForReadCommand(_opCtx, nss1).getCollection()); } { ASSERT(AutoGetCollectionForReadCommand(_opCtx, nss2).getCollection()); } - assertNamespaceInIdents(_opCtx, nss1, pastTs, false); - assertNamespaceInIdents(_opCtx, nss1, presentTs, true); - assertNamespaceInIdents(_opCtx, nss1, futureTs, true); - assertNamespaceInIdents(_opCtx, nss1, dummyTs, true); - assertNamespaceInIdents(_opCtx, nss1, nullTs, true); - - assertNamespaceInIdents(_opCtx, nss2, pastTs, false); - assertNamespaceInIdents(_opCtx, nss2, presentTs, false); - 
assertNamespaceInIdents(_opCtx, nss2, futureTs, true); - assertNamespaceInIdents(_opCtx, nss2, dummyTs, true); - assertNamespaceInIdents(_opCtx, nss2, nullTs, true); + assertNamespaceInIdents(nss1, pastTs, false); + assertNamespaceInIdents(nss1, presentTs, true); + assertNamespaceInIdents(nss1, futureTs, true); + assertNamespaceInIdents(nss1, dummyTs, true); + assertNamespaceInIdents(nss1, nullTs, true); + + assertNamespaceInIdents(nss2, pastTs, false); + assertNamespaceInIdents(nss2, presentTs, false); + assertNamespaceInIdents(nss2, futureTs, true); + assertNamespaceInIdents(nss2, dummyTs, true); + assertNamespaceInIdents(nss2, nullTs, true); } }; @@ -1099,12 +1115,12 @@ public: assertDocumentAtTimestamp(coll1, dummyTs, doc1); assertDocumentAtTimestamp(coll1, nullTs, doc1); - assertNamespaceInIdents(_opCtx, nss2, pastTs, false); - assertNamespaceInIdents(_opCtx, nss2, presentTs, false); - assertNamespaceInIdents(_opCtx, nss2, futureTs, true); - assertNamespaceInIdents(_opCtx, nss2, insert2Ts, true); - assertNamespaceInIdents(_opCtx, nss2, dummyTs, true); - assertNamespaceInIdents(_opCtx, nss2, nullTs, true); + assertNamespaceInIdents(nss2, pastTs, false); + assertNamespaceInIdents(nss2, presentTs, false); + assertNamespaceInIdents(nss2, futureTs, true); + assertNamespaceInIdents(nss2, insert2Ts, true); + assertNamespaceInIdents(nss2, dummyTs, true); + assertNamespaceInIdents(nss2, nullTs, true); assertDocumentAtTimestamp(coll2, pastTs, BSONObj()); assertDocumentAtTimestamp(coll2, presentTs, BSONObj()); @@ -1156,10 +1172,10 @@ public: ASSERT_EQ(op.getNamespace().ns(), nss.getCommandNS().ns()) << op.toBSON(); ASSERT_BSONOBJ_EQ(op.getObject(), BSON("create" << nss.coll())); - assertNamespaceInIdents(_opCtx, nss, pastTs, false); - assertNamespaceInIdents(_opCtx, nss, presentTs, false); - assertNamespaceInIdents(_opCtx, nss, futureTs, true); - assertNamespaceInIdents(_opCtx, nss, nullTs, true); + assertNamespaceInIdents(nss, pastTs, false); + 
assertNamespaceInIdents(nss, presentTs, false); + assertNamespaceInIdents(nss, futureTs, true); + assertNamespaceInIdents(nss, nullTs, true); } }; @@ -1771,14 +1787,18 @@ public: { WriteUnitOfWork wuow(_opCtx); - // Primaries will perform no timestamping in the indexer's commit. Secondaries will - // look at the logical clock. + // All callers of `MultiIndexBlock::commit` are responsible for timestamping index + // completion. Primaries write an oplog entry. Secondaries explicitly set a + // timestamp. indexer.commit(); if (SimulatePrimary) { // The op observer is not called from the index builder, but rather the // `createIndexes` command. _opCtx->getServiceContext()->getOpObserver()->onCreateIndex( _opCtx, nss, autoColl.getCollection()->uuid(), indexInfoObj, false); + } else { + ASSERT_OK( + _opCtx->recoveryUnit()->setTimestamp(_clock->getClusterTime().asTimestamp())); } wuow.commit(); } @@ -1791,23 +1811,19 @@ public: // Assert that the index entry exists after init and `ready: false`. assertIdentsExistAtTimestamp(kvCatalog, "", indexIdent, afterIndexInit.asTimestamp()); { - _opCtx->recoveryUnit()->abandonSnapshot(); - ASSERT_OK( - _opCtx->recoveryUnit()->setPointInTimeReadTimestamp(afterIndexInit.asTimestamp())); - auto collMetaData = kvCatalog->getMetaData(_opCtx, nss.ns()); - auto indexMetaData = collMetaData.indexes[collMetaData.findIndexOffset("a_1")]; - ASSERT_FALSE(indexMetaData.ready); + ASSERT_FALSE(getIndexMetaData( + getMetaDataAtTime(kvCatalog, nss, afterIndexInit.asTimestamp()), "a_1") + .ready); } // After the build completes, assert that the index is `ready: true` and multikey. 
assertIdentsExistAtTimestamp(kvCatalog, "", indexIdent, afterIndexBuild); { - _opCtx->recoveryUnit()->abandonSnapshot(); - ASSERT_OK(_opCtx->recoveryUnit()->setPointInTimeReadTimestamp(afterIndexBuild)); - auto collMetaData = kvCatalog->getMetaData(_opCtx, nss.ns()); - auto indexMetaData = collMetaData.indexes[collMetaData.findIndexOffset("a_1")]; + auto indexMetaData = + getIndexMetaData(getMetaDataAtTime(kvCatalog, nss, afterIndexBuild), "a_1"); ASSERT(indexMetaData.ready); ASSERT(indexMetaData.multikey); + ASSERT_EQ(std::size_t(1), indexMetaData.multikeyPaths.size()); const bool match = indexMetaData.multikeyPaths[0] == std::set<std::size_t>({0}); if (!match) { @@ -1917,6 +1933,114 @@ public: } }; +/** + * There are a few scenarios where a primary will be using the IndexBuilder thread to build + * indexes. Specifically, when a primary builds an index from an oplog entry which can happen on + * primary catch-up, drain, a secondary step-up or `applyOps`. + * + * This test will exercise IndexBuilder code on primaries by performing a background index build + * via an `applyOps` command. + */ +template <bool Foreground> +class TimestampIndexBuilderOnPrimary : public StorageTimestampTest { +public: + void run() { + // Only run on 'wiredTiger'. No other storage engines to-date support timestamp writes. + if (mongo::storageGlobalParams.engine != "wiredTiger") { + return; + } + + // In order for applyOps to assign timestamps, we must be in non-replicated mode. + repl::UnreplicatedWritesBlock uwb(_opCtx); + + std::string dbName = "unittest"; + NamespaceString nss(dbName, "indexBuilderOnPrimary"); + BSONObj doc = BSON("_id" << 1 << "field" << 1); + + const LogicalTime setupStart = _clock->reserveTicks(1); + + UUID collUUID = UUID::gen(); + { + // Create the collection and insert a document. 
+ reset(nss); + AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_X, LockMode::MODE_IX); + collUUID = *(autoColl.getCollection()->uuid()); + WriteUnitOfWork wuow(_opCtx); + insertDocument(autoColl.getCollection(), + InsertStatement(doc, setupStart.asTimestamp(), 0LL)); + wuow.commit(); + } + + + { + // Sanity check everything exists. + AutoGetCollectionForReadCommand autoColl(_opCtx, nss); + auto coll = autoColl.getCollection(); + ASSERT(coll); + + const auto presentTs = _clock->getClusterTime().asTimestamp(); + assertDocumentAtTimestamp(coll, presentTs, doc); + } + + { + // Create a background index via `applyOps`. We will timestamp the beginning at + // `startBuildTs` and the end, due to manipulation of the logical clock, should be + // timestamped at `endBuildTs`. + const auto beforeBuildTime = _clock->reserveTicks(3); + const auto startBuildTs = beforeBuildTime.addTicks(1).asTimestamp(); + const auto endBuildTs = beforeBuildTime.addTicks(3).asTimestamp(); + + // Grab the existing idents to identify the ident created by the index build. + auto kvStorageEngine = + dynamic_cast<KVStorageEngine*>(_opCtx->getServiceContext()->getStorageEngine()); + KVCatalog* kvCatalog = kvStorageEngine->getCatalog(); + std::vector<std::string> origIdents = kvCatalog->getAllIdents(_opCtx); + + auto indexSpec = BSON("createIndexes" << nss.coll() << "ns" << nss.ns() << "v" + << static_cast<int>(kIndexVersion) + << "key" + << BSON("field" << 1) + << "name" + << "field_1" + << "background" + << (Foreground ? 
false : true)); + + auto createIndexOp = + BSON("ts" << startBuildTs << "t" << 1LL << "h" << 0xBEEFBEEFLL << "v" << 2 << "op" + << "c" + << "ns" + << nss.getCommandNS().ns() + << "ui" + << collUUID + << "o" + << indexSpec); + + ASSERT_OK(doAtomicApplyOps(nss.db().toString(), {createIndexOp})); + + const std::string indexIdent = getNewIndexIdent(kvCatalog, origIdents); + assertIdentsMissingAtTimestamp( + kvCatalog, "", indexIdent, beforeBuildTime.asTimestamp()); + assertIdentsExistAtTimestamp(kvCatalog, "", indexIdent, startBuildTs); + if (Foreground) { + // In the Foreground case, the index build should start and finish at + // `startBuildTs`. + ASSERT_TRUE( + getIndexMetaData(getMetaDataAtTime(kvCatalog, nss, startBuildTs), "field_1") + .ready); + } else { + // In the Background case, the index build should not be "ready" at `startBuildTs`. + ASSERT_FALSE( + getIndexMetaData(getMetaDataAtTime(kvCatalog, nss, startBuildTs), "field_1") + .ready); + assertIdentsExistAtTimestamp(kvCatalog, "", indexIdent, endBuildTs); + ASSERT_TRUE( + getIndexMetaData(getMetaDataAtTime(kvCatalog, nss, endBuildTs), "field_1") + .ready); + } + } + } +}; + class AllStorageTimestampTests : public unittest::Suite { public: @@ -1945,6 +2069,9 @@ public: // TimestampIndexBuilds<SimulatePrimary> add<TimestampIndexBuilds<false>>(); add<TimestampIndexBuilds<true>>(); + // TimestampIndexBuilderOnPrimary<Background> + add<TimestampIndexBuilderOnPrimary<false>>(); + add<TimestampIndexBuilderOnPrimary<true>>(); add<SecondaryReadsDuringBatchApplicationAreAllowed>(); } }; |