summaryrefslogtreecommitdiff
path: root/src/mongo
diff options
context:
space:
mode:
authorDaniel Gottlieb <daniel.gottlieb@mongodb.com>2018-05-14 11:09:50 -0400
committerDaniel Gottlieb <daniel.gottlieb@mongodb.com>2018-05-14 11:09:50 -0400
commit8e27c2b49ba17fe9b02695efb29a6322b56c2f23 (patch)
treed8b0682eee38a4655bf8d61b00143df6a1153a4d /src/mongo
parent074c9725f1e6ad9b005ab08acc1aedf34e537d69 (diff)
downloadmongo-8e27c2b49ba17fe9b02695efb29a6322b56c2f23.tar.gz
SERVER-34896: Move timestamping responsibility on index completion to callers.
Previously, `MultiIndexBlockImpl` would use member state as a proxy for whether an index build originated from a `createIndexes` command. This information was used to determine if the completion of the index needed to be explicitly timestamped, or whether an oplog entry would timestamp the write. However, nodes in a primary state can build indexes during primary "catch-up" or "drain". Those index builds originated from an oplog entry and thus will not generate one on completion. These index builds need to be explicitly timestamped.
Diffstat (limited to 'src/mongo')
-rw-r--r--src/mongo/db/SConscript3
-rw-r--r--src/mongo/db/catalog/index_create_impl.cpp62
-rw-r--r--src/mongo/db/index_builder.cpp44
-rw-r--r--src/mongo/dbtests/storage_timestamp_tests.cpp285
4 files changed, 266 insertions, 128 deletions
diff --git a/src/mongo/db/SConscript b/src/mongo/db/SConscript
index 046fac61168..23a79c2bb71 100644
--- a/src/mongo/db/SConscript
+++ b/src/mongo/db/SConscript
@@ -821,6 +821,9 @@ env.Library(
'$BUILD_DIR/mongo/db/catalog/index_catalog',
'$BUILD_DIR/mongo/db/catalog/index_create',
],
+ LIBDEPS_PRIVATE=[
+ "logical_clock",
+ ],
)
env.Library(
diff --git a/src/mongo/db/catalog/index_create_impl.cpp b/src/mongo/db/catalog/index_create_impl.cpp
index e3b7bfc4b64..bbdf9c1df16 100644
--- a/src/mongo/db/catalog/index_create_impl.cpp
+++ b/src/mongo/db/catalog/index_create_impl.cpp
@@ -44,8 +44,8 @@
#include "mongo/db/curop.h"
#include "mongo/db/exec/working_set_common.h"
#include "mongo/db/index/multikey_paths.h"
-#include "mongo/db/logical_clock.h"
#include "mongo/db/multi_key_path_tracker.h"
+#include "mongo/db/op_observer.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/query/internal_plans.h"
#include "mongo/db/repl/replication_coordinator.h"
@@ -60,42 +60,8 @@
#include "mongo/util/quick_exit.h"
#include "mongo/util/scopeguard.h"
-#include "mongo/db/op_observer.h"
-
namespace mongo {
-namespace {
-
-/**
- * Returns true if writes to the catalog entry for the input namespace require being
- * timestamped. A ghost write is when the operation is not committed with an oplog entry and
- * implies the caller will look at the logical clock to choose a time to use.
- */
-bool requiresGhostCommitTimestamp(OperationContext* opCtx, NamespaceString nss) {
- if (!nss.isReplicated() || nss.coll().startsWith("tmp.mr")) {
- return false;
- }
-
- auto replCoord = repl::ReplicationCoordinator::get(opCtx);
- if (!replCoord->getSettings().usingReplSets()) {
- return false;
- }
-
- // Nodes in `startup` may not have yet initialized the `LogicalClock`. Primaries do not need
- // ghost writes. Nodes in the `applyOps` (`startup2`) phase of initial sync must not timestamp
- // index builds before the `initialDataTimestamp`. Nodes doing replication recovery (also
- // `startup2`) must timestamp index builds. All replication recovery index builds are
- // foregrounded and have a "commit timestamp" set.
- const auto memberState = replCoord->getMemberState();
- if (memberState.primary() || memberState.startup() || memberState.startup2()) {
- return false;
- }
-
- return opCtx->recoveryUnit()->getCommitTimestamp().isNull();
-}
-
-} // namespace
-
using std::unique_ptr;
using std::string;
using std::endl;
@@ -193,9 +159,15 @@ MultiIndexBlockImpl::~MultiIndexBlockImpl() {
}
auto replCoord = repl::ReplicationCoordinator::get(_opCtx);
- if (replCoord->canAcceptWritesForDatabase(_opCtx, "admin")) {
- // Primaries must timestamp the failure of an index build (via an op message).
- // Secondaries may not fail index builds.
+ // Nodes building an index on behalf of a user (e.g., `createIndexes`, `applyOps`) may
+ // fail, removing the existence of the index from the catalog. This update must be
+ // timestamped. A failure from `createIndexes` should not have a commit timestamp and
+ // instead write a noop entry. A foreground `applyOps` index build may have a commit
+ // timestamp already set.
+ if (_opCtx->recoveryUnit()->getCommitTimestamp().isNull() &&
+ replCoord->canAcceptWritesForDatabase(_opCtx, "admin")) {
+
+ // Make lock acquisition uninterruptible because writing an op message takes a lock.
_opCtx->getServiceContext()->getOpObserver()->onOpMessage(
_opCtx,
BSON("msg" << std::string(str::stream() << "Failing index builds. Coll: "
@@ -326,9 +298,11 @@ StatusWith<std::vector<BSONObj>> MultiIndexBlockImpl::init(const std::vector<BSO
_backgroundOperation.reset(new BackgroundOperation(ns));
auto replCoord = repl::ReplicationCoordinator::get(_opCtx);
- if (replCoord->canAcceptWritesForDatabase(_opCtx, "admin")) {
+ if (_opCtx->recoveryUnit()->getCommitTimestamp().isNull() &&
+ replCoord->canAcceptWritesForDatabase(_opCtx, "admin")) {
// Only primaries must timestamp this write. Secondaries run this from within a
- // `TimestampBlock`
+ // `TimestampBlock`. Primaries performing an index build via `applyOps` may have a
+ // wrapping commit timestamp that will be used instead.
_opCtx->getServiceContext()->getOpObserver()->onOpMessage(
_opCtx, BSON("msg" << std::string(str::stream() << "Creating indexes. Coll: " << ns)));
}
@@ -565,14 +539,6 @@ void MultiIndexBlockImpl::commit() {
}
}
- if (requiresGhostCommitTimestamp(_opCtx, _collection->ns())) {
- // Only secondaries must explicitly timestamp index completion. Primaries perform this
- // write along with an oplog entry.
- fassert(50701,
- _opCtx->recoveryUnit()->setTimestamp(
- LogicalClock::get(_opCtx)->getClusterTime().asTimestamp()));
- }
-
_opCtx->recoveryUnit()->registerChange(new SetNeedToCleanupOnRollback(this));
_needToCleanup = false;
}
diff --git a/src/mongo/db/index_builder.cpp b/src/mongo/db/index_builder.cpp
index d915fc6590b..8dbef6caa25 100644
--- a/src/mongo/db/index_builder.cpp
+++ b/src/mongo/db/index_builder.cpp
@@ -40,6 +40,7 @@
#include "mongo/db/concurrency/write_conflict_exception.h"
#include "mongo/db/curop.h"
#include "mongo/db/db_raii.h"
+#include "mongo/db/logical_clock.h"
#include "mongo/db/repl/timestamp_block.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/log.h"
@@ -52,6 +53,42 @@ using std::endl;
AtomicUInt32 IndexBuilder::_indexBuildCount;
namespace {
+
+/**
+ * Returns true if writes to the catalog entry for the input namespace require being
+ * timestamped. A ghost write is when the operation is not committed with an oplog entry and
+ * implies the caller will look at the logical clock to choose a time to use.
+ */
+bool requiresGhostCommitTimestamp(OperationContext* opCtx, NamespaceString nss) {
+ if (!nss.isReplicated() || nss.coll().startsWith("tmp.mr")) {
+ return false;
+ }
+
+ auto replCoord = repl::ReplicationCoordinator::get(opCtx);
+ if (!replCoord->getSettings().usingReplSets()) {
+ return false;
+ }
+
+ // If there is a commit timestamp already assigned, there's no need to explicitly assign a
+ // timestamp. This case covers foreground index builds.
+ if (!opCtx->recoveryUnit()->getCommitTimestamp().isNull()) {
+ return false;
+ }
+
+ // Only oplog entries (including a user's `applyOps` command) construct indexes via
+ `IndexBuilder`. Nodes in `startup` may not yet have initialized the `LogicalClock`; however,
+ // index builds during startup replication recovery must be timestamped. These index builds
+ // are foregrounded and timestamp their catalog writes with a "commit timestamp". Nodes in the
+ // oplog application phase of initial sync (`startup2`) must not timestamp index builds before
+ // the `initialDataTimestamp`.
+ const auto memberState = replCoord->getMemberState();
+ if (memberState.startup() || memberState.startup2()) {
+ return false;
+ }
+
+ return true;
+}
+
// Synchronization tools when replication spawns a background index in a new thread.
// The bool is 'true' when a new background index has started in a new thread but the
// parent thread has not yet synchronized with it.
@@ -202,9 +239,14 @@ Status IndexBuilder::_build(OperationContext* opCtx,
if (allowBackgroundBuilding) {
dbLock->relockWithMode(MODE_X);
}
- writeConflictRetry(opCtx, "Commit index build", ns.ns(), [opCtx, &indexer] {
+ writeConflictRetry(opCtx, "Commit index build", ns.ns(), [opCtx, &indexer, &ns] {
WriteUnitOfWork wunit(opCtx);
indexer.commit();
+ if (requiresGhostCommitTimestamp(opCtx, ns)) {
+ fassert(50701,
+ opCtx->recoveryUnit()->setTimestamp(
+ LogicalClock::get(opCtx)->getClusterTime().asTimestamp()));
+ }
wunit.commit();
});
diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp
index 1c05ac712e9..bc26bab34a2 100644
--- a/src/mongo/dbtests/storage_timestamp_tests.cpp
+++ b/src/mongo/dbtests/storage_timestamp_tests.cpp
@@ -73,8 +73,36 @@
namespace mongo {
+namespace {
+/**
+ * RAII type for operating at a timestamp. Will remove any timestamping when the object destructs.
+ */
+class OneOffRead {
+public:
+ OneOffRead(OperationContext* opCtx, const Timestamp& ts) : _opCtx(opCtx) {
+ _opCtx->recoveryUnit()->abandonSnapshot();
+ ASSERT_OK(_opCtx->recoveryUnit()->setPointInTimeReadTimestamp(ts));
+ }
+
+ ~OneOffRead() {
+ _opCtx->recoveryUnit()->abandonSnapshot();
+ ASSERT_OK(_opCtx->recoveryUnit()->setPointInTimeReadTimestamp(Timestamp::min()));
+ }
+
+private:
+ OperationContext* _opCtx;
+};
+}
+
const auto kIndexVersion = IndexDescriptor::IndexVersion::kV2;
+BSONCollectionCatalogEntry::IndexMetaData getIndexMetaData(
+ const BSONCollectionCatalogEntry::MetaData& collMetaData, StringData indexName) {
+ const auto idxOffset = collMetaData.findIndexOffset(indexName);
+ invariant(idxOffset > -1);
+ return collMetaData.indexes[idxOffset];
+}
+
class StorageTimestampTest {
public:
ServiceContext::UniqueOperationContext _opCtxRaii = cc().makeOperationContext();
@@ -206,8 +234,17 @@ public:
return optRecord.get().data.toBson();
}
+ BSONCollectionCatalogEntry::MetaData getMetaDataAtTime(KVCatalog* kvCatalog,
+ NamespaceString ns,
+ const Timestamp& ts) {
+ OneOffRead oor(_opCtx, ts);
+ return kvCatalog->getMetaData(_opCtx, ns.ns());
+ }
+
StatusWith<BSONObj> doAtomicApplyOps(const std::string& dbName,
const std::list<BSONObj>& applyOpsList) {
+ OneOffRead oor(_opCtx, Timestamp::min());
+
BSONObjBuilder result;
Status status = applyOps(_opCtx,
dbName,
@@ -224,6 +261,8 @@ public:
// Creates a dummy command operation to persuade `applyOps` to be non-atomic.
StatusWith<BSONObj> doNonAtomicApplyOps(const std::string& dbName,
const std::list<BSONObj>& applyOpsList) {
+ OneOffRead oor(_opCtx, Timestamp::min());
+
BSONObjBuilder result;
Status status = applyOps(_opCtx,
dbName,
@@ -240,9 +279,8 @@ public:
void assertMinValidDocumentAtTimestamp(Collection* coll,
const Timestamp& ts,
const repl::MinValidDocument& expectedDoc) {
- auto recoveryUnit = _opCtx->recoveryUnit();
- recoveryUnit->abandonSnapshot();
- ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(ts));
+ OneOffRead oor(_opCtx, ts);
+
auto doc =
repl::MinValidDocument::parse(IDLParserErrorContext("MinValidDocument"), findOne(coll));
ASSERT_EQ(expectedDoc.getMinValidTimestamp(), doc.getMinValidTimestamp())
@@ -265,10 +303,7 @@ public:
void assertDocumentAtTimestamp(Collection* coll,
const Timestamp& ts,
const BSONObj& expectedDoc) {
-
- auto recoveryUnit = _opCtx->recoveryUnit();
- recoveryUnit->abandonSnapshot();
- ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(ts));
+ OneOffRead oor(_opCtx, ts);
if (expectedDoc.isEmpty()) {
ASSERT_EQ(0, itCount(coll)) << "Should not find any documents in " << coll->ns()
<< " at ts: " << ts;
@@ -290,24 +325,18 @@ public:
* Asserts that the given collection is in (or not in) the KVCatalog's list of idents at the
* provided timestamp.
*/
- void assertNamespaceInIdents(OperationContext* opCtx,
- NamespaceString nss,
- Timestamp ts,
- bool shouldExpect) {
+ void assertNamespaceInIdents(NamespaceString nss, Timestamp ts, bool shouldExpect) {
+ OneOffRead oor(_opCtx, ts);
KVCatalog* kvCatalog =
static_cast<KVStorageEngine*>(_opCtx->getServiceContext()->getStorageEngine())
->getCatalog();
- auto recoveryUnit = _opCtx->recoveryUnit();
- recoveryUnit->abandonSnapshot();
- ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(ts));
-
// getCollectionIdent() returns the ident for the given namespace in the KVCatalog.
// getAllIdents() actually looks in the RecordStore for a list of all idents, and is thus
// versioned by timestamp. These tests do not do any renames, so we can expect the
// namespace to have a consistent ident across timestamps, if it exists.
auto expectedIdent = kvCatalog->getCollectionIdent(nss.ns());
- auto idents = kvCatalog->getAllIdents(opCtx);
+ auto idents = kvCatalog->getAllIdents(_opCtx);
auto found = std::find(idents.begin(), idents.end(), expectedIdent);
if (shouldExpect) {
@@ -319,6 +348,8 @@ public:
}
std::string getNewIndexIdent(KVCatalog* kvCatalog, std::vector<std::string>& origIdents) {
+ OneOffRead oor(_opCtx, Timestamp::min());
+
// Find the collection and index ident by performing a set difference on the original
// idents and the current idents.
std::vector<std::string> identsWithColl = kvCatalog->getAllIdents(_opCtx);
@@ -331,7 +362,7 @@ public:
origIdents.end(),
std::back_inserter(collAndIdxIdents));
- ASSERT(collAndIdxIdents.size() == 1);
+ ASSERT(collAndIdxIdents.size() == 1) << "Num idents: " << collAndIdxIdents.size();
return collAndIdxIdents[0];
}
@@ -366,9 +397,8 @@ public:
const std::string& collIdent,
const std::string& indexIdent,
Timestamp timestamp) {
- auto recoveryUnit = _opCtx->recoveryUnit();
- recoveryUnit->abandonSnapshot();
- ASSERT_OK(_opCtx->recoveryUnit()->setPointInTimeReadTimestamp(timestamp));
+ OneOffRead oor(_opCtx, timestamp);
+
auto allIdents = kvCatalog->getAllIdents(_opCtx);
if (collIdent.size() > 0) {
// Index build test does not pass in a collection ident.
@@ -385,9 +415,7 @@ public:
const std::string& collIdent,
const std::string& indexIdent,
Timestamp timestamp) {
- auto recoveryUnit = _opCtx->recoveryUnit();
- recoveryUnit->abandonSnapshot();
- ASSERT_OK(_opCtx->recoveryUnit()->setPointInTimeReadTimestamp(timestamp));
+ OneOffRead oor(_opCtx, timestamp);
auto allIdents = kvCatalog->getAllIdents(_opCtx);
if (collIdent.size() > 0) {
// Index build test does not pass in a collection ident.
@@ -422,9 +450,7 @@ public:
const MultikeyPaths& expectedMultikeyPaths) {
auto catalog = collection->getCatalogEntry();
- auto recoveryUnit = _opCtx->recoveryUnit();
- recoveryUnit->abandonSnapshot();
- ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(ts));
+ OneOffRead oor(_opCtx, ts);
MultikeyPaths actualMultikeyPaths;
if (!shouldBeMultikey) {
@@ -495,10 +521,8 @@ public:
}
for (std::uint32_t idx = 0; idx < docsToInsert; ++idx) {
- auto recoveryUnit = _opCtx->recoveryUnit();
- recoveryUnit->abandonSnapshot();
- ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(
- firstInsertTime.addTicks(idx).asTimestamp()));
+ OneOffRead oor(_opCtx, firstInsertTime.addTicks(idx).asTimestamp());
+
BSONObj result;
ASSERT(Helpers::getLast(_opCtx, nss.ns().c_str(), result)) << " idx is " << idx;
ASSERT_EQ(0, SimpleBSONObjComparator::kInstance.compare(result, BSON("_id" << idx)))
@@ -578,10 +602,8 @@ public:
for (std::uint32_t idx = 0; idx < docsToInsert; ++idx) {
- auto recoveryUnit = _opCtx->recoveryUnit();
- recoveryUnit->abandonSnapshot();
- ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(
- firstInsertTime.addTicks(idx).asTimestamp()));
+ OneOffRead oor(_opCtx, firstInsertTime.addTicks(idx).asTimestamp());
+
BSONObj result;
ASSERT(Helpers::getLast(_opCtx, nss.ns().c_str(), result)) << " idx is " << idx;
ASSERT_EQ(0, SimpleBSONObjComparator::kInstance.compare(result, BSON("_id" << idx)))
@@ -645,10 +667,7 @@ public:
for (std::int32_t num = 0; num <= docsToInsert; ++num) {
// The first loop queries at `lastInsertTime` and should count all documents. Querying
// at each successive tick counts one less document.
- auto recoveryUnit = _opCtx->recoveryUnit();
- recoveryUnit->abandonSnapshot();
- ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(
- lastInsertTime.addTicks(num).asTimestamp()));
+ OneOffRead oor(_opCtx, lastInsertTime.addTicks(num).asTimestamp());
ASSERT_EQ(docsToInsert - num, itCount(autoColl.getCollection()));
}
}
@@ -722,10 +741,7 @@ public:
for (std::size_t idx = 0; idx < updates.size(); ++idx) {
// Querying at each successive ticks after `insertTime` sees the document transform in
// the series.
- auto recoveryUnit = _opCtx->recoveryUnit();
- recoveryUnit->abandonSnapshot();
- ASSERT_OK(recoveryUnit->setPointInTimeReadTimestamp(
- insertTime.addTicks(idx + 1).asTimestamp()));
+ OneOffRead oor(_opCtx, insertTime.addTicks(idx + 1).asTimestamp());
auto doc = findOne(autoColl.getCollection());
ASSERT_EQ(0, SimpleBSONObjComparator::kInstance.compare(doc, updates[idx].second))
@@ -951,10 +967,10 @@ public:
{ ASSERT(AutoGetCollectionForReadCommand(_opCtx, nss).getCollection()); }
- assertNamespaceInIdents(_opCtx, nss, pastTs, false);
- assertNamespaceInIdents(_opCtx, nss, presentTs, true);
- assertNamespaceInIdents(_opCtx, nss, futureTs, true);
- assertNamespaceInIdents(_opCtx, nss, nullTs, true);
+ assertNamespaceInIdents(nss, pastTs, false);
+ assertNamespaceInIdents(nss, presentTs, true);
+ assertNamespaceInIdents(nss, futureTs, true);
+ assertNamespaceInIdents(nss, nullTs, true);
}
};
@@ -1006,17 +1022,17 @@ public:
{ ASSERT(AutoGetCollectionForReadCommand(_opCtx, nss1).getCollection()); }
{ ASSERT(AutoGetCollectionForReadCommand(_opCtx, nss2).getCollection()); }
- assertNamespaceInIdents(_opCtx, nss1, pastTs, false);
- assertNamespaceInIdents(_opCtx, nss1, presentTs, true);
- assertNamespaceInIdents(_opCtx, nss1, futureTs, true);
- assertNamespaceInIdents(_opCtx, nss1, dummyTs, true);
- assertNamespaceInIdents(_opCtx, nss1, nullTs, true);
-
- assertNamespaceInIdents(_opCtx, nss2, pastTs, false);
- assertNamespaceInIdents(_opCtx, nss2, presentTs, false);
- assertNamespaceInIdents(_opCtx, nss2, futureTs, true);
- assertNamespaceInIdents(_opCtx, nss2, dummyTs, true);
- assertNamespaceInIdents(_opCtx, nss2, nullTs, true);
+ assertNamespaceInIdents(nss1, pastTs, false);
+ assertNamespaceInIdents(nss1, presentTs, true);
+ assertNamespaceInIdents(nss1, futureTs, true);
+ assertNamespaceInIdents(nss1, dummyTs, true);
+ assertNamespaceInIdents(nss1, nullTs, true);
+
+ assertNamespaceInIdents(nss2, pastTs, false);
+ assertNamespaceInIdents(nss2, presentTs, false);
+ assertNamespaceInIdents(nss2, futureTs, true);
+ assertNamespaceInIdents(nss2, dummyTs, true);
+ assertNamespaceInIdents(nss2, nullTs, true);
}
};
@@ -1099,12 +1115,12 @@ public:
assertDocumentAtTimestamp(coll1, dummyTs, doc1);
assertDocumentAtTimestamp(coll1, nullTs, doc1);
- assertNamespaceInIdents(_opCtx, nss2, pastTs, false);
- assertNamespaceInIdents(_opCtx, nss2, presentTs, false);
- assertNamespaceInIdents(_opCtx, nss2, futureTs, true);
- assertNamespaceInIdents(_opCtx, nss2, insert2Ts, true);
- assertNamespaceInIdents(_opCtx, nss2, dummyTs, true);
- assertNamespaceInIdents(_opCtx, nss2, nullTs, true);
+ assertNamespaceInIdents(nss2, pastTs, false);
+ assertNamespaceInIdents(nss2, presentTs, false);
+ assertNamespaceInIdents(nss2, futureTs, true);
+ assertNamespaceInIdents(nss2, insert2Ts, true);
+ assertNamespaceInIdents(nss2, dummyTs, true);
+ assertNamespaceInIdents(nss2, nullTs, true);
assertDocumentAtTimestamp(coll2, pastTs, BSONObj());
assertDocumentAtTimestamp(coll2, presentTs, BSONObj());
@@ -1156,10 +1172,10 @@ public:
ASSERT_EQ(op.getNamespace().ns(), nss.getCommandNS().ns()) << op.toBSON();
ASSERT_BSONOBJ_EQ(op.getObject(), BSON("create" << nss.coll()));
- assertNamespaceInIdents(_opCtx, nss, pastTs, false);
- assertNamespaceInIdents(_opCtx, nss, presentTs, false);
- assertNamespaceInIdents(_opCtx, nss, futureTs, true);
- assertNamespaceInIdents(_opCtx, nss, nullTs, true);
+ assertNamespaceInIdents(nss, pastTs, false);
+ assertNamespaceInIdents(nss, presentTs, false);
+ assertNamespaceInIdents(nss, futureTs, true);
+ assertNamespaceInIdents(nss, nullTs, true);
}
};
@@ -1771,14 +1787,18 @@ public:
{
WriteUnitOfWork wuow(_opCtx);
- // Primaries will perform no timestamping in the indexer's commit. Secondaries will
- // look at the logical clock.
+ // All callers of `MultiIndexBlock::commit` are responsible for timestamping index
+ // completion. Primaries write an oplog entry. Secondaries explicitly set a
+ // timestamp.
indexer.commit();
if (SimulatePrimary) {
// The op observer is not called from the index builder, but rather the
// `createIndexes` command.
_opCtx->getServiceContext()->getOpObserver()->onCreateIndex(
_opCtx, nss, autoColl.getCollection()->uuid(), indexInfoObj, false);
+ } else {
+ ASSERT_OK(
+ _opCtx->recoveryUnit()->setTimestamp(_clock->getClusterTime().asTimestamp()));
}
wuow.commit();
}
@@ -1791,23 +1811,19 @@ public:
// Assert that the index entry exists after init and `ready: false`.
assertIdentsExistAtTimestamp(kvCatalog, "", indexIdent, afterIndexInit.asTimestamp());
{
- _opCtx->recoveryUnit()->abandonSnapshot();
- ASSERT_OK(
- _opCtx->recoveryUnit()->setPointInTimeReadTimestamp(afterIndexInit.asTimestamp()));
- auto collMetaData = kvCatalog->getMetaData(_opCtx, nss.ns());
- auto indexMetaData = collMetaData.indexes[collMetaData.findIndexOffset("a_1")];
- ASSERT_FALSE(indexMetaData.ready);
+ ASSERT_FALSE(getIndexMetaData(
+ getMetaDataAtTime(kvCatalog, nss, afterIndexInit.asTimestamp()), "a_1")
+ .ready);
}
// After the build completes, assert that the index is `ready: true` and multikey.
assertIdentsExistAtTimestamp(kvCatalog, "", indexIdent, afterIndexBuild);
{
- _opCtx->recoveryUnit()->abandonSnapshot();
- ASSERT_OK(_opCtx->recoveryUnit()->setPointInTimeReadTimestamp(afterIndexBuild));
- auto collMetaData = kvCatalog->getMetaData(_opCtx, nss.ns());
- auto indexMetaData = collMetaData.indexes[collMetaData.findIndexOffset("a_1")];
+ auto indexMetaData =
+ getIndexMetaData(getMetaDataAtTime(kvCatalog, nss, afterIndexBuild), "a_1");
ASSERT(indexMetaData.ready);
ASSERT(indexMetaData.multikey);
+
ASSERT_EQ(std::size_t(1), indexMetaData.multikeyPaths.size());
const bool match = indexMetaData.multikeyPaths[0] == std::set<std::size_t>({0});
if (!match) {
@@ -1917,6 +1933,114 @@ public:
}
};
+/**
+ * There are a few scenarios where a primary will be using the IndexBuilder thread to build
+ * indexes. Specifically, a primary builds an index from an oplog entry during
+ * primary catch-up, drain, a secondary step-up, or `applyOps`.
+ *
+ * This test will exercise IndexBuilder code on primaries by performing a background index build
+ * via an `applyOps` command.
+ */
+template <bool Foreground>
+class TimestampIndexBuilderOnPrimary : public StorageTimestampTest {
+public:
+ void run() {
+ // Only run on 'wiredTiger'. No other storage engines to-date support timestamp writes.
+ if (mongo::storageGlobalParams.engine != "wiredTiger") {
+ return;
+ }
+
+ // In order for applyOps to assign timestamps, we must be in non-replicated mode.
+ repl::UnreplicatedWritesBlock uwb(_opCtx);
+
+ std::string dbName = "unittest";
+ NamespaceString nss(dbName, "indexBuilderOnPrimary");
+ BSONObj doc = BSON("_id" << 1 << "field" << 1);
+
+ const LogicalTime setupStart = _clock->reserveTicks(1);
+
+ UUID collUUID = UUID::gen();
+ {
+ // Create the collection and insert a document.
+ reset(nss);
+ AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_X, LockMode::MODE_IX);
+ collUUID = *(autoColl.getCollection()->uuid());
+ WriteUnitOfWork wuow(_opCtx);
+ insertDocument(autoColl.getCollection(),
+ InsertStatement(doc, setupStart.asTimestamp(), 0LL));
+ wuow.commit();
+ }
+
+
+ {
+ // Sanity check everything exists.
+ AutoGetCollectionForReadCommand autoColl(_opCtx, nss);
+ auto coll = autoColl.getCollection();
+ ASSERT(coll);
+
+ const auto presentTs = _clock->getClusterTime().asTimestamp();
+ assertDocumentAtTimestamp(coll, presentTs, doc);
+ }
+
+ {
+ // Create a background index via `applyOps`. We will timestamp the beginning at
+ // `startBuildTs` and the end, due to manipulation of the logical clock, should be
+ // timestamped at `endBuildTs`.
+ const auto beforeBuildTime = _clock->reserveTicks(3);
+ const auto startBuildTs = beforeBuildTime.addTicks(1).asTimestamp();
+ const auto endBuildTs = beforeBuildTime.addTicks(3).asTimestamp();
+
+ // Grab the existing idents to identify the ident created by the index build.
+ auto kvStorageEngine =
+ dynamic_cast<KVStorageEngine*>(_opCtx->getServiceContext()->getStorageEngine());
+ KVCatalog* kvCatalog = kvStorageEngine->getCatalog();
+ std::vector<std::string> origIdents = kvCatalog->getAllIdents(_opCtx);
+
+ auto indexSpec = BSON("createIndexes" << nss.coll() << "ns" << nss.ns() << "v"
+ << static_cast<int>(kIndexVersion)
+ << "key"
+ << BSON("field" << 1)
+ << "name"
+ << "field_1"
+ << "background"
+ << (Foreground ? false : true));
+
+ auto createIndexOp =
+ BSON("ts" << startBuildTs << "t" << 1LL << "h" << 0xBEEFBEEFLL << "v" << 2 << "op"
+ << "c"
+ << "ns"
+ << nss.getCommandNS().ns()
+ << "ui"
+ << collUUID
+ << "o"
+ << indexSpec);
+
+ ASSERT_OK(doAtomicApplyOps(nss.db().toString(), {createIndexOp}));
+
+ const std::string indexIdent = getNewIndexIdent(kvCatalog, origIdents);
+ assertIdentsMissingAtTimestamp(
+ kvCatalog, "", indexIdent, beforeBuildTime.asTimestamp());
+ assertIdentsExistAtTimestamp(kvCatalog, "", indexIdent, startBuildTs);
+ if (Foreground) {
+ // In the Foreground case, the index build should start and finish at
+ // `startBuildTs`.
+ ASSERT_TRUE(
+ getIndexMetaData(getMetaDataAtTime(kvCatalog, nss, startBuildTs), "field_1")
+ .ready);
+ } else {
+ // In the Background case, the index build should not be "ready" at `startBuildTs`.
+ ASSERT_FALSE(
+ getIndexMetaData(getMetaDataAtTime(kvCatalog, nss, startBuildTs), "field_1")
+ .ready);
+ assertIdentsExistAtTimestamp(kvCatalog, "", indexIdent, endBuildTs);
+ ASSERT_TRUE(
+ getIndexMetaData(getMetaDataAtTime(kvCatalog, nss, endBuildTs), "field_1")
+ .ready);
+ }
+ }
+ }
+};
+
class AllStorageTimestampTests : public unittest::Suite {
public:
@@ -1945,6 +2069,9 @@ public:
// TimestampIndexBuilds<SimulatePrimary>
add<TimestampIndexBuilds<false>>();
add<TimestampIndexBuilds<true>>();
+ // TimestampIndexBuilderOnPrimary<Background>
+ add<TimestampIndexBuilderOnPrimary<false>>();
+ add<TimestampIndexBuilderOnPrimary<true>>();
add<SecondaryReadsDuringBatchApplicationAreAllowed>();
}
};