summaryrefslogtreecommitdiff
path: root/src/mongo/db
diff options
context:
space:
mode:
authorDaniel Gottlieb <daniel.gottlieb@mongodb.com>2022-03-07 15:47:49 -0500
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-03-09 22:29:05 +0000
commitbc8142405dec99dd24cb87b2f61570c600d6f0d4 (patch)
treeca55b37a1228b248a4412f6fd791a9bedb90a6f9 /src/mongo/db
parent44ad2b3406535b927e0f968a1a4a0a022a6dbcb1 (diff)
downloadmongo-bc8142405dec99dd24cb87b2f61570c600d6f0d4.tar.gz
SERVER-60754: Ensure primaries write multikey paths in timestamp order.
Diffstat (limited to 'src/mongo/db')
-rw-r--r--src/mongo/db/catalog/index_catalog_impl.cpp62
-rw-r--r--src/mongo/db/repl/storage_timestamp_test.cpp201
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp11
3 files changed, 157 insertions, 117 deletions
diff --git a/src/mongo/db/catalog/index_catalog_impl.cpp b/src/mongo/db/catalog/index_catalog_impl.cpp
index 4315c583234..115c664d0fc 100644
--- a/src/mongo/db/catalog/index_catalog_impl.cpp
+++ b/src/mongo/db/catalog/index_catalog_impl.cpp
@@ -59,6 +59,7 @@
#include "mongo/db/jsobj.h"
#include "mongo/db/keypattern.h"
#include "mongo/db/matcher/expression.h"
+#include "mongo/db/multi_key_path_tracker.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/ops/delete.h"
#include "mongo/db/query/collation/collation_spec.h"
@@ -1601,16 +1602,61 @@ Status IndexCatalogImpl::indexRecords(OperationContext* opCtx,
*keysInsertedOut = 0;
}
- for (auto&& it : _readyIndexes) {
- Status s = _indexRecords(opCtx, coll, it.get(), bsonRecords, keysInsertedOut);
- if (!s.isOK())
- return s;
+ // For vectored inserts, we insert index keys and flip multikey in "index order". However
+ // because multikey state for different indexes both live on the same _mdb_catalog document,
+ // index order isn't necessarily timestamp order. We track multikey paths here to ensure we make
+ // changes to the _mdb_catalog document with in timestamp order updates.
+ MultikeyPathTracker& tracker = MultikeyPathTracker::get(opCtx);
+
+ // Take care when choosing to aggregate multikey writes. This code will only* track multikey
+ // when:
+ // * No parent is tracking multikey and*
+ // * There are timestamps associated with the input `bsonRecords`.
+ //
+ // If we are not responsible for tracking multikey:
+ // * Leave the multikey tracker in its original "tracking" state.
+ // * Not write any accumulated multikey paths to the _mdb_catalog document.
+ const bool manageMultikeyWrite =
+ !tracker.isTrackingMultikeyPathInfo() && !bsonRecords[0].ts.isNull();
+ {
+ ScopeGuard stopTrackingMultikeyChanges(
+ [&tracker] { tracker.stopTrackingMultikeyPathInfo(); });
+ if (manageMultikeyWrite) {
+ tracker.startTrackingMultikeyPathInfo();
+ } else {
+ stopTrackingMultikeyChanges.dismiss();
+ }
+ for (auto&& it : _readyIndexes) {
+ Status s = _indexRecords(opCtx, coll, it.get(), bsonRecords, keysInsertedOut);
+ if (!s.isOK())
+ return s;
+ }
+
+ for (auto&& it : _buildingIndexes) {
+ Status s = _indexRecords(opCtx, coll, it.get(), bsonRecords, keysInsertedOut);
+ if (!s.isOK())
+ return s;
+ }
}
- for (auto&& it : _buildingIndexes) {
- Status s = _indexRecords(opCtx, coll, it.get(), bsonRecords, keysInsertedOut);
- if (!s.isOK())
- return s;
+ const WorkerMultikeyPathInfo& newPaths = tracker.getMultikeyPathInfo();
+ if (newPaths.size() == 0 || !manageMultikeyWrite) {
+ return Status::OK();
+ }
+
+ if (Status status = opCtx->recoveryUnit()->setTimestamp(bsonRecords[0].ts); !status.isOK()) {
+ return status;
+ }
+
+ for (const MultikeyPathInfo& newPath : newPaths) {
+ auto idx = findIndexByName(opCtx, newPath.indexName, /*includeUnfinishedIndexes=*/true);
+ if (!idx) {
+ return Status(ErrorCodes::IndexNotFound,
+ str::stream()
+ << "Could not find index " << newPath.indexName << " in "
+ << coll->ns() << " (" << coll->uuid() << ") to set to multikey.");
+ }
+ setMultikeyPaths(opCtx, coll, idx, newPath.multikeyMetadataKeys, newPath.multikeyPaths);
}
return Status::OK();
diff --git a/src/mongo/db/repl/storage_timestamp_test.cpp b/src/mongo/db/repl/storage_timestamp_test.cpp
index f5f0abafa5a..b128b2bad20 100644
--- a/src/mongo/db/repl/storage_timestamp_test.cpp
+++ b/src/mongo/db/repl/storage_timestamp_test.cpp
@@ -102,7 +102,10 @@
namespace mongo {
namespace {
-Status createIndexFromSpec(OperationContext* opCtx, StringData ns, const BSONObj& spec) {
+Status createIndexFromSpec(OperationContext* opCtx,
+ VectorClockMutable* clock,
+ StringData ns,
+ const BSONObj& spec) {
AutoGetDb autoDb(opCtx, nsToDatabaseSubstring(ns), MODE_X);
Collection* coll;
{
@@ -125,9 +128,10 @@ Status createIndexFromSpec(OperationContext* opCtx, StringData ns, const BSONObj
.init(opCtx,
collection,
spec,
- [opCtx](const std::vector<BSONObj>& specs) -> Status {
+ [opCtx, clock](const std::vector<BSONObj>& specs) -> Status {
if (opCtx->recoveryUnit()->getCommitTimestamp().isNull()) {
- return opCtx->recoveryUnit()->setTimestamp(Timestamp(1, 1));
+ return opCtx->recoveryUnit()->setTimestamp(
+ clock->tickClusterTime(1).asTimestamp());
}
return Status::OK();
})
@@ -153,7 +157,8 @@ Status createIndexFromSpec(OperationContext* opCtx, StringData ns, const BSONObj
WriteUnitOfWork wunit(opCtx);
ASSERT_OK(indexer.commit(
opCtx, coll, MultiIndexBlock::kNoopOnCreateEachFn, MultiIndexBlock::kNoopOnCommitFn));
- ASSERT_OK(opCtx->recoveryUnit()->setTimestamp(Timestamp(1, 1)));
+ LogicalTime indexTs = clock->tickClusterTime(1);
+ ASSERT_OK(opCtx->recoveryUnit()->setTimestamp(indexTs.asTimestamp()));
wunit.commit();
abortOnExit.dismiss();
return Status::OK();
@@ -473,6 +478,13 @@ public:
return queryCollection(NamespaceString::kRsOplogNamespace, query);
}
+ Timestamp getTopOfOplog() {
+ OneOffRead oor(_opCtx, Timestamp::min());
+ BSONObj ret;
+ ASSERT_TRUE(Helpers::getLast(_opCtx, NamespaceString::kRsOplogNamespace.ns().c_str(), ret));
+ return ret["ts"].timestamp();
+ }
+
void assertMinValidDocumentAtTimestamp(const NamespaceString& nss,
const Timestamp& ts,
const repl::MinValidDocument& expectedDoc) {
@@ -1348,7 +1360,7 @@ TEST_F(StorageTimestampTest, SecondarySetIndexMultikeyOnInsert) {
auto indexName = "a_1";
auto indexSpec = BSON("name" << indexName << "key" << BSON("a" << 1) << "v"
<< static_cast<int>(kIndexVersion));
- ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec));
+ ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec));
_coordinatorMock->alwaysAllowWrites(false);
@@ -1420,7 +1432,7 @@ TEST_F(StorageTimestampTest, SecondarySetWildcardIndexMultikeyOnInsert) {
auto indexName = "a_1";
auto indexSpec = BSON("name" << indexName << "key" << BSON("$**" << 1) << "v"
<< static_cast<int>(kIndexVersion));
- ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec));
+ ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec));
_coordinatorMock->alwaysAllowWrites(false);
@@ -1516,7 +1528,7 @@ TEST_F(StorageTimestampTest, SecondarySetWildcardIndexMultikeyOnUpdate) {
auto indexName = "a_1";
auto indexSpec = BSON("name" << indexName << "key" << BSON("$**" << 1) << "v"
<< static_cast<int>(kIndexVersion));
- ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec));
+ ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec));
_coordinatorMock->alwaysAllowWrites(false);
@@ -1600,99 +1612,6 @@ TEST_F(StorageTimestampTest, SecondarySetWildcardIndexMultikeyOnUpdate) {
}
}
-TEST_F(StorageTimestampTest, InitialSyncSetIndexMultikeyOnInsert) {
- // Pretend to be a secondary.
- repl::UnreplicatedWritesBlock uwb(_opCtx);
-
- NamespaceString nss("unittests.InitialSyncSetIndexMultikeyOnInsert");
- create(nss);
- UUID uuid = UUID::gen();
- {
- AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX);
- uuid = autoColl.getCollection()->uuid();
- }
- auto indexName = "a_1";
- auto indexSpec = BSON("name" << indexName << "key" << BSON("a" << 1) << "v"
- << static_cast<int>(kIndexVersion));
- ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec));
-
- _coordinatorMock->alwaysAllowWrites(false);
- ASSERT_OK(_coordinatorMock->setFollowerMode({repl::MemberState::MS::RS_STARTUP2}));
-
- const LogicalTime pastTime = _clock->tickClusterTime(1);
- const LogicalTime insertTime0 = _clock->tickClusterTime(1);
- const LogicalTime indexBuildTime = _clock->tickClusterTime(1);
- const LogicalTime insertTime1 = _clock->tickClusterTime(1);
- const LogicalTime insertTime2 = _clock->tickClusterTime(1);
-
- BSONObj doc0 = BSON("_id" << 0 << "a" << 3);
- BSONObj doc1 = BSON("_id" << 1 << "a" << BSON_ARRAY(1 << 2));
- BSONObj doc2 = BSON("_id" << 2 << "a" << BSON_ARRAY(1 << 2));
- auto op0 = repl::OplogEntry(
- BSON("ts" << insertTime0.asTimestamp() << "t" << 1LL << "v" << 2 << "op"
- << "i"
- << "ns" << nss.ns() << "ui" << uuid << "wall" << Date_t() << "o" << doc0));
- auto op1 = repl::OplogEntry(
- BSON("ts" << insertTime1.asTimestamp() << "t" << 1LL << "v" << 2 << "op"
- << "i"
- << "ns" << nss.ns() << "ui" << uuid << "wall" << Date_t() << "o" << doc1));
- auto op2 = repl::OplogEntry(
- BSON("ts" << insertTime2.asTimestamp() << "t" << 1LL << "v" << 2 << "op"
- << "i"
- << "ns" << nss.ns() << "ui" << uuid << "wall" << Date_t() << "o" << doc2));
- auto indexSpec2 = BSON("createIndexes" << nss.coll() << "v" << static_cast<int>(kIndexVersion)
- << "key" << BSON("b" << 1) << "name"
- << "b_1");
- auto createIndexOp =
- repl::OplogEntry(BSON("ts" << indexBuildTime.asTimestamp() << "t" << 1LL << "v" << 2 << "op"
- << "c"
- << "ns" << nss.getCommandNS().ns() << "ui" << uuid << "wall"
- << Date_t() << "o" << indexSpec2));
-
- // We add in an index creation op to test that we restart tracking multikey path info
- // after bulk index builds.
- std::vector<repl::OplogEntry> ops = {op0, createIndexOp, op1, op2};
-
- DoNothingOplogApplierObserver observer;
- auto storageInterface = repl::StorageInterface::get(_opCtx);
- auto writerPool = repl::makeReplWriterPool();
-
- repl::OplogApplierImpl oplogApplier(
- nullptr, // task executor. not required for applyOplogBatch().
- nullptr, // oplog buffer. not required for applyOplogBatch().
- &observer,
- _coordinatorMock,
- _consistencyMarkers,
- storageInterface,
- repl::OplogApplier::Options(repl::OplogApplication::Mode::kInitialSync),
- writerPool.get());
- auto lastTime = unittest::assertGet(oplogApplier.applyOplogBatch(_opCtx, ops));
- ASSERT_EQ(lastTime.getTimestamp(), insertTime2.asTimestamp());
-
- // Wait for the index build to finish before making any assertions.
- IndexBuildsCoordinator::get(_opCtx)->awaitNoIndexBuildInProgressForCollection(_opCtx, uuid);
-
- AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX);
-
- // Ensure minimumVisible has not been updated due to the index creation.
- ASSERT_LT(autoColl.getCollection()->getMinimumVisibleSnapshot().get(), pastTime.asTimestamp());
-
- // Reading the multikey state before 'insertTime0' is not valid or reliable to test. If the
- // background index build intercepts and drains writes during inital sync, the index write
- // and the write to the multikey path state will not be timestamped. This write is not
- // timestamped because the lastApplied timestamp, which would normally be used on a primary
- // or secondary, is not always available during initial sync.
- // Additionally, it is not valid to read at a timestamp before inital sync completes, so
- // these assertions below only make sense in the context of this unit test, but would
- // otherwise not be exercised in any normal scenario.
- assertMultikeyPaths(
- _opCtx, autoColl.getCollection(), indexName, insertTime0.asTimestamp(), true, {{0}});
- assertMultikeyPaths(
- _opCtx, autoColl.getCollection(), indexName, insertTime1.asTimestamp(), true, {{0}});
- assertMultikeyPaths(
- _opCtx, autoColl.getCollection(), indexName, insertTime2.asTimestamp(), true, {{0}});
-}
-
TEST_F(StorageTimestampTest, PrimarySetIndexMultikeyOnInsert) {
NamespaceString nss("unittests.PrimarySetIndexMultikeyOnInsert");
create(nss);
@@ -1701,7 +1620,7 @@ TEST_F(StorageTimestampTest, PrimarySetIndexMultikeyOnInsert) {
auto indexName = "a_1";
auto indexSpec = BSON("name" << indexName << "key" << BSON("a" << 1) << "v"
<< static_cast<int>(kIndexVersion));
- ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec));
+ ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec));
const LogicalTime pastTime = _clock->tickClusterTime(1);
const LogicalTime insertTime = pastTime.addTicks(1);
@@ -1726,7 +1645,7 @@ TEST_F(StorageTimestampTest, PrimarySetIndexMultikeyOnInsertUnreplicated) {
auto indexName = "a_1";
auto indexSpec = BSON("name" << indexName << "key" << BSON("a" << 1) << "v"
<< static_cast<int>(kIndexVersion));
- ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec));
+ ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec));
const LogicalTime pastTime = _clock->tickClusterTime(1);
const LogicalTime insertTime = pastTime.addTicks(1);
@@ -1758,7 +1677,7 @@ TEST_F(StorageTimestampTest, PrimarySetsMultikeyInsideMultiDocumentTransaction)
{
AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX);
- ASSERT_OK(createIndexFromSpec(_opCtx, nss.ns(), indexSpec));
+ ASSERT_OK(createIndexFromSpec(_opCtx, _clock, nss.ns(), indexSpec));
}
const auto currentTime = _clock->getTime();
@@ -3185,6 +3104,82 @@ TEST_F(StorageTimestampTest, CreateCollectionWithSystemIndex) {
assertIdentsExistAtTimestamp(durableCatalog, "", indexIdent, _nullTs);
}
+TEST_F(StorageTimestampTest, MultipleTimestampsForMultikeyWrites) {
+ auto storageEngine = _opCtx->getServiceContext()->getStorageEngine();
+ auto durableCatalog = storageEngine->getCatalog();
+ RecordId catalogId;
+
+ // Create config.system.indexBuilds collection to store commit quorum value during index
+ // building.
+ ASSERT_OK(repl::StorageInterface::get(_opCtx)->dropCollection(
+ _opCtx, NamespaceString::kIndexBuildEntryNamespace));
+ ASSERT_OK(
+ createCollection(_opCtx,
+ NamespaceString::kIndexBuildEntryNamespace.db().toString(),
+ BSON("create" << NamespaceString::kIndexBuildEntryNamespace.coll())));
+
+ NamespaceString nss("unittests.timestampVectoredInsertMultikey");
+ create(nss);
+
+ {
+ AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX);
+
+ WriteUnitOfWork wuow(_opCtx);
+ insertDocument(
+ autoColl.getCollection(),
+ InsertStatement(BSON("_id" << 0 << "a" << 1 << "b" << 2), Timestamp(), _presentTerm));
+ wuow.commit();
+ ASSERT_EQ(1, itCount(autoColl.getCollection()));
+ catalogId = autoColl.getCollection()->getCatalogId();
+ }
+
+ DBDirectClient client(_opCtx);
+ {
+ auto index1 = BSON("v" << kIndexVersion << "key" << BSON("a" << 1) << "name"
+ << "a_1");
+ auto index2 = BSON("v" << kIndexVersion << "key" << BSON("b" << 1) << "name"
+ << "b_1");
+ // Disable index build commit quorum as we don't have support of replication subsystem for
+ // voting.
+ auto createIndexesCmdObj =
+ BSON("createIndexes" << nss.coll() << "indexes" << BSON_ARRAY(index1 << index2)
+ << "commitQuorum" << 0);
+ BSONObj result;
+ ASSERT(client.runCommand(nss.db().toString(), createIndexesCmdObj, result)) << result;
+ }
+
+ AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_IX);
+ Timestamp tsBeforeMultikeyWrites = getTopOfOplog();
+ {
+ const LogicalTime firstInsertTime = _clock->tickClusterTime(2);
+
+ std::vector<InsertStatement> vectoredInsert;
+ vectoredInsert.emplace_back(BSON("_id" << 1 << "b" << BSON_ARRAY(1 << 2)),
+ firstInsertTime.asTimestamp(),
+ _presentTerm);
+ vectoredInsert.emplace_back(BSON("_id" << 2 << "a" << BSON_ARRAY(1 << 2)),
+ firstInsertTime.addTicks(1).asTimestamp(),
+ _presentTerm);
+
+ WriteUnitOfWork wuow(_opCtx);
+ ASSERT_OK(autoColl.getCollection()->insertDocuments(
+ _opCtx, vectoredInsert.begin(), vectoredInsert.end(), nullptr, false));
+ wuow.commit();
+ }
+
+ // By virtue of not crashing due to WT observing an out of order write, this test has
+ // succeeded. For completeness, check the index metadata.
+ std::shared_ptr<BSONCollectionCatalogEntry::MetaData> mdBeforeInserts =
+ getMetaDataAtTime(durableCatalog, catalogId, tsBeforeMultikeyWrites);
+ ASSERT_FALSE(getIndexMetaData(mdBeforeInserts, "a_1").multikey);
+ ASSERT_FALSE(getIndexMetaData(mdBeforeInserts, "b_1").multikey);
+
+ std::shared_ptr<BSONCollectionCatalogEntry::MetaData> mdAfterInserts =
+ getMetaDataAtTime(durableCatalog, catalogId, getTopOfOplog());
+ ASSERT(getIndexMetaData(mdAfterInserts, "a_1").multikey);
+ ASSERT(getIndexMetaData(mdAfterInserts, "b_1").multikey);
+}
+
class RetryableFindAndModifyTest : public StorageTimestampTest {
public:
const StringData dbName = "unittest"_sd;
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
index 0f4806c32a4..8cb5ce919ad 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
@@ -791,18 +791,17 @@ StatusWith<std::string> WiredTigerRecordStore::generateCreateString(
ss << "split_pct=90,";
ss << "leaf_value_max=64MB,";
if (TestingProctor::instance().isEnabled()) {
- if (NamespaceString(ns).isOplog() ||
- // TODO (SERVER-60754): Remove special handling for setting multikey.
- ident.startsWith("_mdb_catalog")) {
-
+ if (NamespaceString(ns).isOplog()) {
// For the above clauses we do not assert any particular `write_timestamp_usage`. In
// particular for the oplog, WT removes all timestamp information. There's nothing in
// MDB's control to assert against.
} else if (
// Side table drains are not timestamped.
ident.startsWith("internal-") ||
- // TODO (SERVER-60753): Remove special handling for index build during recovery.
- ns == NamespaceString::kIndexBuildEntryNamespace.ns()) {
+ // TODO (SERVER-60753): Remove special handling for index build during recovery. This
+ // includes the following _mdb_catalog ident.
+ ns == NamespaceString::kIndexBuildEntryNamespace.ns() ||
+ ident.startsWith("_mdb_catalog")) {
ss << "write_timestamp_usage=mixed_mode,";
} else {
ss << "write_timestamp_usage=ordered,";