From 1417eee440b4132e24d1388011d681e2c9fcec41 Mon Sep 17 00:00:00 2001 From: Judah Schvimer Date: Sun, 10 May 2020 17:22:26 -0400 Subject: SERVER-48010 Substitute ghost timestamp with no-op write in multi-statement txn multikey sidetxn write --- src/mongo/db/catalog/index_catalog_entry_impl.cpp | 60 +++++++++++++---------- src/mongo/db/op_observer_impl.cpp | 10 ++-- src/mongo/dbtests/storage_timestamp_tests.cpp | 22 ++++++--- 3 files changed, 54 insertions(+), 38 deletions(-) diff --git a/src/mongo/db/catalog/index_catalog_entry_impl.cpp b/src/mongo/db/catalog/index_catalog_entry_impl.cpp index f62335fd8b7..5d7454a4b27 100644 --- a/src/mongo/db/catalog/index_catalog_entry_impl.cpp +++ b/src/mongo/db/catalog/index_catalog_entry_impl.cpp @@ -46,6 +46,7 @@ #include "mongo/db/matcher/expression.h" #include "mongo/db/matcher/expression_parser.h" #include "mongo/db/multi_key_path_tracker.h" +#include "mongo/db/op_observer.h" #include "mongo/db/operation_context.h" #include "mongo/db/pipeline/expression_context.h" #include "mongo/db/query/collation/collator_factory_interface.h" @@ -271,33 +272,42 @@ Status IndexCatalogEntryImpl::_setMultikeyInMultiDocumentTransaction( return {ErrorCodes::SnapshotUnavailable, "index not visible in side transaction"}; } - writeConflictRetry(opCtx, "set index multikey", ns().ns(), [&] { - WriteUnitOfWork wuow(opCtx); - - // If we have a prepare optime for recovery, then we always use that. During recovery of - // prepared transactions, the logical clock may not yet be initialized, so we use the - // prepare timestamp of the transaction for this write. This is safe since the prepare - // timestamp is always <= the commit timestamp of a transaction, which satisfies the - // correctness requirement for multikey writes i.e. they must occur at or before the - // first write that set the multikey flag. - auto recoveryPrepareOpTime = txnParticipant.getPrepareOpTimeForRecovery(); - Timestamp writeTs = recoveryPrepareOpTime.isNull() - ? LogicalClock::get(opCtx)->getClusterTime().asTimestamp() - : recoveryPrepareOpTime.getTimestamp(); - - auto status = opCtx->recoveryUnit()->setTimestamp(writeTs); - if (status.code() == ErrorCodes::BadValue) { - LOGV2(20352, - "Temporarily could not timestamp the multikey catalog write, retrying.", - "reason"_attr = status.reason()); - throw WriteConflictException(); - } - fassert(31164, status); + writeConflictRetry( + opCtx, "set index multikey", ns().ns(), [&] { + WriteUnitOfWork wuow(opCtx); + + // If we have a prepare optime for recovery, then we always use that. This is safe since + // the prepare timestamp is always <= the commit timestamp of a transaction, which + // satisfies the correctness requirement for multikey writes i.e. they must occur at or + // before the first write that set the multikey flag. This only occurs when + // reconstructing prepared transactions, and not during replication recovery oplog + // application. + auto recoveryPrepareOpTime = txnParticipant.getPrepareOpTimeForRecovery(); + if (!recoveryPrepareOpTime.isNull()) { + auto status = + opCtx->recoveryUnit()->setTimestamp(recoveryPrepareOpTime.getTimestamp()); + if (status.code() == ErrorCodes::BadValue) { + LOGV2(20352, + "Temporarily could not timestamp the multikey catalog write, retrying.", + "reason"_attr = status.reason()); + throw WriteConflictException(); + } + fassert(31164, status); + } else { + // If there is no recovery prepare OpTime, then this node must be a primary. We + // write a noop oplog entry to get a properly ordered timestamp. + invariant(opCtx->writesAreReplicated()); + + auto msg = BSON("msg" + << "Setting index to multikey" + << "coll" << ns().ns() << "index" << _descriptor->indexName()); + opCtx->getClient()->getServiceContext()->getOpObserver()->onOpMessage(opCtx, msg); + } - _catalogSetMultikey(opCtx, multikeyPaths); + _catalogSetMultikey(opCtx, multikeyPaths); - wuow.commit(); - }); + wuow.commit(); + }); return Status::OK(); } diff --git a/src/mongo/db/op_observer_impl.cpp b/src/mongo/db/op_observer_impl.cpp index acfad92cd8c..8735e82e287 100644 --- a/src/mongo/db/op_observer_impl.cpp +++ b/src/mongo/db/op_observer_impl.cpp @@ -347,12 +347,10 @@ void OpObserverImpl::onStartIndexBuildSinglePhase(OperationContext* opCtx, // 4. All other cases, we generate a timestamp by writing a no-op oplog entry. This is // better than using a ghost timestamp. Writing an oplog entry ensures this node is // primary. - onInternalOpMessage( - opCtx, - {}, - boost::none, - BSON("msg" << std::string(str::stream() << "Creating indexes. Coll: " << nss)), - boost::none); + auto msg = BSON("msg" + << "Creating indexes" + << "coll" << nss.ns()); + onOpMessage(opCtx, msg); } void OpObserverImpl::onCommitIndexBuild(OperationContext* opCtx, diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp index f79de6a7570..d148f48d416 100644 --- a/src/mongo/dbtests/storage_timestamp_tests.cpp +++ b/src/mongo/dbtests/storage_timestamp_tests.cpp @@ -686,7 +686,8 @@ public: const bool match = (expectedMultikeyPaths == actualMultikeyPaths); if (!match) { - FAIL(str::stream() << "Expected: " << dumpMultikeyPaths(expectedMultikeyPaths) + FAIL(str::stream() << "TS: " << ts.toString() + << ", Expected: " << dumpMultikeyPaths(expectedMultikeyPaths) << ", Actual: " << dumpMultikeyPaths(actualMultikeyPaths)); } ASSERT_TRUE(match); @@ -1528,11 +1529,14 @@ public: const auto beforeTxnTime = _clock->reserveTicks(1); auto beforeTxnTs = beforeTxnTime.asTimestamp(); - auto commitEntryTs = beforeTxnTime.addTicks(1).asTimestamp(); + const auto multikeyNoopTime = beforeTxnTime.addTicks(1); + auto multikeyNoopTs = multikeyNoopTime.asTimestamp(); + auto commitEntryTs = multikeyNoopTime.addTicks(1).asTimestamp(); - LOGV2(22502, "Present TS: {presentTs}", "presentTs"_attr = presentTs); - LOGV2(22503, "Before transaction TS: {beforeTxnTs}", "beforeTxnTs"_attr = beforeTxnTs); - LOGV2(22504, "Commit entry TS: {commitEntryTs}", "commitEntryTs"_attr = commitEntryTs); + LOGV2(22502, "Present time", "timestamp"_attr = presentTs); + LOGV2(22503, "Before transaction time", "timestamp"_attr = beforeTxnTs); + LOGV2(4801000, "Multikey noop time", "timestamp"_attr = multikeyNoopTs); + LOGV2(22504, "Commit entry time", "timestamp"_attr = commitEntryTs); const auto sessionId = makeLogicalSessionIdForTest(); _opCtx->setLogicalSessionId(sessionId); @@ -1561,8 +1565,9 @@ public: auto coll = autoColl.getCollection(); // Make sure the transaction committed and its writes were timestamped correctly. - assertDocumentAtTimestamp(coll, beforeTxnTs, BSONObj()); assertDocumentAtTimestamp(coll, presentTs, BSONObj()); + assertDocumentAtTimestamp(coll, beforeTxnTs, BSONObj()); + assertDocumentAtTimestamp(coll, multikeyNoopTs, BSONObj()); assertDocumentAtTimestamp(coll, commitEntryTs, doc); assertDocumentAtTimestamp(coll, nullTs, doc); @@ -1575,9 +1580,12 @@ public: // commit time. assertMultikeyPaths(_opCtx, coll, indexName, presentTs, false /* shouldBeMultikey */, {{}}); assertMultikeyPaths( - _opCtx, coll, indexName, beforeTxnTs, true /* shouldBeMultikey */, {{0}}); + _opCtx, coll, indexName, beforeTxnTs, false /* shouldBeMultikey */, {{}}); + assertMultikeyPaths( + _opCtx, coll, indexName, multikeyNoopTs, true /* shouldBeMultikey */, {{0}}); assertMultikeyPaths( _opCtx, coll, indexName, commitEntryTs, true /* shouldBeMultikey */, {{0}}); + assertMultikeyPaths(_opCtx, coll, indexName, nullTs, true /* shouldBeMultikey */, {{0}}); } }; -- cgit v1.2.1