summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJordi Olivares Provencio <jordi.olivares-provencio@mongodb.com>2022-04-01 09:31:03 +0000
committerEvergreen Agent <no-reply@evergreen.mongodb.com>2022-04-01 09:58:34 +0000
commitc2de464c661ec8fc3198373932d0e3aab7fde971 (patch)
tree4492a72c1f90fba707e3bf9ecbe730cf84557a5a /src
parent5f1bc9e0463e261fbbf0a0c7ad69c7522f07d750 (diff)
downloadmongo-c2de464c661ec8fc3198373932d0e3aab7fde971.tar.gz
SERVER-64125 Avoid committing at the stable timestamp
Diffstat (limited to 'src')
-rw-r--r--src/mongo/db/catalog/index_catalog_entry_impl.cpp16
-rw-r--r--src/mongo/db/repl/initial_syncer.cpp16
-rw-r--r--src/mongo/db/storage/kv/kv_engine_test_harness.cpp69
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp23
-rw-r--r--src/mongo/dbtests/querytests.cpp40
5 files changed, 68 insertions, 96 deletions
diff --git a/src/mongo/db/catalog/index_catalog_entry_impl.cpp b/src/mongo/db/catalog/index_catalog_entry_impl.cpp
index d7ab018fabc..5a11aec117f 100644
--- a/src/mongo/db/catalog/index_catalog_entry_impl.cpp
+++ b/src/mongo/db/catalog/index_catalog_entry_impl.cpp
@@ -317,12 +317,24 @@ Status IndexCatalogEntryImpl::_setMultikeyInMultiDocumentTransaction(
// may not have a stable timestamp. Therefore, we need to round up
// the multi-key write timestamp to the max of the three so that we don't write
// behind the oldest/stable timestamp. This code path is only hit during initial
- // sync/recovery when reconstructing prepared transactions and so we don't expect
+ // sync/recovery when reconstructing prepared transactions, and so we don't expect
// the oldest/stable timestamp to advance concurrently.
+ //
+ // WiredTiger disallows committing at the stable timestamp to avoid confusion during
+ // checkpoints; to overcome that, we allow setting the timestamp slightly after the
+ // prepared timestamp of the original transaction. This is currently not an issue as
+ // the index metadata state is read from in-memory cache and this is modifying the
+ // state on-disk from the _mdb_catalog document. To put it another way, multikey
+ // doesn't go backwards. This would be a problem if we move to a versioned catalog
+ // world as a different transaction could choose an earlier timestamp (i.e. the
+ // original transaction timestamp) and encounter an invalid system state where the
+ // document that enables multikey hasn't enabled it yet but is present in the
+ // collection. In other words, the index is not set for multikey but there is
+ // already data present that relies on it.
auto status = opCtx->recoveryUnit()->setTimestamp(std::max(
{recoveryPrepareOpTime.getTimestamp(),
opCtx->getServiceContext()->getStorageEngine()->getOldestTimestamp(),
- opCtx->getServiceContext()->getStorageEngine()->getStableTimestamp()}));
+ opCtx->getServiceContext()->getStorageEngine()->getStableTimestamp() + 1}));
fassert(31164, status);
} else {
// If there is no recovery prepare OpTime, then this node must be a primary. We
diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp
index 127f382f0c5..53b5b8d00c2 100644
--- a/src/mongo/db/repl/initial_syncer.cpp
+++ b/src/mongo/db/repl/initial_syncer.cpp
@@ -1468,11 +1468,17 @@ void InitialSyncer::_lastOplogEntryFetcherCallbackForStopTimestamp(
// override its behavior in tests. See InitialSyncerReturnsCallbackCanceledAndDoesNot-
// ScheduleRollbackCheckerIfShutdownAfterInsertingInsertOplogSeedDocument in
// initial_syncer_test.cpp
- auto status = _storage->insertDocument(
- opCtx.get(),
- NamespaceString::kRsOplogNamespace,
- TimestampedBSONObj{oplogSeedDoc, resultOpTimeAndWallTime.opTime.getTimestamp()},
- resultOpTimeAndWallTime.opTime.getTerm());
+ //
+ // Note that the initial seed oplog insertion is not timestamped, this is safe to do as the
+ // logic for navigating the oplog is reliant on the timestamp value of the oplog document
+ // itself. Additionally, this also prevents confusion in the storage engine as the last
+ // insertion can be produced at precisely the stable timestamp, which could lead to invalid
+ // data consistency due to the stable timestamp signalling that no operations before or at
+ // that point will be rolled back. So transactions shouldn't happen at precisely that point.
+ auto status = _storage->insertDocument(opCtx.get(),
+ NamespaceString::kRsOplogNamespace,
+ TimestampedBSONObj{oplogSeedDoc},
+ resultOpTimeAndWallTime.opTime.getTerm());
if (!status.isOK()) {
stdx::lock_guard<Latch> lock(_mutex);
onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status);
diff --git a/src/mongo/db/storage/kv/kv_engine_test_harness.cpp b/src/mongo/db/storage/kv/kv_engine_test_harness.cpp
index e774531904f..f50dea13d6a 100644
--- a/src/mongo/db/storage/kv/kv_engine_test_harness.cpp
+++ b/src/mongo/db/storage/kv/kv_engine_test_harness.cpp
@@ -1071,7 +1071,7 @@ TEST_F(KVEngineTestHarness, RollingBackToLastStable) {
*/
DEATH_TEST_REGEX_F(KVEngineTestHarness,
CommitBehindStable,
- ".*commit timestamp.*is less than the stable timestamp.*") {
+ ".*commit timestamp.*(is less than|must be after) the stable timestamp.*") {
std::unique_ptr<KVHarnessHelper> helper(KVHarnessHelper::create(getServiceContext()));
KVEngine* engine = helper->getEngine();
// TODO SERVER-48314: Remove after implementing correct behavior on biggie.
@@ -1102,7 +1102,7 @@ DEATH_TEST_REGEX_F(KVEngineTestHarness,
}
{
- // Committing a behind the stable timestamp is not allowed.
+ // Committing at or behind the stable timestamp is not allowed.
auto opCtx = _makeOperationContext(engine);
WriteUnitOfWork uow(opCtx.get());
auto swRid = rs->insertRecord(opCtx.get(), "abc", 4, Timestamp(1, 1));
@@ -1110,71 +1110,6 @@ DEATH_TEST_REGEX_F(KVEngineTestHarness,
}
}
-/*
- * Commit at stable
- * | Session | GlobalActor |
- * |---------------------------------+----------------------------|
- * | | GlobalTimestamp :stable 2 |
- * | Begin | |
- * | Write A 1 | |
- * | Timestamp :commit 2 | |
- */
-TEST_F(KVEngineTestHarness, CommitAtStable) {
- std::unique_ptr<KVHarnessHelper> helper(KVHarnessHelper::create(getServiceContext()));
- KVEngine* engine = helper->getEngine();
- // TODO SERVER-48314: Remove after implementing correct behavior on biggie.
- if (engine->isEphemeral())
- return;
-
- // The initial data timestamp has to be set to take stable checkpoints.
- engine->setInitialDataTimestamp(Timestamp(1, 1));
- std::string ns = "a.b";
- std::unique_ptr<RecordStore> rs;
- {
- auto opCtx = _makeOperationContext(engine);
- ASSERT_OK(engine->createRecordStore(opCtx.get(), ns, ns, CollectionOptions()));
- rs = engine->getRecordStore(opCtx.get(), ns, ns, CollectionOptions());
- ASSERT(rs);
- }
-
- {
- // Set the stable timestamp to (2, 2).
- ASSERT(!engine->getLastStableRecoveryTimestamp());
- engine->setStableTimestamp(Timestamp(2, 2), false);
- ASSERT(!engine->getLastStableRecoveryTimestamp());
-
- // Force a checkpoint to be taken. This should advance the last stable timestamp.
- auto opCtx = _makeOperationContext(engine);
- engine->flushAllFiles(opCtx.get(), false);
- ASSERT_EQ(engine->getLastStableRecoveryTimestamp(), Timestamp(2, 2));
- }
-
- RecordId rid;
- {
- // For a non-prepared transaction, the commit timestamp can be equal to the stable
- // timestamp.
- auto opCtx = _makeOperationContext(engine);
- WriteUnitOfWork uow(opCtx.get());
- auto swRid = rs->insertRecord(opCtx.get(), "abc", 4, Timestamp(2, 2));
- ASSERT_OK(swRid);
- rid = swRid.getValue();
- uow.commit();
- }
-
- {
- // Rollback to the last stable timestamp.
- auto opCtx = _makeOperationContext(engine);
- StatusWith<Timestamp> swTimestamp = engine->recoverToStableTimestamp(opCtx.get());
- ASSERT_EQ(swTimestamp.getValue(), Timestamp(2, 2));
-
- // Transaction with timestamps equal to lastStable will not be rolled back.
- opCtx->recoveryUnit()->abandonSnapshot();
- RecordData data;
- ASSERT_TRUE(rs->findRecord(opCtx.get(), rid, &data));
- ASSERT_EQUALS(1, rs->numRecords(opCtx.get()));
- }
-}
-
TEST_F(DurableCatalogImplTest, Coll1) {
KVEngine* engine = helper->getEngine();
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
index 4666a7b3a50..0ab08790fb7 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp
@@ -1369,18 +1369,19 @@ Status WiredTigerRecordStore::_insertRecords(OperationContext* opCtx,
auto& record = records[i];
invariant(!record.id.isNull());
invariant(!record_id_helpers::isReserved(record.id));
- Timestamp ts;
- if (timestamps[i].isNull() && _isOplog) {
- // If the timestamp is 0, that probably means someone inserted a document directly
- // into the oplog. In this case, use the RecordId as the timestamp, since they are
- // one and the same. Setting this transaction to be unordered will trigger a journal
- // flush. Because these are direct writes into the oplog, the machinery to trigger a
- // journal flush is bypassed. A followup oplog read will require a fresh visibility
- // value to make progress.
- ts = Timestamp(record.id.getLong());
+ Timestamp ts = timestamps[i];
+ if (_isOplog) {
+ // Setting this transaction to be unordered will trigger a journal flush. Because these
+ // are direct writes into the oplog, the machinery to trigger a journal flush is
+ // bypassed. A followup oplog read will require a fresh visibility value to make progress.
opCtx->recoveryUnit()->setOrderedCommit(false);
- } else {
- ts = timestamps[i];
+ auto oplogKeyTs = Timestamp(record.id.getLong());
+ if (!ts.isNull()) {
+ invariant(oplogKeyTs == ts);
+ }
+ if (!opCtx->recoveryUnit()->getCommitTimestamp().isNull()) {
+ invariant(oplogKeyTs == opCtx->recoveryUnit()->getCommitTimestamp());
+ }
}
if (!ts.isNull()) {
LOGV2_DEBUG(22403, 4, "inserting record with timestamp {ts}", "ts"_attr = ts);
diff --git a/src/mongo/dbtests/querytests.cpp b/src/mongo/dbtests/querytests.cpp
index 3310f766ff4..1309ecedc8c 100644
--- a/src/mongo/dbtests/querytests.cpp
+++ b/src/mongo/dbtests/querytests.cpp
@@ -55,6 +55,20 @@
namespace mongo {
namespace {
+void insertOplogDocument(OperationContext* opCtx, Timestamp ts, const char* ns) {
+ AutoGetCollection coll(opCtx, NamespaceString{ns}, MODE_IX);
+ WriteUnitOfWork wuow(opCtx);
+ auto doc = BSON("ts" << ts);
+ InsertStatement stmt;
+ stmt.doc = doc;
+ stmt.oplogSlot = OplogSlot{ts, OplogSlot::kInitialTerm};
+ auto status = coll->insertDocument(opCtx, stmt, nullptr);
+ if (!status.isOK()) {
+ std::cout << "Failed to insert oplog document: " << status.toString() << std::endl;
+ }
+ wuow.commit();
+}
+
using std::endl;
using std::string;
using std::unique_ptr;
@@ -689,6 +703,7 @@ public:
~OplogScanWithGtTimstampPred() {
_client.dropCollection(ns);
}
+
void run() {
// Skip the test if the storage engine doesn't support capped collections.
if (!_opCtx.getServiceContext()->getStorageEngine()->supportsCappedCollections()) {
@@ -712,9 +727,10 @@ public:
info);
}
- insert(ns, BSON("ts" << Timestamp(1000, 0)));
- insert(ns, BSON("ts" << Timestamp(1000, 1)));
- insert(ns, BSON("ts" << Timestamp(1000, 2)));
+
+ insertOplogDocument(&_opCtx, Timestamp(1000, 0), ns);
+ insertOplogDocument(&_opCtx, Timestamp(1000, 1), ns);
+ insertOplogDocument(&_opCtx, Timestamp(1000, 2), ns);
FindCommandRequest findRequest{NamespaceString{ns}};
findRequest.setFilter(BSON("ts" << GT << Timestamp(1000, 1)));
findRequest.setHint(BSON("$natural" << 1));
@@ -739,6 +755,7 @@ public:
~OplogScanGtTsExplain() {
_client.dropCollection(string(ns));
}
+
void run() {
// Skip the test if the storage engine doesn't support capped collections.
if (!_opCtx.getServiceContext()->getStorageEngine()->supportsCappedCollections()) {
@@ -762,9 +779,9 @@ public:
info);
}
- insert(ns, BSON("ts" << Timestamp(1000, 0)));
- insert(ns, BSON("ts" << Timestamp(1000, 1)));
- insert(ns, BSON("ts" << Timestamp(1000, 2)));
+ insertOplogDocument(&_opCtx, Timestamp(1000, 0), ns);
+ insertOplogDocument(&_opCtx, Timestamp(1000, 1), ns);
+ insertOplogDocument(&_opCtx, Timestamp(1000, 2), ns);
BSONObj explainCmdObj =
BSON("explain" << BSON("find"
@@ -1519,7 +1536,7 @@ public:
while (1) {
int oldCount = count();
- _client.insert(ns(), BSON("ts" << Timestamp(1000, i++)));
+ insertOplogDocument(&_opCtx, Timestamp(1000, i++), ns());
int newCount = count();
if (oldCount == newCount || newCount < max)
break;
@@ -1529,7 +1546,8 @@ public:
}
for (int k = 0; k < 5; ++k) {
- _client.insert(ns(), BSON("ts" << Timestamp(1000, i++)));
+ auto ts = Timestamp(1000, i++);
+ insertOplogDocument(&_opCtx, ts, ns());
FindCommandRequest findRequest{NamespaceString{ns()}};
findRequest.setSort(BSON("$natural" << 1));
unsigned min = _client.find(findRequest)->next()["ts"].timestamp().getInc();
@@ -1583,11 +1601,11 @@ public:
}
unsigned i = 0;
- for (; i < 150; _client.insert(ns(), BSON("ts" << Timestamp(1000, i++))))
+ for (; i < 150; insertOplogDocument(&_opCtx, Timestamp(1000, i++), ns()))
;
for (int k = 0; k < 5; ++k) {
- _client.insert(ns(), BSON("ts" << Timestamp(1000, i++)));
+ insertOplogDocument(&_opCtx, Timestamp(1000, i++), ns());
FindCommandRequest findRequest{NamespaceString{ns()}};
findRequest.setSort(BSON("$natural" << 1));
unsigned min = _client.find(findRequest)->next()["ts"].timestamp().getInc();
@@ -1660,7 +1678,7 @@ public:
ASSERT(!c->more());
// Check with some docs in the collection.
- for (int i = 100; i < 150; _client.insert(ns(), BSON("ts" << Timestamp(1000, i++))))
+ for (int i = 100; i < 150; insertOplogDocument(&_opCtx, Timestamp(1000, i++), ns()))
;
c = _client.find(findRequest);
ASSERT(c->more());