diff options
author | Jordi Olivares Provencio <jordi.olivares-provencio@mongodb.com> | 2022-04-01 09:31:03 +0000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-04-01 09:58:34 +0000 |
commit | c2de464c661ec8fc3198373932d0e3aab7fde971 (patch) | |
tree | 4492a72c1f90fba707e3bf9ecbe730cf84557a5a /src | |
parent | 5f1bc9e0463e261fbbf0a0c7ad69c7522f07d750 (diff) | |
download | mongo-c2de464c661ec8fc3198373932d0e3aab7fde971.tar.gz |
SERVER-64125 Avoid committing at the stable timestamp
Diffstat (limited to 'src')
-rw-r--r-- | src/mongo/db/catalog/index_catalog_entry_impl.cpp | 16 | ||||
-rw-r--r-- | src/mongo/db/repl/initial_syncer.cpp | 16 | ||||
-rw-r--r-- | src/mongo/db/storage/kv/kv_engine_test_harness.cpp | 69 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp | 23 | ||||
-rw-r--r-- | src/mongo/dbtests/querytests.cpp | 40 |
5 files changed, 68 insertions, 96 deletions
diff --git a/src/mongo/db/catalog/index_catalog_entry_impl.cpp b/src/mongo/db/catalog/index_catalog_entry_impl.cpp index d7ab018fabc..5a11aec117f 100644 --- a/src/mongo/db/catalog/index_catalog_entry_impl.cpp +++ b/src/mongo/db/catalog/index_catalog_entry_impl.cpp @@ -317,12 +317,24 @@ Status IndexCatalogEntryImpl::_setMultikeyInMultiDocumentTransaction( // may not have a stable timestamp. Therefore, we need to round up // the multi-key write timestamp to the max of the three so that we don't write // behind the oldest/stable timestamp. This code path is only hit during initial - // sync/recovery when reconstructing prepared transactions and so we don't expect + // sync/recovery when reconstructing prepared transactions, and so we don't expect // the oldest/stable timestamp to advance concurrently. + // + // WiredTiger disallows committing at the stable timestamp to avoid confusion during + // checkpoints; to overcome that, we allow setting the timestamp slightly after the + // prepared timestamp of the original transaction. This is currently not an issue as + // the index metadata state is read from in-memory cache and this is modifying the + // state on-disk from the _mdb_catalog document. To put it in other words, multikey + // doesn't go backwards. This would be a problem if we move to a versioned catalog + // world as a different transaction could choose an earlier timestamp (i.e. the + // original transaction timestamp) and encounter an invalid system state where the + // document that enables multikey hasn't enabled it yet but is present in the + // collection. In other words, the index is not set for multikey but there is + // already data present that relies on it. 
auto status = opCtx->recoveryUnit()->setTimestamp(std::max( {recoveryPrepareOpTime.getTimestamp(), opCtx->getServiceContext()->getStorageEngine()->getOldestTimestamp(), - opCtx->getServiceContext()->getStorageEngine()->getStableTimestamp()})); + opCtx->getServiceContext()->getStorageEngine()->getStableTimestamp() + 1})); fassert(31164, status); } else { // If there is no recovery prepare OpTime, then this node must be a primary. We diff --git a/src/mongo/db/repl/initial_syncer.cpp b/src/mongo/db/repl/initial_syncer.cpp index 127f382f0c5..53b5b8d00c2 100644 --- a/src/mongo/db/repl/initial_syncer.cpp +++ b/src/mongo/db/repl/initial_syncer.cpp @@ -1468,11 +1468,17 @@ void InitialSyncer::_lastOplogEntryFetcherCallbackForStopTimestamp( // override its behavior in tests. See InitialSyncerReturnsCallbackCanceledAndDoesNot- // ScheduleRollbackCheckerIfShutdownAfterInsertingInsertOplogSeedDocument in // initial_syncer_test.cpp - auto status = _storage->insertDocument( - opCtx.get(), - NamespaceString::kRsOplogNamespace, - TimestampedBSONObj{oplogSeedDoc, resultOpTimeAndWallTime.opTime.getTimestamp()}, - resultOpTimeAndWallTime.opTime.getTerm()); + // + // Note that the initial seed oplog insertion is not timestamped, this is safe to do as the + // logic for navigating the oplog is reliant on the timestamp value of the oplog document + // itself. Additionally, this also prevents confusion in the storage engine as the last + // insertion can be produced at precisely the stable timestamp, which could lead to invalid + // data consistency due to the stable timestamp signalling that no operations before or at + // that point will be rolled back. So transactions shouldn't happen at precisely that point. 
+ auto status = _storage->insertDocument(opCtx.get(), + NamespaceString::kRsOplogNamespace, + TimestampedBSONObj{oplogSeedDoc}, + resultOpTimeAndWallTime.opTime.getTerm()); if (!status.isOK()) { stdx::lock_guard<Latch> lock(_mutex); onCompletionGuard->setResultAndCancelRemainingWork_inlock(lock, status); diff --git a/src/mongo/db/storage/kv/kv_engine_test_harness.cpp b/src/mongo/db/storage/kv/kv_engine_test_harness.cpp index e774531904f..f50dea13d6a 100644 --- a/src/mongo/db/storage/kv/kv_engine_test_harness.cpp +++ b/src/mongo/db/storage/kv/kv_engine_test_harness.cpp @@ -1071,7 +1071,7 @@ TEST_F(KVEngineTestHarness, RollingBackToLastStable) { */ DEATH_TEST_REGEX_F(KVEngineTestHarness, CommitBehindStable, - ".*commit timestamp.*is less than the stable timestamp.*") { + ".*commit timestamp.*(is less than|must be after) the stable timestamp.*") { std::unique_ptr<KVHarnessHelper> helper(KVHarnessHelper::create(getServiceContext())); KVEngine* engine = helper->getEngine(); // TODO SERVER-48314: Remove after implementing correct behavior on biggie. @@ -1102,7 +1102,7 @@ DEATH_TEST_REGEX_F(KVEngineTestHarness, } { - // Committing a behind the stable timestamp is not allowed. + // Committing at or behind the stable timestamp is not allowed. auto opCtx = _makeOperationContext(engine); WriteUnitOfWork uow(opCtx.get()); auto swRid = rs->insertRecord(opCtx.get(), "abc", 4, Timestamp(1, 1)); @@ -1110,71 +1110,6 @@ DEATH_TEST_REGEX_F(KVEngineTestHarness, } } -/* - * Commit at stable - * | Session | GlobalActor | - * |---------------------------------+----------------------------| - * | | GlobalTimestamp :stable 2 | - * | Begin | | - * | Write A 1 | | - * | Timestamp :commit 2 | | - */ -TEST_F(KVEngineTestHarness, CommitAtStable) { - std::unique_ptr<KVHarnessHelper> helper(KVHarnessHelper::create(getServiceContext())); - KVEngine* engine = helper->getEngine(); - // TODO SERVER-48314: Remove after implementing correct behavior on biggie. 
- if (engine->isEphemeral()) - return; - - // The initial data timestamp has to be set to take stable checkpoints. - engine->setInitialDataTimestamp(Timestamp(1, 1)); - std::string ns = "a.b"; - std::unique_ptr<RecordStore> rs; - { - auto opCtx = _makeOperationContext(engine); - ASSERT_OK(engine->createRecordStore(opCtx.get(), ns, ns, CollectionOptions())); - rs = engine->getRecordStore(opCtx.get(), ns, ns, CollectionOptions()); - ASSERT(rs); - } - - { - // Set the stable timestamp to (2, 2). - ASSERT(!engine->getLastStableRecoveryTimestamp()); - engine->setStableTimestamp(Timestamp(2, 2), false); - ASSERT(!engine->getLastStableRecoveryTimestamp()); - - // Force a checkpoint to be taken. This should advance the last stable timestamp. - auto opCtx = _makeOperationContext(engine); - engine->flushAllFiles(opCtx.get(), false); - ASSERT_EQ(engine->getLastStableRecoveryTimestamp(), Timestamp(2, 2)); - } - - RecordId rid; - { - // For a non-prepared transaction, the commit timestamp can be equal to the stable - // timestamp. - auto opCtx = _makeOperationContext(engine); - WriteUnitOfWork uow(opCtx.get()); - auto swRid = rs->insertRecord(opCtx.get(), "abc", 4, Timestamp(2, 2)); - ASSERT_OK(swRid); - rid = swRid.getValue(); - uow.commit(); - } - - { - // Rollback to the last stable timestamp. - auto opCtx = _makeOperationContext(engine); - StatusWith<Timestamp> swTimestamp = engine->recoverToStableTimestamp(opCtx.get()); - ASSERT_EQ(swTimestamp.getValue(), Timestamp(2, 2)); - - // Transaction with timestamps equal to lastStable will not be rolled back. 
- opCtx->recoveryUnit()->abandonSnapshot(); - RecordData data; - ASSERT_TRUE(rs->findRecord(opCtx.get(), rid, &data)); - ASSERT_EQUALS(1, rs->numRecords(opCtx.get())); - } -} - TEST_F(DurableCatalogImplTest, Coll1) { KVEngine* engine = helper->getEngine(); diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp index 4666a7b3a50..0ab08790fb7 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_record_store.cpp @@ -1369,18 +1369,19 @@ Status WiredTigerRecordStore::_insertRecords(OperationContext* opCtx, auto& record = records[i]; invariant(!record.id.isNull()); invariant(!record_id_helpers::isReserved(record.id)); - Timestamp ts; - if (timestamps[i].isNull() && _isOplog) { - // If the timestamp is 0, that probably means someone inserted a document directly - // into the oplog. In this case, use the RecordId as the timestamp, since they are - // one and the same. Setting this transaction to be unordered will trigger a journal - // flush. Because these are direct writes into the oplog, the machinery to trigger a - // journal flush is bypassed. A followup oplog read will require a fresh visibility - // value to make progress. - ts = Timestamp(record.id.getLong()); + Timestamp ts = timestamps[i]; + if (_isOplog) { + // Setting this transaction to be unordered will trigger a journal flush. Because these + // are direct writes into the oplog, the machinery to trigger a journal flush is + // bypassed. A followup oplog read will require a fresh visibility value to make progress. 
opCtx->recoveryUnit()->setOrderedCommit(false); - } else { - ts = timestamps[i]; + auto oplogKeyTs = Timestamp(record.id.getLong()); + if (!ts.isNull()) { + invariant(oplogKeyTs == ts); + } + if (!opCtx->recoveryUnit()->getCommitTimestamp().isNull()) { + invariant(oplogKeyTs == opCtx->recoveryUnit()->getCommitTimestamp()); + } } if (!ts.isNull()) { LOGV2_DEBUG(22403, 4, "inserting record with timestamp {ts}", "ts"_attr = ts); diff --git a/src/mongo/dbtests/querytests.cpp b/src/mongo/dbtests/querytests.cpp index 3310f766ff4..1309ecedc8c 100644 --- a/src/mongo/dbtests/querytests.cpp +++ b/src/mongo/dbtests/querytests.cpp @@ -55,6 +55,20 @@ namespace mongo { namespace { +void insertOplogDocument(OperationContext* opCtx, Timestamp ts, const char* ns) { + AutoGetCollection coll(opCtx, NamespaceString{ns}, MODE_IX); + WriteUnitOfWork wuow(opCtx); + auto doc = BSON("ts" << ts); + InsertStatement stmt; + stmt.doc = doc; + stmt.oplogSlot = OplogSlot{ts, OplogSlot::kInitialTerm}; + auto status = coll->insertDocument(opCtx, stmt, nullptr); + if (!status.isOK()) { + std::cout << "Failed to insert oplog document: " << status.toString() << std::endl; + } + wuow.commit(); +} + using std::endl; using std::string; using std::unique_ptr; @@ -689,6 +703,7 @@ public: ~OplogScanWithGtTimstampPred() { _client.dropCollection(ns); } + void run() { // Skip the test if the storage engine doesn't support capped collections. 
if (!_opCtx.getServiceContext()->getStorageEngine()->supportsCappedCollections()) { @@ -712,9 +727,10 @@ public: info); } - insert(ns, BSON("ts" << Timestamp(1000, 0))); - insert(ns, BSON("ts" << Timestamp(1000, 1))); - insert(ns, BSON("ts" << Timestamp(1000, 2))); + + insertOplogDocument(&_opCtx, Timestamp(1000, 0), ns); + insertOplogDocument(&_opCtx, Timestamp(1000, 1), ns); + insertOplogDocument(&_opCtx, Timestamp(1000, 2), ns); FindCommandRequest findRequest{NamespaceString{ns}}; findRequest.setFilter(BSON("ts" << GT << Timestamp(1000, 1))); findRequest.setHint(BSON("$natural" << 1)); @@ -739,6 +755,7 @@ public: ~OplogScanGtTsExplain() { _client.dropCollection(string(ns)); } + void run() { // Skip the test if the storage engine doesn't support capped collections. if (!_opCtx.getServiceContext()->getStorageEngine()->supportsCappedCollections()) { @@ -762,9 +779,9 @@ public: info); } - insert(ns, BSON("ts" << Timestamp(1000, 0))); - insert(ns, BSON("ts" << Timestamp(1000, 1))); - insert(ns, BSON("ts" << Timestamp(1000, 2))); + insertOplogDocument(&_opCtx, Timestamp(1000, 0), ns); + insertOplogDocument(&_opCtx, Timestamp(1000, 1), ns); + insertOplogDocument(&_opCtx, Timestamp(1000, 2), ns); BSONObj explainCmdObj = BSON("explain" << BSON("find" @@ -1519,7 +1536,7 @@ public: while (1) { int oldCount = count(); - _client.insert(ns(), BSON("ts" << Timestamp(1000, i++))); + insertOplogDocument(&_opCtx, Timestamp(1000, i++), ns()); int newCount = count(); if (oldCount == newCount || newCount < max) break; @@ -1529,7 +1546,8 @@ public: } for (int k = 0; k < 5; ++k) { - _client.insert(ns(), BSON("ts" << Timestamp(1000, i++))); + auto ts = Timestamp(1000, i++); + insertOplogDocument(&_opCtx, ts, ns()); FindCommandRequest findRequest{NamespaceString{ns()}}; findRequest.setSort(BSON("$natural" << 1)); unsigned min = _client.find(findRequest)->next()["ts"].timestamp().getInc(); @@ -1583,11 +1601,11 @@ public: } unsigned i = 0; - for (; i < 150; _client.insert(ns(), BSON("ts" 
<< Timestamp(1000, i++)))) + for (; i < 150; insertOplogDocument(&_opCtx, Timestamp(1000, i++), ns())) ; for (int k = 0; k < 5; ++k) { - _client.insert(ns(), BSON("ts" << Timestamp(1000, i++))); + insertOplogDocument(&_opCtx, Timestamp(1000, i++), ns()); FindCommandRequest findRequest{NamespaceString{ns()}}; findRequest.setSort(BSON("$natural" << 1)); unsigned min = _client.find(findRequest)->next()["ts"].timestamp().getInc(); @@ -1660,7 +1678,7 @@ public: ASSERT(!c->more()); // Check with some docs in the collection. - for (int i = 100; i < 150; _client.insert(ns(), BSON("ts" << Timestamp(1000, i++)))) + for (int i = 100; i < 150; insertOplogDocument(&_opCtx, Timestamp(1000, i++), ns())) ; c = _client.find(findRequest); ASSERT(c->more()); |