diff options
author | Daniel Gottlieb <daniel.gottlieb@mongodb.com> | 2018-01-24 13:45:05 -0500 |
---|---|---|
committer | Daniel Gottlieb <daniel.gottlieb@mongodb.com> | 2018-01-24 13:45:05 -0500 |
commit | fbf03e93dad1d2d081944c05436e777380873eb2 (patch) | |
tree | 8e2bd0d16febafcc10c0a201094967efb876c61b | |
parent | e59b03c06a034bac37435dbcdb2b631babe0f055 (diff) | |
download | mongo-fbf03e93dad1d2d081944c05436e777380873eb2.tar.gz |
SERVER-32251: Timestamp drop collection/database
-rw-r--r-- | src/mongo/db/catalog/drop_database.cpp | 15 | ||||
-rw-r--r-- | src/mongo/db/namespace_string.cpp | 19 | ||||
-rw-r--r-- | src/mongo/db/namespace_string.h | 7 | ||||
-rw-r--r-- | src/mongo/db/repl/SConscript | 3 | ||||
-rw-r--r-- | src/mongo/db/repl/storage_interface_impl.cpp | 5 | ||||
-rw-r--r-- | src/mongo/db/repl/sync_tail_test.cpp | 6 | ||||
-rw-r--r-- | src/mongo/db/storage/kv/SConscript | 3 | ||||
-rw-r--r-- | src/mongo/db/storage/kv/kv_storage_engine.cpp | 149 | ||||
-rw-r--r-- | src/mongo/db/storage/kv/kv_storage_engine.h | 14 | ||||
-rw-r--r-- | src/mongo/db/storage/recovery_unit.h | 2 | ||||
-rw-r--r-- | src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h | 2 | ||||
-rw-r--r-- | src/mongo/dbtests/storage_timestamp_tests.cpp | 273 |
12 files changed, 483 insertions, 15 deletions
diff --git a/src/mongo/db/catalog/drop_database.cpp b/src/mongo/db/catalog/drop_database.cpp index d28b516eb58..a613f06cc92 100644 --- a/src/mongo/db/catalog/drop_database.cpp +++ b/src/mongo/db/catalog/drop_database.cpp @@ -137,7 +137,7 @@ Status dropDatabase(OperationContext* opCtx, const std::string& dbName) { auto dropPendingGuard = MakeGuard([&db, opCtx] { db->setDropPending(opCtx, false); }); std::vector<NamespaceString> collectionsToDrop; - for (auto collection : *db) { + for (Collection* collection : *db) { const auto& nss = collection->ns(); if (nss.isDropPendingNamespace() && replCoord->isReplEnabled() && opCtx->writesAreReplicated()) { @@ -157,7 +157,20 @@ Status dropDatabase(OperationContext* opCtx, const std::string& dbName) { << " collections"; for (auto nss : collectionsToDrop) { log() << "dropDatabase " << dbName << " - dropping collection: " << nss; + if (!opCtx->writesAreReplicated()) { + // Dropping a database on a primary replicates individual collection drops + // followed by a database drop oplog entry. When a secondary observes the database + // drop oplog entry, all of the replicated collections that were dropped must have + // been processed. Only non-replicated collections like `system.profile` should be + // left to remove. Collections with the `tmp.mr` namespace may or may not be + // getting replicated; be conservative and assume they are not. + invariant(!nss.isReplicated() || nss.coll().startsWith("tmp.mr")); + } + WriteUnitOfWork wunit(opCtx); + // A primary processing this will assign a timestamp when the operation is written to + // the oplog. As stated above, a secondary processing must only observe non-replicated + // collections, thus this should not be timestamped. fassertStatusOK(40476, db->dropCollectionEvenIfSystem(opCtx, nss)); wunit.commit(); } diff --git a/src/mongo/db/namespace_string.cpp b/src/mongo/db/namespace_string.cpp index ba941475923..73e54524fe8 100644 --- a/src/mongo/db/namespace_string.cpp +++ b/src/mongo/db/namespace_string.cpp @@ -221,6 +221,25 @@ Status NamespaceString::checkLengthForRename( return Status::OK(); } +bool NamespaceString::isReplicated() const { + if (isLocal()) { + return false; + } + + // Of collections not in the `local` database, only `system` collections might not be + // replicated. + if (!isSystem()) { + return true; + } + + if (isSystemDotProfile()) { + return false; + } + + // E.g: `system.version` is replicated. + return true; +} + std::ostream& operator<<(std::ostream& stream, const NamespaceString& nss) { return stream << nss.toString(); } diff --git a/src/mongo/db/namespace_string.h b/src/mongo/db/namespace_string.h index 87e22d2b394..21be2b4c133 100644 --- a/src/mongo/db/namespace_string.h +++ b/src/mongo/db/namespace_string.h @@ -221,6 +221,13 @@ public: } /** + * Returns whether a namespace is replicated, based only on its string value. One notable + * omission is that map reduce `tmp.mr` collections may or may not be replicated. Callers must + * decide how to handle that case separately. + */ + bool isReplicated() const; + + /** * Returns true if cursors for this namespace are registered with the global cursor manager. */ bool isGloballyManagedNamespace() const { diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript index 6b45e9e1636..adeaf4d6dad 100644 --- a/src/mongo/db/repl/SConscript +++ b/src/mongo/db/repl/SConscript @@ -247,6 +247,9 @@ env.Library( '$BUILD_DIR/mongo/db/dbhelpers', '$BUILD_DIR/mongo/db/query_exec', ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/logical_clock', + ], ) env.CppUnitTest( diff --git a/src/mongo/db/repl/storage_interface_impl.cpp b/src/mongo/db/repl/storage_interface_impl.cpp index 6a5841bb3b1..7657dae752b 100644 --- a/src/mongo/db/repl/storage_interface_impl.cpp +++ b/src/mongo/db/repl/storage_interface_impl.cpp @@ -61,6 +61,7 @@ #include "mongo/db/exec/update.h" #include "mongo/db/jsobj.h" #include "mongo/db/keypattern.h" +#include "mongo/db/logical_clock.h" #include "mongo/db/operation_context.h" #include "mongo/db/ops/delete_request.h" #include "mongo/db/ops/parsed_update.h" @@ -429,6 +430,10 @@ Status StorageInterfaceImpl::dropCollection(OperationContext* opCtx, const Names if (!status.isOK()) { return status; } + if (nss.isDropPendingNamespace() && !opCtx->writesAreReplicated()) { + Timestamp ts = LogicalClock::get(opCtx)->getClusterTime().asTimestamp(); + fassertStatusOK(50661, opCtx->recoveryUnit()->setTimestamp(ts)); + } wunit.commit(); return Status::OK(); }); diff --git a/src/mongo/db/repl/sync_tail_test.cpp b/src/mongo/db/repl/sync_tail_test.cpp index aa2db8e14aa..aa565b57da1 100644 --- a/src/mongo/db/repl/sync_tail_test.cpp +++ b/src/mongo/db/repl/sync_tail_test.cpp @@ -1645,7 +1645,8 @@ TEST_F(IdempotencyTest, InsertToFCVCollectionBesidesFCVDocumentSucceeds) { } TEST_F(IdempotencyTest, DropDatabaseSucceeds) { - auto ns = NamespaceString("foo.bar"); + // Choose `system.profile` so the storage engine doesn't expect the drop to be timestamped. + auto ns = NamespaceString("foo.system.profile"); ::mongo::repl::createCollection(_opCtx.get(), ns, CollectionOptions()); ASSERT_OK( ReplicationCoordinator::get(_opCtx.get())->setFollowerMode(MemberState::RS_RECOVERING)); @@ -1803,7 +1804,8 @@ TEST_F(SyncTailTest, UpgradeWithNoUUIDFailsInSecondary) { } TEST_F(SyncTailTest, DropDatabaseSucceedsInRecovering) { - auto ns = NamespaceString("foo.bar"); + // Choose `system.profile` so the storage engine doesn't expect the drop to be timestamped. + auto ns = NamespaceString("foo.system.profile"); ::mongo::repl::createCollection(_opCtx.get(), ns, CollectionOptions()); ASSERT_OK( ReplicationCoordinator::get(_opCtx.get())->setFollowerMode(MemberState::RS_RECOVERING)); diff --git a/src/mongo/db/storage/kv/SConscript b/src/mongo/db/storage/kv/SConscript index 617c47c993e..ec3965aab62 100644 --- a/src/mongo/db/storage/kv/SConscript +++ b/src/mongo/db/storage/kv/SConscript @@ -52,6 +52,9 @@ env.Library( '$BUILD_DIR/mongo/db/storage/storage_options', 'kv_database_catalog_entry_core', ], + LIBDEPS_PRIVATE=[ + '$BUILD_DIR/mongo/db/logical_clock', + ], ) env.Library( diff --git a/src/mongo/db/storage/kv/kv_storage_engine.cpp b/src/mongo/db/storage/kv/kv_storage_engine.cpp index dc4fed07d78..f107c32a780 100644 --- a/src/mongo/db/storage/kv/kv_storage_engine.cpp +++ b/src/mongo/db/storage/kv/kv_storage_engine.cpp @@ -32,12 +32,14 @@ #include <algorithm> +#include "mongo/db/logical_clock.h" #include "mongo/db/operation_context_noop.h" #include "mongo/db/storage/kv/kv_database_catalog_entry.h" #include "mongo/db/storage/kv/kv_engine.h" #include "mongo/util/assert_util.h" #include "mongo/util/log.h" #include "mongo/util/mongoutils/str.h" +#include "mongo/util/scopeguard.h" namespace mongo { @@ -290,32 +292,156 @@ Status KVStorageEngine::dropDatabase(OperationContext* opCtx, StringData db) { entry = it->second; } + std::list<std::string> toDrop; + entry->getCollectionNamespaces(&toDrop); + + // Partition `toDrop` into ranges of `[untimestampedCollections..., + // timestampedCollections...]`. All timestamped collections must have already been renamed to + // a drop-pending namespace. Running without replication treats all collections as not + // timestamped. + auto untimestampedDropsEnd = + std::partition(toDrop.begin(), toDrop.end(), [](const std::string& dropNs) { + return !NamespaceString(dropNs).isDropPendingNamespace(); + }); + + // The primary caller (`DatabaseImpl::dropDatabase`) of this method currently + // `transitional_ignore`s the result. To minimize the impact of that, while also returning a + // correct status, attempt to drop every collection, and if there were any errors, return the + // first one. + Status firstError = Status::OK(); + + // First drop the "non-timestamped" collections. "Non-timestamped" collections such as user + // collections in `local` or `system.profile` do not get rolled back. This means we also + // should not rollback their creation or deletion. To achieve that, the code takes care to + // suppress any timestamping state. + firstError = _dropCollectionsNoTimestamp(opCtx, entry, toDrop.begin(), untimestampedDropsEnd); + + // Now drop any leftover timestamped collections (i.e: not already dropped by the reaper). On + // secondaries there is already a `commit timestamp` set and these drops inherit the timestamp + // of the `dropDatabase` oplog entry. On primaries, we look at the logical clock and set the + // commit timestamp state. + // + // Additionally, before returning, this method will remove the `KVDatabaseCatalogEntry` from + // the `_dbs` map. This action creates a new constraint that this "timestamped drop" method + // must happen after the "non-timestamped drops". + auto status = + _dropCollectionsWithTimestamp(opCtx, entry, toDrop, untimestampedDropsEnd, toDrop.end()); + if (firstError.isOK()) { + firstError = status; + } + + return firstError; +} + +/** + * Returns the first `dropCollection` error that this method encounters. This method will attempt + * to drop all collections, regardless of the error status. + */ +Status KVStorageEngine::_dropCollectionsNoTimestamp(OperationContext* opCtx, + KVDatabaseCatalogEntryBase* dbce, + CollIter begin, + CollIter end) { + // On primaries, this method will be called outside of any `TimestampBlock` state meaning the + // "commit timestamp" will not be set. For this case, this method needs no special logic to + // avoid timestamping the upcoming writes. + // + // On secondaries, there will be a wrapping `TimestampBlock` and the "commit timestamp" will + // be set. Carefully save that to the side so the following writes can go through without that + // context. + const Timestamp commitTs = opCtx->recoveryUnit()->getCommitTimestamp(); + if (!commitTs.isNull()) { + opCtx->recoveryUnit()->clearCommitTimestamp(); + } + + // Ensure the method exits with the same "commit timestamp" state that it was called with. + auto addCommitTimestamp = MakeGuard([&opCtx, commitTs] { + if (!commitTs.isNull()) { + opCtx->recoveryUnit()->setCommitTimestamp(commitTs); + } + }); + + Status firstError = Status::OK(); + WriteUnitOfWork untimestampedDropWuow(opCtx); + for (auto toDrop = begin; toDrop != end; ++toDrop) { + std::string coll = *toDrop; + NamespaceString nss(coll); + + // When in steady state replication and after filtering out drop-pending namespaces, the + // only collections that may show up here are either 1) not replicated 2) `tmp.mr`. + if (_initialDataTimestamp != Timestamp::kAllowUnstableCheckpointsSentinel) { + invariant(!nss.isReplicated() || nss.coll().startsWith("tmp.mr"), + str::stream() << "Collection drop is not being timestamped. Namespace: " + << nss.ns()); + } + + Status result = dbce->dropCollection(opCtx, coll); + if (!result.isOK() && firstError.isOK()) { + firstError = result; + } + } + + untimestampedDropWuow.commit(); + return firstError; +} + +Status KVStorageEngine::_dropCollectionsWithTimestamp(OperationContext* opCtx, + KVDatabaseCatalogEntryBase* dbce, + std::list<std::string>& toDrop, + CollIter begin, + CollIter end) { + // On primaries, these collection drops are performed in a separate WUOW than the insertion of + // the `dropDatabase` oplog entry. In this case, we expect the `existingCommitTs` to be null + // and the code looks at the logical clock to assign a timestamp to the writes. + // + // Secondaries reach this from within a `TimestampBlock` where there should be a non-null + // `existingCommitTs`. + const Timestamp existingCommitTs = opCtx->recoveryUnit()->getCommitTimestamp(); + + // `LogicalClock`s on standalones and master/slave do not necessarily return real + // optimes. Assume it's safe to not timestamp the write. + const Timestamp chosenCommitTs = LogicalClock::get(opCtx)->getClusterTime().asTimestamp(); + const bool setCommitTs = existingCommitTs.isNull() && !chosenCommitTs.isNull(); + if (setCommitTs) { + opCtx->recoveryUnit()->setCommitTimestamp(chosenCommitTs); + } + + // Ensure the method exits with the same "commit timestamp" state that it was called with. + auto removeCommitTimestamp = MakeGuard([&opCtx, setCommitTs] { + if (setCommitTs) { + opCtx->recoveryUnit()->clearCommitTimestamp(); + } + }); + // This is called outside of a WUOW since MMAPv1 has unfortunate behavior around dropping // databases. We need to create one here since we want db dropping to all-or-nothing // wherever possible. Eventually we want to move this up so that it can include the logOp // inside of the WUOW, but that would require making DB dropping happen inside the Dur // system for MMAPv1. - WriteUnitOfWork wuow(opCtx); + WriteUnitOfWork timestampedDropWuow(opCtx); - std::list<std::string> toDrop; - entry->getCollectionNamespaces(&toDrop); + Status firstError = Status::OK(); + for (auto toDropStr = begin; toDropStr != toDrop.end(); ++toDropStr) { + std::string coll = *toDropStr; + NamespaceString nss(coll); - for (std::list<std::string>::iterator it = toDrop.begin(); it != toDrop.end(); ++it) { - string coll = *it; - entry->dropCollection(opCtx, coll).transitional_ignore(); + Status result = dbce->dropCollection(opCtx, coll); + if (!result.isOK() && firstError.isOK()) { + firstError = result; + } } + toDrop.clear(); - entry->getCollectionNamespaces(&toDrop); + dbce->getCollectionNamespaces(&toDrop); invariant(toDrop.empty()); { stdx::lock_guard<stdx::mutex> lk(_dbsLock); - opCtx->recoveryUnit()->registerChange(new RemoveDBChange(this, db, entry)); - _dbs.erase(db.toString()); + opCtx->recoveryUnit()->registerChange(new RemoveDBChange(this, dbce->name(), dbce)); + _dbs.erase(dbce->name()); } - wuow.commit(); - return Status::OK(); + timestampedDropWuow.commit(); + return firstError; } int KVStorageEngine::flushAllFiles(OperationContext* opCtx, bool sync) { @@ -369,6 +495,7 @@ void KVStorageEngine::setStableTimestamp(Timestamp stableTimestamp) { } void KVStorageEngine::setInitialDataTimestamp(Timestamp initialDataTimestamp) { + _initialDataTimestamp = initialDataTimestamp; _engine->setInitialDataTimestamp(initialDataTimestamp); } diff --git a/src/mongo/db/storage/kv/kv_storage_engine.h b/src/mongo/db/storage/kv/kv_storage_engine.h index aca5e6ad6b6..6f70c588924 100644 --- a/src/mongo/db/storage/kv/kv_storage_engine.h +++ b/src/mongo/db/storage/kv/kv_storage_engine.h @@ -152,6 +152,19 @@ public: OperationContext* opCtx) override; private: + using CollIter = std::list<std::string>::iterator; + + Status _dropCollectionsNoTimestamp(OperationContext* opCtx, + KVDatabaseCatalogEntryBase* dbce, + CollIter begin, + CollIter end); + + Status _dropCollectionsWithTimestamp(OperationContext* opCtx, + KVDatabaseCatalogEntryBase* dbce, + std::list<std::string>& toDrop, + CollIter begin, + CollIter end); + class RemoveDBChange; stdx::function<KVDatabaseCatalogEntryFactory> _databaseCatalogEntryFactory; @@ -163,6 +176,7 @@ private: const bool _supportsDocLocking; const bool _supportsDBLocking; + Timestamp _initialDataTimestamp = Timestamp::kAllowUnstableCheckpointsSentinel; std::unique_ptr<RecordStore> _catalogRecordStore; std::unique_ptr<KVCatalog> _catalog; diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h index 09875eeb22e..48d1e83ece4 100644 --- a/src/mongo/db/storage/recovery_unit.h +++ b/src/mongo/db/storage/recovery_unit.h @@ -161,7 +161,9 @@ public: * must not be called while a commit timestamp is set. */ virtual void setCommitTimestamp(Timestamp timestamp) {} + virtual void clearCommitTimestamp() {} + virtual Timestamp getCommitTimestamp() { return {}; } diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h index 062bc8f7504..feccabf38e4 100644 --- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h +++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h @@ -84,7 +84,9 @@ public: Status setTimestamp(Timestamp timestamp) override; void setCommitTimestamp(Timestamp timestamp) override; + void clearCommitTimestamp() override; + Timestamp getCommitTimestamp() override; Status selectSnapshot(Timestamp timestamp) override; diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp index 0e04a261e92..352b04051bf 100644 --- a/src/mongo/dbtests/storage_timestamp_tests.cpp +++ b/src/mongo/dbtests/storage_timestamp_tests.cpp @@ -33,8 +33,10 @@ #include "mongo/bson/simple_bsonobj_comparator.h" #include "mongo/bson/timestamp.h" #include "mongo/db/catalog/collection.h" +#include "mongo/db/catalog/drop_database.h" #include "mongo/db/catalog/index_catalog.h" #include "mongo/db/catalog/index_create.h" +#include "mongo/db/catalog/uuid_catalog.h" #include "mongo/db/client.h" #include "mongo/db/concurrency/write_conflict_exception.h" #include "mongo/db/db.h" @@ -44,7 +46,9 @@ #include "mongo/db/index/index_descriptor.h" #include "mongo/db/logical_clock.h" #include "mongo/db/op_observer_impl.h" +#include "mongo/db/op_observer_registry.h" #include "mongo/db/repl/apply_ops.h" +#include "mongo/db/repl/drop_pending_collection_reaper.h" #include "mongo/db/repl/oplog.h" #include "mongo/db/repl/oplog_entry.h" #include "mongo/db/repl/optime.h" @@ -53,6 +57,7 @@ #include "mongo/db/repl/replication_coordinator_global.h" #include "mongo/db/repl/replication_coordinator_mock.h" #include "mongo/db/repl/storage_interface_impl.h" +#include "mongo/db/repl/timestamp_block.h" #include "mongo/db/service_context.h" #include "mongo/db/storage/kv/kv_storage_engine.h" #include "mongo/unittest/unittest.h" @@ -96,7 +101,10 @@ public: // to avoid the invariant in ReplClientInfo::setLastOp that the optime only goes forward. repl::ReplClientInfo::forClient(_opCtx->getClient()).clearLastOp_forTest(); - getGlobalServiceContext()->setOpObserver(stdx::make_unique<OpObserverImpl>()); + auto registry = stdx::make_unique<OpObserverRegistry>(); + registry->addObserver(stdx::make_unique<UUIDCatalogObserver>()); + registry->addObserver(stdx::make_unique<OpObserverImpl>()); + _opCtx->getServiceContext()->setOpObserver(std::move(registry)); repl::setOplogCollectionName(getGlobalServiceContext()); repl::createOplog(_opCtx); @@ -291,6 +299,58 @@ public: << " when it should not have been."; } } + + std::tuple<std::string, std::string> getNewCollectionIndexIdent( + KVCatalog* kvCatalog, std::vector<std::string>& origIdents) { + // Find the collection and index ident by performing a set difference on the original + // idents and the current idents. + std::vector<std::string> identsWithColl = kvCatalog->getAllIdents(_opCtx); + std::sort(origIdents.begin(), origIdents.end()); + std::sort(identsWithColl.begin(), identsWithColl.end()); + std::vector<std::string> collAndIdxIdents; + std::set_difference(identsWithColl.begin(), + identsWithColl.end(), + origIdents.begin(), + origIdents.end(), + std::back_inserter(collAndIdxIdents)); + + ASSERT(collAndIdxIdents.size() == 1 || collAndIdxIdents.size() == 2); + if (collAndIdxIdents.size() == 1) { + // `system.profile` collections do not have an `_id` index. + return std::tie(collAndIdxIdents[0], ""); + } + if (collAndIdxIdents.size() == 2) { + // The idents are sorted, so the `collection-...` comes before `index-...` + return std::tie(collAndIdxIdents[0], collAndIdxIdents[1]); + } + + MONGO_UNREACHABLE; + } + + void assertIdentsExistAtTimestamp(KVCatalog* kvCatalog, + const std::string& collIdent, + const std::string& indexIdent, + Timestamp timestamp) { + WriteUnitOfWork wuow(_opCtx); + ASSERT_OK(_opCtx->recoveryUnit()->selectSnapshot(timestamp)); + auto allIdents = kvCatalog->getAllIdents(_opCtx); + ASSERT(std::find(allIdents.begin(), allIdents.end(), collIdent) != allIdents.end()); + if (indexIdent.size() > 0) { + // `system.profile` does not have an `_id` index. + ASSERT(std::find(allIdents.begin(), allIdents.end(), indexIdent) != allIdents.end()); + } + } + + void assertIdentsMissingAtTimestamp(KVCatalog* kvCatalog, + const std::string& collIdent, + const std::string& indexIdent, + Timestamp timestamp) { + WriteUnitOfWork wuow(_opCtx); + ASSERT_OK(_opCtx->recoveryUnit()->selectSnapshot(timestamp)); + auto allIdents = kvCatalog->getAllIdents(_opCtx); + ASSERT(std::find(allIdents.begin(), allIdents.end(), collIdent) == allIdents.end()); + ASSERT(std::find(allIdents.begin(), allIdents.end(), indexIdent) == allIdents.end()); + } }; class SecondaryInsertTimes : public StorageTimestampTest { @@ -1192,6 +1252,213 @@ public: } }; +class ReaperDropIsTimestamped : public StorageTimestampTest { +public: + void run() { + // Only run on 'wiredTiger'. No other storage engines to-date support timestamp writes. + if (mongo::storageGlobalParams.engine != "wiredTiger") { + return; + } + + auto storageInterface = repl::StorageInterface::get(_opCtx); + repl::DropPendingCollectionReaper::set( + _opCtx->getServiceContext(), + stdx::make_unique<repl::DropPendingCollectionReaper>(storageInterface)); + auto reaper = repl::DropPendingCollectionReaper::get(_opCtx); + + auto kvStorageEngine = + dynamic_cast<KVStorageEngine*>(_opCtx->getServiceContext()->getGlobalStorageEngine()); + KVCatalog* kvCatalog = kvStorageEngine->getCatalog(); + + // Save the pre-state idents so we can capture the specific idents related to collection + // creation. + std::vector<std::string> origIdents = kvCatalog->getAllIdents(_opCtx); + + NamespaceString nss("unittests.reaperDropIsTimestamped"); + reset(nss); + + AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_X, LockMode::MODE_X); + + const LogicalTime insertTimestamp = _clock->reserveTicks(1); + { + WriteUnitOfWork wuow(_opCtx); + insertDocument(autoColl.getCollection(), + InsertStatement(BSON("_id" << 0), insertTimestamp.asTimestamp(), 0LL)); + wuow.commit(); + ASSERT_EQ(1, itCount(autoColl.getCollection())); + } + + // The KVCatalog only adheres to timestamp requests on `getAllIdents`. To know the right + // collection/index that gets removed on a drop, we must capture the randomized "ident" + // string for the target collection and index. + std::string collIdent; + std::string indexIdent; + std::tie(collIdent, indexIdent) = getNewCollectionIndexIdent(kvCatalog, origIdents); + + // The first phase of a drop in a replica set is to perform a rename. This does not change + // the ident values. + { + WriteUnitOfWork wuow(_opCtx); + Database* db = autoColl.getDb(); + ASSERT_OK(db->dropCollection(_opCtx, nss.ns())); + wuow.commit(); + } + + // Bump the clock two. The drop will get the second tick. The first tick will identify a + // snapshot of the data with the collection renamed. + const LogicalTime postRenameTimestamp = _clock->reserveTicks(2); + + // Actually drop the collection, propagating to the KVCatalog. This drop will be + // timestamped at the logical clock value. + reaper->dropCollectionsOlderThan( + _opCtx, repl::OpTime(_clock->getClusterTime().asTimestamp(), presentTerm)); + const LogicalTime postDropTime = _clock->reserveTicks(1); + + // Querying the catalog at insert time shows the collection and index existing. + assertIdentsExistAtTimestamp( + kvCatalog, collIdent, indexIdent, insertTimestamp.asTimestamp()); + + // Querying the catalog at rename time continues to show the collection and index exist. + assertIdentsExistAtTimestamp( + kvCatalog, collIdent, indexIdent, postRenameTimestamp.asTimestamp()); + + // Querying the catalog after the drop shows the collection and index being deleted. + assertIdentsMissingAtTimestamp( + kvCatalog, collIdent, indexIdent, postDropTime.asTimestamp()); + } +}; + +/** + * The first step of `mongo::dropDatabase` is to rename all replicated collections, generating a + * "drop collection" oplog entry. Then when those entries become majority commited, calls + * `StorageEngine::dropDatabase`. At this point, two separate code paths can perform the final + * removal of the collections from the storage engine: the reaper, or + * `[KV]StorageEngine::dropDatabase` when it is called from `mongo::dropDatabase`. This race + * exists on both primaries and secondaries. This test asserts `[KV]StorageEngine::dropDatabase` + * correctly timestamps the final drop. + */ +template <bool IsPrimary> +class KVDropDatabase : public StorageTimestampTest { +public: + void run() { + // Only run on 'wiredTiger'. No other storage engines to-date support timestamp writes. + if (mongo::storageGlobalParams.engine != "wiredTiger") { + return; + } + + auto storageInterface = repl::StorageInterface::get(_opCtx); + repl::DropPendingCollectionReaper::set( + _opCtx->getServiceContext(), + stdx::make_unique<repl::DropPendingCollectionReaper>(storageInterface)); + + auto kvStorageEngine = + dynamic_cast<KVStorageEngine*>(_opCtx->getServiceContext()->getGlobalStorageEngine()); + KVCatalog* kvCatalog = kvStorageEngine->getCatalog(); + + // Declare the database to be in a "synced" state, i.e: in steady-state replication. + Timestamp syncTime = _clock->reserveTicks(1).asTimestamp(); + invariant(!syncTime.isNull()); + kvStorageEngine->setInitialDataTimestamp(syncTime); + + // This test is dropping collections individually before following up with a + // `dropDatabase` call. This is illegal in typical replication operation as `dropDatabase` + // may find collections that haven't been renamed to a "drop-pending" + // namespace. Workaround this by operating on a separate DB from the other tests. + const NamespaceString nss("unittestsDropDB.kvDropDatabase"); + const NamespaceString sysProfile("unittestsDropDB.system.profile"); + + std::string collIdent; + std::string indexIdent; + std::string sysProfileIdent; + // `*.system.profile` does not have an `_id` index. Just create it to abide by the API. This + // value will be the empty string. Helper methods accommodate this. + std::string sysProfileIndexIdent; + for (auto& tuple : {std::tie(nss, collIdent, indexIdent), + std::tie(sysProfile, sysProfileIdent, sysProfileIndexIdent)}) { + // Save the pre-state idents so we can capture the specific idents related to collection + // creation. + std::vector<std::string> origIdents = kvCatalog->getAllIdents(_opCtx); + const auto& nss = std::get<0>(tuple); + + // Non-replicated namespaces are wrapped in an unreplicated writes block. This has the + // side-effect of not timestamping the collection creation. + repl::UnreplicatedWritesBlock notReplicated(_opCtx); + if (nss.isReplicated()) { + TimestampBlock tsBlock(_opCtx, _clock->reserveTicks(1).asTimestamp()); + reset(nss); + } else { + reset(nss); + } + + AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_X, LockMode::MODE_X); + + // Bind the local values to the variables in the parent scope. + auto& collIdent = std::get<1>(tuple); + auto& indexIdent = std::get<2>(tuple); + std::tie(collIdent, indexIdent) = getNewCollectionIndexIdent(kvCatalog, origIdents); + } + + const Timestamp postCreateTime = _clock->reserveTicks(1).asTimestamp(); + + // Assert that `kvDropDatabase` came into creation between `syncTime` and `postCreateTime`. + assertIdentsMissingAtTimestamp(kvCatalog, collIdent, indexIdent, syncTime); + assertIdentsExistAtTimestamp(kvCatalog, collIdent, indexIdent, postCreateTime); + + // `system.profile` is never timestamped. This means the creation appears to have taken + // place at the beginning of time. + assertIdentsExistAtTimestamp(kvCatalog, sysProfileIdent, sysProfileIndexIdent, syncTime); + assertIdentsExistAtTimestamp( + kvCatalog, sysProfileIdent, sysProfileIndexIdent, postCreateTime); + + AutoGetCollection coll(_opCtx, nss, LockMode::MODE_X); + { + // Drop/rename `kvDropDatabase`. `system.profile` does not get dropped/renamed. + WriteUnitOfWork wuow(_opCtx); + Database* db = coll.getDb(); + ASSERT_OK(db->dropCollection(_opCtx, nss.ns())); + wuow.commit(); + } + + // Reserve two ticks. The first represents after the rename in which the `kvDropDatabase` + // idents still exist. The second will be used by the `dropDatabase`, as that only looks + // at the clock; it does not advance it. + const Timestamp postRenameTime = _clock->reserveTicks(2).asTimestamp(); + // The namespace has changed, but the ident still exists as-is after the rename. + assertIdentsExistAtTimestamp(kvCatalog, collIdent, indexIdent, postRenameTime); + + // Primaries and secondaries call `dropDatabase` (and thus, `StorageEngine->dropDatabase`) + // in different contexts. Both contexts must end up with correct results. + if (IsPrimary) { + // Primaries call `StorageEngine->dropDatabase` outside of the WUOW that logs the + // `dropDatabase` oplog entry. It is not called in the context of a `TimestampBlock`. + + ASSERT_OK(dropDatabase(_opCtx, nss.db().toString())); + } else { + // Secondaries processing a `dropDatabase` oplog entry wrap the call in an + // UnreplicatedWritesBlock and a TimestampBlock with the oplog entry's optime. + + repl::UnreplicatedWritesBlock norep(_opCtx); + const Timestamp preDropTime = _clock->getClusterTime().asTimestamp(); + TimestampBlock dropTime(_opCtx, preDropTime); + ASSERT_OK(dropDatabase(_opCtx, nss.db().toString())); + } + + const Timestamp postDropTime = _clock->reserveTicks(1).asTimestamp(); + + // First, assert that `system.profile` never seems to have existed. + for (const auto& ts : {syncTime, postCreateTime, postDropTime}) { + assertIdentsMissingAtTimestamp(kvCatalog, sysProfileIdent, sysProfileIndexIdent, ts); + } + + // Now assert that `kvDropDatabase` still existed at `postCreateTime`, but was deleted at + // `postDropTime`. + assertIdentsExistAtTimestamp(kvCatalog, collIdent, indexIdent, postCreateTime); + assertIdentsExistAtTimestamp(kvCatalog, collIdent, indexIdent, postRenameTime); + assertIdentsMissingAtTimestamp(kvCatalog, collIdent, indexIdent, postDropTime); + } +}; + + class AllStorageTimestampTests : public unittest::Suite { public: AllStorageTimestampTests() : unittest::Suite("StorageTimestampTests") {} @@ -1211,6 +1478,10 @@ public: add<SetMinValidInitialSyncFlag>(); add<SetMinValidToAtLeast>(); add<SetMinValidAppliedThrough>(); + add<ReaperDropIsTimestamped>(); + // KVDropDatabase<IsPrimary> + add<KVDropDatabase<false>>(); + add<KVDropDatabase<true>>(); } }; |