summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDaniel Gottlieb <daniel.gottlieb@mongodb.com>2018-01-24 13:45:05 -0500
committerDaniel Gottlieb <daniel.gottlieb@mongodb.com>2018-01-24 13:45:05 -0500
commitfbf03e93dad1d2d081944c05436e777380873eb2 (patch)
tree8e2bd0d16febafcc10c0a201094967efb876c61b
parente59b03c06a034bac37435dbcdb2b631babe0f055 (diff)
downloadmongo-fbf03e93dad1d2d081944c05436e777380873eb2.tar.gz
SERVER-32251: Timestamp drop collection/database
-rw-r--r--src/mongo/db/catalog/drop_database.cpp15
-rw-r--r--src/mongo/db/namespace_string.cpp19
-rw-r--r--src/mongo/db/namespace_string.h7
-rw-r--r--src/mongo/db/repl/SConscript3
-rw-r--r--src/mongo/db/repl/storage_interface_impl.cpp5
-rw-r--r--src/mongo/db/repl/sync_tail_test.cpp6
-rw-r--r--src/mongo/db/storage/kv/SConscript3
-rw-r--r--src/mongo/db/storage/kv/kv_storage_engine.cpp149
-rw-r--r--src/mongo/db/storage/kv/kv_storage_engine.h14
-rw-r--r--src/mongo/db/storage/recovery_unit.h2
-rw-r--r--src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h2
-rw-r--r--src/mongo/dbtests/storage_timestamp_tests.cpp273
12 files changed, 483 insertions, 15 deletions
diff --git a/src/mongo/db/catalog/drop_database.cpp b/src/mongo/db/catalog/drop_database.cpp
index d28b516eb58..a613f06cc92 100644
--- a/src/mongo/db/catalog/drop_database.cpp
+++ b/src/mongo/db/catalog/drop_database.cpp
@@ -137,7 +137,7 @@ Status dropDatabase(OperationContext* opCtx, const std::string& dbName) {
auto dropPendingGuard = MakeGuard([&db, opCtx] { db->setDropPending(opCtx, false); });
std::vector<NamespaceString> collectionsToDrop;
- for (auto collection : *db) {
+ for (Collection* collection : *db) {
const auto& nss = collection->ns();
if (nss.isDropPendingNamespace() && replCoord->isReplEnabled() &&
opCtx->writesAreReplicated()) {
@@ -157,7 +157,20 @@ Status dropDatabase(OperationContext* opCtx, const std::string& dbName) {
<< " collections";
for (auto nss : collectionsToDrop) {
log() << "dropDatabase " << dbName << " - dropping collection: " << nss;
+ if (!opCtx->writesAreReplicated()) {
+ // Dropping a database on a primary replicates individual collection drops
+ // followed by a database drop oplog entry. When a secondary observes the database
+ // drop oplog entry, all of the replicated collections that were dropped must have
+ // been processed. Only non-replicated collections like `system.profile` should be
+ // left to remove. Collections with the `tmp.mr` namespace may or may not be
+ // getting replicated; be conservative and assume they are not.
+ invariant(!nss.isReplicated() || nss.coll().startsWith("tmp.mr"));
+ }
+
WriteUnitOfWork wunit(opCtx);
+ // A primary processing this will assign a timestamp when the operation is written to
+ // the oplog. As stated above, a secondary processing must only observe non-replicated
+ // collections, thus this should not be timestamped.
fassertStatusOK(40476, db->dropCollectionEvenIfSystem(opCtx, nss));
wunit.commit();
}
diff --git a/src/mongo/db/namespace_string.cpp b/src/mongo/db/namespace_string.cpp
index ba941475923..73e54524fe8 100644
--- a/src/mongo/db/namespace_string.cpp
+++ b/src/mongo/db/namespace_string.cpp
@@ -221,6 +221,25 @@ Status NamespaceString::checkLengthForRename(
return Status::OK();
}
+bool NamespaceString::isReplicated() const {
+ if (isLocal()) {
+ return false;
+ }
+
+ // Of collections not in the `local` database, only `system` collections might not be
+ // replicated.
+ if (!isSystem()) {
+ return true;
+ }
+
+ if (isSystemDotProfile()) {
+ return false;
+ }
+
+ // E.g: `system.version` is replicated.
+ return true;
+}
+
std::ostream& operator<<(std::ostream& stream, const NamespaceString& nss) {
return stream << nss.toString();
}
diff --git a/src/mongo/db/namespace_string.h b/src/mongo/db/namespace_string.h
index 87e22d2b394..21be2b4c133 100644
--- a/src/mongo/db/namespace_string.h
+++ b/src/mongo/db/namespace_string.h
@@ -221,6 +221,13 @@ public:
}
/**
+ * Returns whether a namespace is replicated, based only on its string value. One notable
+ * omission is that map reduce `tmp.mr` collections may or may not be replicated. Callers must
+ * decide how to handle that case separately.
+ */
+ bool isReplicated() const;
+
+ /**
* Returns true if cursors for this namespace are registered with the global cursor manager.
*/
bool isGloballyManagedNamespace() const {
diff --git a/src/mongo/db/repl/SConscript b/src/mongo/db/repl/SConscript
index 6b45e9e1636..adeaf4d6dad 100644
--- a/src/mongo/db/repl/SConscript
+++ b/src/mongo/db/repl/SConscript
@@ -247,6 +247,9 @@ env.Library(
'$BUILD_DIR/mongo/db/dbhelpers',
'$BUILD_DIR/mongo/db/query_exec',
],
+ LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/db/logical_clock',
+ ],
)
env.CppUnitTest(
diff --git a/src/mongo/db/repl/storage_interface_impl.cpp b/src/mongo/db/repl/storage_interface_impl.cpp
index 6a5841bb3b1..7657dae752b 100644
--- a/src/mongo/db/repl/storage_interface_impl.cpp
+++ b/src/mongo/db/repl/storage_interface_impl.cpp
@@ -61,6 +61,7 @@
#include "mongo/db/exec/update.h"
#include "mongo/db/jsobj.h"
#include "mongo/db/keypattern.h"
+#include "mongo/db/logical_clock.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/ops/delete_request.h"
#include "mongo/db/ops/parsed_update.h"
@@ -429,6 +430,10 @@ Status StorageInterfaceImpl::dropCollection(OperationContext* opCtx, const Names
if (!status.isOK()) {
return status;
}
+ if (nss.isDropPendingNamespace() && !opCtx->writesAreReplicated()) {
+ Timestamp ts = LogicalClock::get(opCtx)->getClusterTime().asTimestamp();
+ fassertStatusOK(50661, opCtx->recoveryUnit()->setTimestamp(ts));
+ }
wunit.commit();
return Status::OK();
});
diff --git a/src/mongo/db/repl/sync_tail_test.cpp b/src/mongo/db/repl/sync_tail_test.cpp
index aa2db8e14aa..aa565b57da1 100644
--- a/src/mongo/db/repl/sync_tail_test.cpp
+++ b/src/mongo/db/repl/sync_tail_test.cpp
@@ -1645,7 +1645,8 @@ TEST_F(IdempotencyTest, InsertToFCVCollectionBesidesFCVDocumentSucceeds) {
}
TEST_F(IdempotencyTest, DropDatabaseSucceeds) {
- auto ns = NamespaceString("foo.bar");
+ // Choose `system.profile` so the storage engine doesn't expect the drop to be timestamped.
+ auto ns = NamespaceString("foo.system.profile");
::mongo::repl::createCollection(_opCtx.get(), ns, CollectionOptions());
ASSERT_OK(
ReplicationCoordinator::get(_opCtx.get())->setFollowerMode(MemberState::RS_RECOVERING));
@@ -1803,7 +1804,8 @@ TEST_F(SyncTailTest, UpgradeWithNoUUIDFailsInSecondary) {
}
TEST_F(SyncTailTest, DropDatabaseSucceedsInRecovering) {
- auto ns = NamespaceString("foo.bar");
+ // Choose `system.profile` so the storage engine doesn't expect the drop to be timestamped.
+ auto ns = NamespaceString("foo.system.profile");
::mongo::repl::createCollection(_opCtx.get(), ns, CollectionOptions());
ASSERT_OK(
ReplicationCoordinator::get(_opCtx.get())->setFollowerMode(MemberState::RS_RECOVERING));
diff --git a/src/mongo/db/storage/kv/SConscript b/src/mongo/db/storage/kv/SConscript
index 617c47c993e..ec3965aab62 100644
--- a/src/mongo/db/storage/kv/SConscript
+++ b/src/mongo/db/storage/kv/SConscript
@@ -52,6 +52,9 @@ env.Library(
'$BUILD_DIR/mongo/db/storage/storage_options',
'kv_database_catalog_entry_core',
],
+ LIBDEPS_PRIVATE=[
+ '$BUILD_DIR/mongo/db/logical_clock',
+ ],
)
env.Library(
diff --git a/src/mongo/db/storage/kv/kv_storage_engine.cpp b/src/mongo/db/storage/kv/kv_storage_engine.cpp
index dc4fed07d78..f107c32a780 100644
--- a/src/mongo/db/storage/kv/kv_storage_engine.cpp
+++ b/src/mongo/db/storage/kv/kv_storage_engine.cpp
@@ -32,12 +32,14 @@
#include <algorithm>
+#include "mongo/db/logical_clock.h"
#include "mongo/db/operation_context_noop.h"
#include "mongo/db/storage/kv/kv_database_catalog_entry.h"
#include "mongo/db/storage/kv/kv_engine.h"
#include "mongo/util/assert_util.h"
#include "mongo/util/log.h"
#include "mongo/util/mongoutils/str.h"
+#include "mongo/util/scopeguard.h"
namespace mongo {
@@ -290,32 +292,156 @@ Status KVStorageEngine::dropDatabase(OperationContext* opCtx, StringData db) {
entry = it->second;
}
+ std::list<std::string> toDrop;
+ entry->getCollectionNamespaces(&toDrop);
+
+ // Partition `toDrop` into ranges of `[untimestampedCollections...,
+ // timestampedCollections...]`. All timestamped collections must have already been renamed to
+ // a drop-pending namespace. Running without replication treats all collections as not
+ // timestamped.
+ auto untimestampedDropsEnd =
+ std::partition(toDrop.begin(), toDrop.end(), [](const std::string& dropNs) {
+ return !NamespaceString(dropNs).isDropPendingNamespace();
+ });
+
+ // The primary caller (`DatabaseImpl::dropDatabase`) of this method currently
+ // `transitional_ignore`s the result. To minimize the impact of that, while also returning a
+ // correct status, attempt to drop every collection, and if there were any errors, return the
+ // first one.
+ Status firstError = Status::OK();
+
+ // First drop the "non-timestamped" collections. "Non-timestamped" collections such as user
+ // collections in `local` or `system.profile` do not get rolled back. This means we also
+ // should not rollback their creation or deletion. To achieve that, the code takes care to
+ // suppress any timestamping state.
+ firstError = _dropCollectionsNoTimestamp(opCtx, entry, toDrop.begin(), untimestampedDropsEnd);
+
+ // Now drop any leftover timestamped collections (i.e: not already dropped by the reaper). On
+ // secondaries there is already a `commit timestamp` set and these drops inherit the timestamp
+ // of the `dropDatabase` oplog entry. On primaries, we look at the logical clock and set the
+ // commit timestamp state.
+ //
+ // Additionally, before returning, this method will remove the `KVDatabaseCatalogEntry` from
+ // the `_dbs` map. This action creates a new constraint that this "timestamped drop" method
+ // must happen after the "non-timestamped drops".
+ auto status =
+ _dropCollectionsWithTimestamp(opCtx, entry, toDrop, untimestampedDropsEnd, toDrop.end());
+ if (firstError.isOK()) {
+ firstError = status;
+ }
+
+ return firstError;
+}
+
+/**
+ * Returns the first `dropCollection` error that this method encounters. This method will attempt
+ * to drop all collections, regardless of the error status.
+ */
+Status KVStorageEngine::_dropCollectionsNoTimestamp(OperationContext* opCtx,
+ KVDatabaseCatalogEntryBase* dbce,
+ CollIter begin,
+ CollIter end) {
+ // On primaries, this method will be called outside of any `TimestampBlock` state meaning the
+ // "commit timestamp" will not be set. For this case, this method needs no special logic to
+ // avoid timestamping the upcoming writes.
+ //
+ // On secondaries, there will be a wrapping `TimestampBlock` and the "commit timestamp" will
+ // be set. Carefully save that to the side so the following writes can go through without that
+ // context.
+ const Timestamp commitTs = opCtx->recoveryUnit()->getCommitTimestamp();
+ if (!commitTs.isNull()) {
+ opCtx->recoveryUnit()->clearCommitTimestamp();
+ }
+
+ // Ensure the method exits with the same "commit timestamp" state that it was called with.
+ auto addCommitTimestamp = MakeGuard([&opCtx, commitTs] {
+ if (!commitTs.isNull()) {
+ opCtx->recoveryUnit()->setCommitTimestamp(commitTs);
+ }
+ });
+
+ Status firstError = Status::OK();
+ WriteUnitOfWork untimestampedDropWuow(opCtx);
+ for (auto toDrop = begin; toDrop != end; ++toDrop) {
+ std::string coll = *toDrop;
+ NamespaceString nss(coll);
+
+ // When in steady state replication and after filtering out drop-pending namespaces, the
+ // only collections that may show up here are either 1) not replicated 2) `tmp.mr`.
+ if (_initialDataTimestamp != Timestamp::kAllowUnstableCheckpointsSentinel) {
+ invariant(!nss.isReplicated() || nss.coll().startsWith("tmp.mr"),
+ str::stream() << "Collection drop is not being timestamped. Namespace: "
+ << nss.ns());
+ }
+
+ Status result = dbce->dropCollection(opCtx, coll);
+ if (!result.isOK() && firstError.isOK()) {
+ firstError = result;
+ }
+ }
+
+ untimestampedDropWuow.commit();
+ return firstError;
+}
+
+Status KVStorageEngine::_dropCollectionsWithTimestamp(OperationContext* opCtx,
+ KVDatabaseCatalogEntryBase* dbce,
+ std::list<std::string>& toDrop,
+ CollIter begin,
+ CollIter end) {
+ // On primaries, these collection drops are performed in a separate WUOW than the insertion of
+ // the `dropDatabase` oplog entry. In this case, we expect the `existingCommitTs` to be null
+ // and the code looks at the logical clock to assign a timestamp to the writes.
+ //
+ // Secondaries reach this from within a `TimestampBlock` where there should be a non-null
+ // `existingCommitTs`.
+ const Timestamp existingCommitTs = opCtx->recoveryUnit()->getCommitTimestamp();
+
+ // `LogicalClock`s on standalones and master/slave do not necessarily return real
+ // optimes. Assume it's safe to not timestamp the write.
+ const Timestamp chosenCommitTs = LogicalClock::get(opCtx)->getClusterTime().asTimestamp();
+ const bool setCommitTs = existingCommitTs.isNull() && !chosenCommitTs.isNull();
+ if (setCommitTs) {
+ opCtx->recoveryUnit()->setCommitTimestamp(chosenCommitTs);
+ }
+
+ // Ensure the method exits with the same "commit timestamp" state that it was called with.
+ auto removeCommitTimestamp = MakeGuard([&opCtx, setCommitTs] {
+ if (setCommitTs) {
+ opCtx->recoveryUnit()->clearCommitTimestamp();
+ }
+ });
+
// This is called outside of a WUOW since MMAPv1 has unfortunate behavior around dropping
// databases. We need to create one here since we want db dropping to all-or-nothing
// wherever possible. Eventually we want to move this up so that it can include the logOp
// inside of the WUOW, but that would require making DB dropping happen inside the Dur
// system for MMAPv1.
- WriteUnitOfWork wuow(opCtx);
+ WriteUnitOfWork timestampedDropWuow(opCtx);
- std::list<std::string> toDrop;
- entry->getCollectionNamespaces(&toDrop);
+ Status firstError = Status::OK();
+ for (auto toDropStr = begin; toDropStr != toDrop.end(); ++toDropStr) {
+ std::string coll = *toDropStr;
+ NamespaceString nss(coll);
- for (std::list<std::string>::iterator it = toDrop.begin(); it != toDrop.end(); ++it) {
- string coll = *it;
- entry->dropCollection(opCtx, coll).transitional_ignore();
+ Status result = dbce->dropCollection(opCtx, coll);
+ if (!result.isOK() && firstError.isOK()) {
+ firstError = result;
+ }
}
+
toDrop.clear();
- entry->getCollectionNamespaces(&toDrop);
+ dbce->getCollectionNamespaces(&toDrop);
invariant(toDrop.empty());
{
stdx::lock_guard<stdx::mutex> lk(_dbsLock);
- opCtx->recoveryUnit()->registerChange(new RemoveDBChange(this, db, entry));
- _dbs.erase(db.toString());
+ opCtx->recoveryUnit()->registerChange(new RemoveDBChange(this, dbce->name(), dbce));
+ _dbs.erase(dbce->name());
}
- wuow.commit();
- return Status::OK();
+ timestampedDropWuow.commit();
+ return firstError;
}
int KVStorageEngine::flushAllFiles(OperationContext* opCtx, bool sync) {
@@ -369,6 +495,7 @@ void KVStorageEngine::setStableTimestamp(Timestamp stableTimestamp) {
}
void KVStorageEngine::setInitialDataTimestamp(Timestamp initialDataTimestamp) {
+ _initialDataTimestamp = initialDataTimestamp;
_engine->setInitialDataTimestamp(initialDataTimestamp);
}
diff --git a/src/mongo/db/storage/kv/kv_storage_engine.h b/src/mongo/db/storage/kv/kv_storage_engine.h
index aca5e6ad6b6..6f70c588924 100644
--- a/src/mongo/db/storage/kv/kv_storage_engine.h
+++ b/src/mongo/db/storage/kv/kv_storage_engine.h
@@ -152,6 +152,19 @@ public:
OperationContext* opCtx) override;
private:
+ using CollIter = std::list<std::string>::iterator;
+
+ Status _dropCollectionsNoTimestamp(OperationContext* opCtx,
+ KVDatabaseCatalogEntryBase* dbce,
+ CollIter begin,
+ CollIter end);
+
+ Status _dropCollectionsWithTimestamp(OperationContext* opCtx,
+ KVDatabaseCatalogEntryBase* dbce,
+ std::list<std::string>& toDrop,
+ CollIter begin,
+ CollIter end);
+
class RemoveDBChange;
stdx::function<KVDatabaseCatalogEntryFactory> _databaseCatalogEntryFactory;
@@ -163,6 +176,7 @@ private:
const bool _supportsDocLocking;
const bool _supportsDBLocking;
+ Timestamp _initialDataTimestamp = Timestamp::kAllowUnstableCheckpointsSentinel;
std::unique_ptr<RecordStore> _catalogRecordStore;
std::unique_ptr<KVCatalog> _catalog;
diff --git a/src/mongo/db/storage/recovery_unit.h b/src/mongo/db/storage/recovery_unit.h
index 09875eeb22e..48d1e83ece4 100644
--- a/src/mongo/db/storage/recovery_unit.h
+++ b/src/mongo/db/storage/recovery_unit.h
@@ -161,7 +161,9 @@ public:
* must not be called while a commit timestamp is set.
*/
virtual void setCommitTimestamp(Timestamp timestamp) {}
+
virtual void clearCommitTimestamp() {}
+
virtual Timestamp getCommitTimestamp() {
return {};
}
diff --git a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
index 062bc8f7504..feccabf38e4 100644
--- a/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
+++ b/src/mongo/db/storage/wiredtiger/wiredtiger_recovery_unit.h
@@ -84,7 +84,9 @@ public:
Status setTimestamp(Timestamp timestamp) override;
void setCommitTimestamp(Timestamp timestamp) override;
+
void clearCommitTimestamp() override;
+
Timestamp getCommitTimestamp() override;
Status selectSnapshot(Timestamp timestamp) override;
diff --git a/src/mongo/dbtests/storage_timestamp_tests.cpp b/src/mongo/dbtests/storage_timestamp_tests.cpp
index 0e04a261e92..352b04051bf 100644
--- a/src/mongo/dbtests/storage_timestamp_tests.cpp
+++ b/src/mongo/dbtests/storage_timestamp_tests.cpp
@@ -33,8 +33,10 @@
#include "mongo/bson/simple_bsonobj_comparator.h"
#include "mongo/bson/timestamp.h"
#include "mongo/db/catalog/collection.h"
+#include "mongo/db/catalog/drop_database.h"
#include "mongo/db/catalog/index_catalog.h"
#include "mongo/db/catalog/index_create.h"
+#include "mongo/db/catalog/uuid_catalog.h"
#include "mongo/db/client.h"
#include "mongo/db/concurrency/write_conflict_exception.h"
#include "mongo/db/db.h"
@@ -44,7 +46,9 @@
#include "mongo/db/index/index_descriptor.h"
#include "mongo/db/logical_clock.h"
#include "mongo/db/op_observer_impl.h"
+#include "mongo/db/op_observer_registry.h"
#include "mongo/db/repl/apply_ops.h"
+#include "mongo/db/repl/drop_pending_collection_reaper.h"
#include "mongo/db/repl/oplog.h"
#include "mongo/db/repl/oplog_entry.h"
#include "mongo/db/repl/optime.h"
@@ -53,6 +57,7 @@
#include "mongo/db/repl/replication_coordinator_global.h"
#include "mongo/db/repl/replication_coordinator_mock.h"
#include "mongo/db/repl/storage_interface_impl.h"
+#include "mongo/db/repl/timestamp_block.h"
#include "mongo/db/service_context.h"
#include "mongo/db/storage/kv/kv_storage_engine.h"
#include "mongo/unittest/unittest.h"
@@ -96,7 +101,10 @@ public:
// to avoid the invariant in ReplClientInfo::setLastOp that the optime only goes forward.
repl::ReplClientInfo::forClient(_opCtx->getClient()).clearLastOp_forTest();
- getGlobalServiceContext()->setOpObserver(stdx::make_unique<OpObserverImpl>());
+ auto registry = stdx::make_unique<OpObserverRegistry>();
+ registry->addObserver(stdx::make_unique<UUIDCatalogObserver>());
+ registry->addObserver(stdx::make_unique<OpObserverImpl>());
+ _opCtx->getServiceContext()->setOpObserver(std::move(registry));
repl::setOplogCollectionName(getGlobalServiceContext());
repl::createOplog(_opCtx);
@@ -291,6 +299,58 @@ public:
<< " when it should not have been.";
}
}
+
+ std::tuple<std::string, std::string> getNewCollectionIndexIdent(
+ KVCatalog* kvCatalog, std::vector<std::string>& origIdents) {
+ // Find the collection and index ident by performing a set difference on the original
+ // idents and the current idents.
+ std::vector<std::string> identsWithColl = kvCatalog->getAllIdents(_opCtx);
+ std::sort(origIdents.begin(), origIdents.end());
+ std::sort(identsWithColl.begin(), identsWithColl.end());
+ std::vector<std::string> collAndIdxIdents;
+ std::set_difference(identsWithColl.begin(),
+ identsWithColl.end(),
+ origIdents.begin(),
+ origIdents.end(),
+ std::back_inserter(collAndIdxIdents));
+
+ ASSERT(collAndIdxIdents.size() == 1 || collAndIdxIdents.size() == 2);
+ if (collAndIdxIdents.size() == 1) {
+ // `system.profile` collections do not have an `_id` index.
+ return std::tie(collAndIdxIdents[0], "");
+ }
+ if (collAndIdxIdents.size() == 2) {
+ // The idents are sorted, so the `collection-...` comes before `index-...`
+ return std::tie(collAndIdxIdents[0], collAndIdxIdents[1]);
+ }
+
+ MONGO_UNREACHABLE;
+ }
+
+ void assertIdentsExistAtTimestamp(KVCatalog* kvCatalog,
+ const std::string& collIdent,
+ const std::string& indexIdent,
+ Timestamp timestamp) {
+ WriteUnitOfWork wuow(_opCtx);
+ ASSERT_OK(_opCtx->recoveryUnit()->selectSnapshot(timestamp));
+ auto allIdents = kvCatalog->getAllIdents(_opCtx);
+ ASSERT(std::find(allIdents.begin(), allIdents.end(), collIdent) != allIdents.end());
+ if (indexIdent.size() > 0) {
+ // `system.profile` does not have an `_id` index.
+ ASSERT(std::find(allIdents.begin(), allIdents.end(), indexIdent) != allIdents.end());
+ }
+ }
+
+ void assertIdentsMissingAtTimestamp(KVCatalog* kvCatalog,
+ const std::string& collIdent,
+ const std::string& indexIdent,
+ Timestamp timestamp) {
+ WriteUnitOfWork wuow(_opCtx);
+ ASSERT_OK(_opCtx->recoveryUnit()->selectSnapshot(timestamp));
+ auto allIdents = kvCatalog->getAllIdents(_opCtx);
+ ASSERT(std::find(allIdents.begin(), allIdents.end(), collIdent) == allIdents.end());
+ ASSERT(std::find(allIdents.begin(), allIdents.end(), indexIdent) == allIdents.end());
+ }
};
class SecondaryInsertTimes : public StorageTimestampTest {
@@ -1192,6 +1252,213 @@ public:
}
};
+class ReaperDropIsTimestamped : public StorageTimestampTest {
+public:
+ void run() {
+ // Only run on 'wiredTiger'. No other storage engines to-date support timestamp writes.
+ if (mongo::storageGlobalParams.engine != "wiredTiger") {
+ return;
+ }
+
+ auto storageInterface = repl::StorageInterface::get(_opCtx);
+ repl::DropPendingCollectionReaper::set(
+ _opCtx->getServiceContext(),
+ stdx::make_unique<repl::DropPendingCollectionReaper>(storageInterface));
+ auto reaper = repl::DropPendingCollectionReaper::get(_opCtx);
+
+ auto kvStorageEngine =
+ dynamic_cast<KVStorageEngine*>(_opCtx->getServiceContext()->getGlobalStorageEngine());
+ KVCatalog* kvCatalog = kvStorageEngine->getCatalog();
+
+ // Save the pre-state idents so we can capture the specific idents related to collection
+ // creation.
+ std::vector<std::string> origIdents = kvCatalog->getAllIdents(_opCtx);
+
+ NamespaceString nss("unittests.reaperDropIsTimestamped");
+ reset(nss);
+
+ AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_X, LockMode::MODE_X);
+
+ const LogicalTime insertTimestamp = _clock->reserveTicks(1);
+ {
+ WriteUnitOfWork wuow(_opCtx);
+ insertDocument(autoColl.getCollection(),
+ InsertStatement(BSON("_id" << 0), insertTimestamp.asTimestamp(), 0LL));
+ wuow.commit();
+ ASSERT_EQ(1, itCount(autoColl.getCollection()));
+ }
+
+ // The KVCatalog only adheres to timestamp requests on `getAllIdents`. To know the right
+ // collection/index that gets removed on a drop, we must capture the randomized "ident"
+ // string for the target collection and index.
+ std::string collIdent;
+ std::string indexIdent;
+ std::tie(collIdent, indexIdent) = getNewCollectionIndexIdent(kvCatalog, origIdents);
+
+ // The first phase of a drop in a replica set is to perform a rename. This does not change
+ // the ident values.
+ {
+ WriteUnitOfWork wuow(_opCtx);
+ Database* db = autoColl.getDb();
+ ASSERT_OK(db->dropCollection(_opCtx, nss.ns()));
+ wuow.commit();
+ }
+
+ // Bump the clock two. The drop will get the second tick. The first tick will identify a
+ // snapshot of the data with the collection renamed.
+ const LogicalTime postRenameTimestamp = _clock->reserveTicks(2);
+
+ // Actually drop the collection, propagating to the KVCatalog. This drop will be
+ // timestamped at the logical clock value.
+ reaper->dropCollectionsOlderThan(
+ _opCtx, repl::OpTime(_clock->getClusterTime().asTimestamp(), presentTerm));
+ const LogicalTime postDropTime = _clock->reserveTicks(1);
+
+ // Querying the catalog at insert time shows the collection and index existing.
+ assertIdentsExistAtTimestamp(
+ kvCatalog, collIdent, indexIdent, insertTimestamp.asTimestamp());
+
+ // Querying the catalog at rename time continues to show the collection and index exist.
+ assertIdentsExistAtTimestamp(
+ kvCatalog, collIdent, indexIdent, postRenameTimestamp.asTimestamp());
+
+ // Querying the catalog after the drop shows the collection and index being deleted.
+ assertIdentsMissingAtTimestamp(
+ kvCatalog, collIdent, indexIdent, postDropTime.asTimestamp());
+ }
+};
+
+/**
+ * The first step of `mongo::dropDatabase` is to rename all replicated collections, generating a
+ * "drop collection" oplog entry. Then when those entries become majority commited, calls
+ * `StorageEngine::dropDatabase`. At this point, two separate code paths can perform the final
+ * removal of the collections from the storage engine: the reaper, or
+ * `[KV]StorageEngine::dropDatabase` when it is called from `mongo::dropDatabase`. This race
+ * exists on both primaries and secondaries. This test asserts `[KV]StorageEngine::dropDatabase`
+ * correctly timestamps the final drop.
+ */
+template <bool IsPrimary>
+class KVDropDatabase : public StorageTimestampTest {
+public:
+ void run() {
+ // Only run on 'wiredTiger'. No other storage engines to-date support timestamp writes.
+ if (mongo::storageGlobalParams.engine != "wiredTiger") {
+ return;
+ }
+
+ auto storageInterface = repl::StorageInterface::get(_opCtx);
+ repl::DropPendingCollectionReaper::set(
+ _opCtx->getServiceContext(),
+ stdx::make_unique<repl::DropPendingCollectionReaper>(storageInterface));
+
+ auto kvStorageEngine =
+ dynamic_cast<KVStorageEngine*>(_opCtx->getServiceContext()->getGlobalStorageEngine());
+ KVCatalog* kvCatalog = kvStorageEngine->getCatalog();
+
+ // Declare the database to be in a "synced" state, i.e: in steady-state replication.
+ Timestamp syncTime = _clock->reserveTicks(1).asTimestamp();
+ invariant(!syncTime.isNull());
+ kvStorageEngine->setInitialDataTimestamp(syncTime);
+
+ // This test is dropping collections individually before following up with a
+ // `dropDatabase` call. This is illegal in typical replication operation as `dropDatabase`
+ // may find collections that haven't been renamed to a "drop-pending"
+ // namespace. Workaround this by operating on a separate DB from the other tests.
+ const NamespaceString nss("unittestsDropDB.kvDropDatabase");
+ const NamespaceString sysProfile("unittestsDropDB.system.profile");
+
+ std::string collIdent;
+ std::string indexIdent;
+ std::string sysProfileIdent;
+ // `*.system.profile` does not have an `_id` index. Just create it to abide by the API. This
+ // value will be the empty string. Helper methods accommodate this.
+ std::string sysProfileIndexIdent;
+ for (auto& tuple : {std::tie(nss, collIdent, indexIdent),
+ std::tie(sysProfile, sysProfileIdent, sysProfileIndexIdent)}) {
+ // Save the pre-state idents so we can capture the specific idents related to collection
+ // creation.
+ std::vector<std::string> origIdents = kvCatalog->getAllIdents(_opCtx);
+ const auto& nss = std::get<0>(tuple);
+
+ // Non-replicated namespaces are wrapped in an unreplicated writes block. This has the
+ // side-effect of not timestamping the collection creation.
+ repl::UnreplicatedWritesBlock notReplicated(_opCtx);
+ if (nss.isReplicated()) {
+ TimestampBlock tsBlock(_opCtx, _clock->reserveTicks(1).asTimestamp());
+ reset(nss);
+ } else {
+ reset(nss);
+ }
+
+ AutoGetCollection autoColl(_opCtx, nss, LockMode::MODE_X, LockMode::MODE_X);
+
+ // Bind the local values to the variables in the parent scope.
+ auto& collIdent = std::get<1>(tuple);
+ auto& indexIdent = std::get<2>(tuple);
+ std::tie(collIdent, indexIdent) = getNewCollectionIndexIdent(kvCatalog, origIdents);
+ }
+
+ const Timestamp postCreateTime = _clock->reserveTicks(1).asTimestamp();
+
+ // Assert that `kvDropDatabase` came into creation between `syncTime` and `postCreateTime`.
+ assertIdentsMissingAtTimestamp(kvCatalog, collIdent, indexIdent, syncTime);
+ assertIdentsExistAtTimestamp(kvCatalog, collIdent, indexIdent, postCreateTime);
+
+ // `system.profile` is never timestamped. This means the creation appears to have taken
+ // place at the beginning of time.
+ assertIdentsExistAtTimestamp(kvCatalog, sysProfileIdent, sysProfileIndexIdent, syncTime);
+ assertIdentsExistAtTimestamp(
+ kvCatalog, sysProfileIdent, sysProfileIndexIdent, postCreateTime);
+
+ AutoGetCollection coll(_opCtx, nss, LockMode::MODE_X);
+ {
+ // Drop/rename `kvDropDatabase`. `system.profile` does not get dropped/renamed.
+ WriteUnitOfWork wuow(_opCtx);
+ Database* db = coll.getDb();
+ ASSERT_OK(db->dropCollection(_opCtx, nss.ns()));
+ wuow.commit();
+ }
+
+ // Reserve two ticks. The first represents after the rename in which the `kvDropDatabase`
+ // idents still exist. The second will be used by the `dropDatabase`, as that only looks
+ // at the clock; it does not advance it.
+ const Timestamp postRenameTime = _clock->reserveTicks(2).asTimestamp();
+ // The namespace has changed, but the ident still exists as-is after the rename.
+ assertIdentsExistAtTimestamp(kvCatalog, collIdent, indexIdent, postRenameTime);
+
+ // Primaries and secondaries call `dropDatabase` (and thus, `StorageEngine->dropDatabase`)
+ // in different contexts. Both contexts must end up with correct results.
+ if (IsPrimary) {
+ // Primaries call `StorageEngine->dropDatabase` outside of the WUOW that logs the
+ // `dropDatabase` oplog entry. It is not called in the context of a `TimestampBlock`.
+
+ ASSERT_OK(dropDatabase(_opCtx, nss.db().toString()));
+ } else {
+ // Secondaries processing a `dropDatabase` oplog entry wrap the call in an
+ // UnreplicatedWritesBlock and a TimestampBlock with the oplog entry's optime.
+
+ repl::UnreplicatedWritesBlock norep(_opCtx);
+ const Timestamp preDropTime = _clock->getClusterTime().asTimestamp();
+ TimestampBlock dropTime(_opCtx, preDropTime);
+ ASSERT_OK(dropDatabase(_opCtx, nss.db().toString()));
+ }
+
+ const Timestamp postDropTime = _clock->reserveTicks(1).asTimestamp();
+
+ // First, assert that `system.profile` never seems to have existed.
+ for (const auto& ts : {syncTime, postCreateTime, postDropTime}) {
+ assertIdentsMissingAtTimestamp(kvCatalog, sysProfileIdent, sysProfileIndexIdent, ts);
+ }
+
+ // Now assert that `kvDropDatabase` still existed at `postCreateTime`, but was deleted at
+ // `postDropTime`.
+ assertIdentsExistAtTimestamp(kvCatalog, collIdent, indexIdent, postCreateTime);
+ assertIdentsExistAtTimestamp(kvCatalog, collIdent, indexIdent, postRenameTime);
+ assertIdentsMissingAtTimestamp(kvCatalog, collIdent, indexIdent, postDropTime);
+ }
+};
+
+
class AllStorageTimestampTests : public unittest::Suite {
public:
AllStorageTimestampTests() : unittest::Suite("StorageTimestampTests") {}
@@ -1211,6 +1478,10 @@ public:
add<SetMinValidInitialSyncFlag>();
add<SetMinValidToAtLeast>();
add<SetMinValidAppliedThrough>();
+ add<ReaperDropIsTimestamped>();
+ // KVDropDatabase<IsPrimary>
+ add<KVDropDatabase<false>>();
+ add<KVDropDatabase<true>>();
}
};