author     Nathan Myers <nathan.myers@10gen.com>    2017-05-25 18:00:24 -0400
committer  Nathan Myers <nathan.myers@10gen.com>    2017-06-01 17:12:20 -0400
commit     6bf062db95d69f06b02da708b54f5e3efbb9dfa5 (patch)
tree       6b3d37fd60fbd601351928289898461a592db5d9
parent     12ac469b6d79dc2169fd45ab2befff62295b87c8 (diff)
download   mongo-6bf062db95d69f06b02da708b54f5e3efbb9dfa5.tar.gz
SERVER-29342 CollectionShardState/RangeDeleter support for safe secondary reads
-rw-r--r--  jstests/core/set1.js                                2
-rw-r--r--  jstests/core/update_replace.js                      8
-rw-r--r--  jstests/core/upsert_fields.js                       2
-rw-r--r--  src/mongo/db/exec/update.cpp                        6
-rw-r--r--  src/mongo/db/s/collection_range_deleter.cpp        57
-rw-r--r--  src/mongo/db/s/collection_range_deleter.h          37
-rw-r--r--  src/mongo/db/s/collection_range_deleter_test.cpp  131
-rw-r--r--  src/mongo/db/s/collection_sharding_state.cpp        6
-rw-r--r--  src/mongo/db/s/collection_sharding_state.h         14
-rw-r--r--  src/mongo/db/s/metadata_manager.cpp               157
-rw-r--r--  src/mongo/db/s/metadata_manager.h                  42
-rw-r--r--  src/mongo/db/s/metadata_manager_test.cpp           33
12 files changed, 343 insertions, 152 deletions
diff --git a/jstests/core/set1.js b/jstests/core/set1.js
index 33840e3f431..bae41fc5803 100644
--- a/jstests/core/set1.js
+++ b/jstests/core/set1.js
@@ -4,4 +4,4 @@ t.drop();
t.insert({_id: 1, emb: {}});
t.update({_id: 1}, {$set: {emb: {'a.dot': 'data'}}});
-assert.eq({_id: 1, emb: {}}, t.findOne(), "A");
+assert.eq({_id: 1, emb: {"a.dot": 'data'}}, t.findOne(), "A");
diff --git a/jstests/core/update_replace.js b/jstests/core/update_replace.js
index 44099851ef4..e62d03eb09f 100644
--- a/jstests/core/update_replace.js
+++ b/jstests/core/update_replace.js
@@ -13,13 +13,13 @@ var res;
// Bypass validation in shell so we can test the server.
conn._skipValidation = true;
-// Should not allow "." in field names
+// Allow "." in field names
res = t.save({_id: 1, "a.a": 1});
-assert(res.hasWriteError(), "a.a");
+assert(!res.hasWriteError(), "a.a");
-// Should not allow "." in field names, embedded
+// Allow "." in field names, embedded
res = t.save({_id: 1, a: {"a.a": 1}});
-assert(res.hasWriteError(), "a: a.a");
+assert(!res.hasWriteError(), "a: a.a");
// Should not allow "$"-prefixed field names, caught before "." check
res = t.save({_id: 1, $a: {"a.a": 1}});
diff --git a/jstests/core/upsert_fields.js b/jstests/core/upsert_fields.js
index 311d6984ce9..490625c23db 100644
--- a/jstests/core/upsert_fields.js
+++ b/jstests/core/upsert_fields.js
@@ -154,6 +154,7 @@ for (var i = 0; i < 3; i++) {
assert.eq(value, upsertedXVal({$or: [{x: {$eq: 1}}]}, expr));
// Special types extracted
assert.eq(isReplStyle ? undefined : [1, 2], upsertedXVal({x: [1, 2]}, expr));
+ assert.eq({'x.x': 1}, upsertedXVal({x: {'x.x': 1}}, expr));
// field not extracted
assert.eq(undefined, upsertedXVal({x: {$gt: 1}}, expr));
@@ -172,7 +173,6 @@ for (var i = 0; i < 3; i++) {
assert.writeError(upsertedResult({x: undefined}, expr));
if (!isReplStyle) {
- assert.writeError(upsertedResult({x: {'x.x': 1}}, expr));
assert.writeError(upsertedResult({x: {$all: [1, 2]}}, expr));
assert.writeError(upsertedResult({$and: [{x: 1}, {x: 1}]}, expr));
assert.writeError(upsertedResult({$and: [{x: {$eq: 1}}, {x: 2}]}, expr));
diff --git a/src/mongo/db/exec/update.cpp b/src/mongo/db/exec/update.cpp
index 9bcc5460290..90cd46ab423 100644
--- a/src/mongo/db/exec/update.cpp
+++ b/src/mongo/db/exec/update.cpp
@@ -254,12 +254,6 @@ StatusWith<std::uint32_t> storageValid(const mb::ConstElement& elem,
Status status = validateDollarPrefixElement(elem, deep);
if (!status.isOK())
return status;
- } else if (fieldName.find(".") != string::npos) {
- // Field name cannot have a "." in it.
- return Status(ErrorCodes::DottedFieldName,
- str::stream() << "The dotted field '" << elem.getFieldName() << "' in '"
- << mb::getFullName(elem)
- << "' is not valid for storage.");
}
}
diff --git a/src/mongo/db/s/collection_range_deleter.cpp b/src/mongo/db/s/collection_range_deleter.cpp
index df690affe3d..2392305fe5d 100644
--- a/src/mongo/db/s/collection_range_deleter.cpp
+++ b/src/mongo/db/s/collection_range_deleter.cpp
@@ -76,13 +76,16 @@ const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority,
CollectionRangeDeleter::~CollectionRangeDeleter() {
// notify anybody still sleeping on orphan ranges
clear(Status{ErrorCodes::InterruptedDueToReplStateChange,
- "Collection sharding metadata destroyed"});
+ "Collection sharding metadata discarded"});
}
-bool CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx,
+auto CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx,
NamespaceString const& nss,
+ Action action,
int maxToDelete,
- CollectionRangeDeleter* rangeDeleterForTestOnly) {
+ CollectionRangeDeleter* forTestOnly) -> Action {
+
+ invariant(action != Action::kFinished);
StatusWith<int> wrote = 0;
auto range = boost::optional<ChunkRange>(boost::none);
auto notification = DeleteNotification();
@@ -92,28 +95,55 @@ bool CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx,
auto* css = CollectionShardingState::get(opCtx, nss);
{
auto scopedCollectionMetadata = css->getMetadata();
- if ((!collection || !scopedCollectionMetadata) && !rangeDeleterForTestOnly) {
- log() << "Abandoning range deletions in collection " << nss.ns()
- << " left over from sharded state";
+ if ((!collection || !scopedCollectionMetadata) && !forTestOnly) {
+ log() << "Abandoning range deletions left over from previously sharded collection"
+ << nss.ns();
stdx::lock_guard<stdx::mutex> lk(css->_metadataManager->_managerLock);
css->_metadataManager->_clearAllCleanups();
- return false; // collection was unsharded
+ return Action::kFinished;
}
// We don't actually know if this is the same collection that we were originally
// scheduled to do deletions on, or another one with the same name. But it doesn't
- // matter: if it has deletions scheduled, now is as good a time as any to do them.
- auto self = rangeDeleterForTestOnly ? rangeDeleterForTestOnly
- : &css->_metadataManager->_rangesToClean;
+ // matter: if it has a record of deletions scheduled, now is as good a time as any
+ // to do them.
+
+ auto self = forTestOnly ? forTestOnly : &css->_metadataManager->_rangesToClean;
{
stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock);
if (self->isEmpty())
- return false;
+ return Action::kFinished;
const auto& frontRange = self->_orphans.front().range;
range.emplace(frontRange.getMin().getOwned(), frontRange.getMax().getOwned());
notification = self->_orphans.front().notification;
}
+ invariant(range);
+
+ if (action == Action::kWriteOpLog) {
+ // clang-format off
+ // Secondaries will watch for this update, and kill any queries that may depend on
+ // documents in the range -- excepting any queries with a read-concern option
+ // 'ignoreChunkMigration'
+ try {
+ auto& adminSystemVersion = NamespaceString::kConfigCollectionNamespace;
+ auto epoch = scopedCollectionMetadata->getCollVersion().epoch();
+ AutoGetCollection autoAdmin(opCtx, adminSystemVersion, MODE_IX);
+
+ Helpers::upsert(opCtx, adminSystemVersion.ns(),
+ BSON("_id" << "startRangeDeletion" << "ns" << nss.ns() << "epoch" << epoch
+ << "min" << range->getMin() << "max" << range->getMax()));
+
+ } catch (DBException const& e) {
+ stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock);
+ css->_metadataManager->_clearAllCleanups(
+ {ErrorCodes::fromInt(e.getCode()),
+ str::stream() << "cannot push startRangeDeletion record to Op Log,"
+ " abandoning scheduled range deletions: " << e.what()});
+ return Action::kFinished;
+ }
+ // clang-format on
+ }
try {
auto keyPattern = scopedCollectionMetadata->getKeyPattern();
@@ -123,7 +153,6 @@ bool CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx,
wrote = e.toStatus();
warning() << e.what();
}
-
if (!wrote.isOK() || wrote.getValue() == 0) {
if (wrote.isOK()) {
log() << "No documents remain to delete in " << nss << " range "
@@ -131,7 +160,7 @@ bool CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx,
}
stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock);
self->_pop(wrote.getStatus());
- return true;
+ return Action::kWriteOpLog;
}
} // drop scopedCollectionMetadata
} // drop autoColl
@@ -174,7 +203,7 @@ bool CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx,
}
notification.abandon();
- return true;
+ return Action::kMore;
}
StatusWith<int> CollectionRangeDeleter::_doDeletion(OperationContext* opCtx,
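
The Action values returned above drive rescheduling: each pass hands back the Action to feed into the next pass, and a pass entered with kWriteOpLog upserts the startRangeDeletion marker before deleting anything, so secondaries can invalidate queries that depend on the range. The sketch below is a minimal, standalone model of that loop; the Action names match the patch, but everything else (the function, the counter, main) is illustrative, not MongoDB code.

    #include <algorithm>
    #include <iostream>

    // Standalone model of the Action protocol; not the real MongoDB classes.
    enum class Action { kFinished, kMore, kWriteOpLog };

    // Stand-in for CollectionRangeDeleter::cleanUpNextRange(): pretends one scheduled
    // range holds `docsLeft` documents and deletes up to `maxToDelete` per pass.
    Action cleanUpNextRangeModel(int& docsLeft, Action action, int maxToDelete) {
        if (docsLeft < 0)
            return Action::kFinished;  // nothing (left) scheduled
        if (action == Action::kWriteOpLog) {
            // A real pass would upsert the startRangeDeletion record here, so that
            // secondaries can kill queries reading documents in the range.
            std::cout << "upsert startRangeDeletion marker\n";
        }
        if (docsLeft == 0) {
            docsLeft = -1;               // range now empty: pop it, notify waiters
            return Action::kWriteOpLog;  // next pass starts on a new range, if any
        }
        docsLeft -= std::min(docsLeft, maxToDelete);
        return Action::kMore;  // more documents may remain in this range
    }

    int main() {
        int docsLeft = 5;                   // one scheduled range with 5 documents
        Action next = Action::kWriteOpLog;  // a fresh schedule always starts here
        while (next != Action::kFinished)
            next = cleanUpNextRangeModel(docsLeft, next, /*maxToDelete=*/2);
    }

Feeding the returned value back in, as MetadataManager::_scheduleCleanup does further down, is what keeps the marker from being rewritten on every intermediate kMore pass.
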
diff --git a/src/mongo/db/s/collection_range_deleter.h b/src/mongo/db/s/collection_range_deleter.h
index a0a6625ba02..52d3ec44256 100644
--- a/src/mongo/db/s/collection_range_deleter.h
+++ b/src/mongo/db/s/collection_range_deleter.h
@@ -46,7 +46,8 @@ public:
/**
* This is an object n that asynchronously changes state when a scheduled range deletion
* completes or fails. Call n.ready() to discover if the event has already occurred. Call
- * n.waitStatus(opCtx) to sleep waiting for the event, and get its result.
+ * n.waitStatus(opCtx) to sleep waiting for the event, and get its result. If the wait is
+ * interrupted, waitStatus throws.
*
* It is an error to destroy a returned CleanupNotification object n unless either n.ready()
* is true or n.abandon() has been called. After n.abandon(), n is in a moved-from state.
@@ -85,6 +86,9 @@ public:
bool operator==(DeleteNotification const& other) const {
return notification == other.notification;
}
+ bool operator!=(DeleteNotification const& other) const {
+ return notification != other.notification;
+ }
private:
std::shared_ptr<Notification<Status>> notification;
@@ -96,6 +100,8 @@ public:
DeleteNotification notification{};
};
+ enum class Action { kFinished, kMore, kWriteOpLog };
+
CollectionRangeDeleter() = default;
~CollectionRangeDeleter();
@@ -128,8 +134,8 @@ public:
bool isEmpty() const;
/*
- * Notify with the specified status anything waiting on ranges scheduled, before discarding the
- * ranges and notifications.
+ * Notifies with the specified status anything waiting on ranges scheduled, and then discards
+ * the ranges and notifications. Is called in the destructor.
*/
void clear(Status);
@@ -139,21 +145,22 @@ public:
void append(BSONObjBuilder* builder) const;
/**
- * If any ranges are scheduled to clean, deletes up to maxToDelete documents, notifying watchers
- * of ranges as they are done being deleted. It performs its own collection locking so it must
- * be called without locks.
+ * If any range deletions are scheduled, deletes up to maxToDelete documents, notifying
+ * watchers of ranges as they are done being deleted. It performs its own collection locking, so
+ * it must be called without locks.
*
- * The 'rangeDeleterForTestOnly' is used as a utility for unit-tests that directly test the
- * CollectionRangeDeleter class so they do not need to set up CollectionShardingState and
- * MetadataManager objects.
+ * Returns kMore or kWriteOpLog if it should be scheduled to run again because there might be
+ * more documents to delete, or kFinished otherwise. When calling again, pass the value
+ * returned.
*
- * Returns true if it should be scheduled to run again because there might be more documents to
- * delete, or false otherwise.
+ * Argument 'forTestOnly' is used in unit tests that exercise the CollectionRangeDeleter class,
+ * so that they do not need to set up CollectionShardingState and MetadataManager objects.
*/
- static bool cleanUpNextRange(OperationContext*,
- NamespaceString const& nss,
- int maxToDelete,
- CollectionRangeDeleter* rangeDeleterForTestOnly = nullptr);
+ static Action cleanUpNextRange(OperationContext*,
+ NamespaceString const& nss,
+ Action,
+ int maxToDelete,
+ CollectionRangeDeleter* forTestOnly = nullptr);
private:
/**
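
As a rough illustration of the DeleteNotification contract documented in this header (waitStatus() sleeps for the result and may throw if the wait is interrupted, and it is an error to destroy a notification that is neither ready() nor abandoned), a caller holding a CleanupNotification obtained from CollectionShardingState::cleanUpRange() might wait on it roughly as follows. This is a hedged sketch, not code from the patch; the function name and error handling are illustrative, and the usual db/s includes are assumed.

    // Sketch only: `notification` is assumed to come from the real API, e.g.
    // CollectionShardingState::cleanUpRange(range).
    void waitForRangeToClear(OperationContext* opCtx,
                             CollectionRangeDeleter::DeleteNotification notification) {
        Status result = Status::OK();
        try {
            // Sleeps until the scheduled deletion completes or fails; returns the
            // deletion's Status, or throws if the wait itself is interrupted.
            result = notification.waitStatus(opCtx);
        } catch (const DBException&) {
            // Interrupted without observing the result, so abandon the notification
            // before it goes out of scope (destroying it un-ready is an error).
            notification.abandon();
            throw;
        }
        uassertStatusOK(result);  // surface a failed range deletion to the caller
    }
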
diff --git a/src/mongo/db/s/collection_range_deleter_test.cpp b/src/mongo/db/s/collection_range_deleter_test.cpp
index 6442734d927..adaf90e7d0f 100644
--- a/src/mongo/db/s/collection_range_deleter_test.cpp
+++ b/src/mongo/db/s/collection_range_deleter_test.cpp
@@ -60,18 +60,25 @@ const std::string kPattern = "_id";
const BSONObj kKeyPattern = BSON(kPattern << 1);
const std::string kShardName{"a"};
const HostAndPort dummyHost("dummy", 123);
+const NamespaceString kAdminSystemVersion = NamespaceString("admin", "system.version");
class CollectionRangeDeleterTest : public ShardingMongodTestFixture {
+public:
+ using Deletion = CollectionRangeDeleter::Deletion;
+ using Action = CollectionRangeDeleter::Action;
+
protected:
- bool next(CollectionRangeDeleter& rangeDeleter, int maxToDelete) {
+ auto next(CollectionRangeDeleter& rangeDeleter, Action action, int maxToDelete)
+ -> CollectionRangeDeleter::Action {
return CollectionRangeDeleter::cleanUpNextRange(
- operationContext(), kNss, maxToDelete, &rangeDeleter);
+ operationContext(), kNss, action, maxToDelete, &rangeDeleter);
}
std::shared_ptr<RemoteCommandTargeterMock> configTargeter() {
return RemoteCommandTargeterMock::get(shardRegistry()->getConfigShard()->getTargeter());
}
-
- using Deletion = CollectionRangeDeleter::Deletion;
+ OID const& epoch() {
+ return _epoch;
+ }
private:
void setUp() override;
@@ -91,9 +98,19 @@ private:
std::unique_ptr<DistLockManager> distLockManager) override {
return stdx::make_unique<ShardingCatalogClientMock>(std::move(distLockManager));
}
+
+ OID _epoch;
};
+bool operator==(CollectionRangeDeleter::Action a, CollectionRangeDeleter::Action b) {
+ return (int)a == (int)b;
+}
+std::ostream& operator<<(std::ostream& os, CollectionRangeDeleter::Action a) {
+ return os << (int)a;
+}
+
void CollectionRangeDeleterTest::setUp() {
+ _epoch = OID::gen();
serverGlobalParams.clusterRole = ClusterRole::ShardServer;
ShardingMongodTestFixture::setUp();
replicationCoordinator()->alwaysAllowWrites(true);
@@ -108,13 +125,12 @@ void CollectionRangeDeleterTest::setUp() {
{
AutoGetCollection autoColl(operationContext(), kNss, MODE_IX);
auto collectionShardingState = CollectionShardingState::get(operationContext(), kNss);
- const OID epoch = OID::gen();
collectionShardingState->refreshMetadata(
operationContext(),
stdx::make_unique<CollectionMetadata>(
kKeyPattern,
- ChunkVersion(1, 0, epoch),
- ChunkVersion(0, 0, epoch),
+ ChunkVersion(1, 0, epoch()),
+ ChunkVersion(0, 0, epoch()),
SimpleBSONObjComparator::kInstance.makeBSONObjIndexedMap<CachedChunkInfo>()));
}
}
@@ -133,7 +149,7 @@ namespace {
// Tests the case that there is nothing in the database.
TEST_F(CollectionRangeDeleterTest, EmptyDatabase) {
CollectionRangeDeleter rangeDeleter;
- ASSERT_FALSE(next(rangeDeleter, 1));
+ ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 1));
}
// Tests the case that there is data, but it is not in a range to clean.
@@ -145,14 +161,14 @@ TEST_F(CollectionRangeDeleterTest, NoDataInGivenRangeToClean) {
ASSERT_BSONOBJ_EQ(insertedDoc, dbclient.findOne(kNss.toString(), QUERY(kPattern << 25)));
std::list<Deletion> ranges;
ranges.emplace_back(Deletion(ChunkRange(BSON(kPattern << 0), BSON(kPattern << 10))));
- rangeDeleter.add(std::move(ranges));
+ ASSERT_TRUE(rangeDeleter.add(std::move(ranges)));
ASSERT_EQ(1u, rangeDeleter.size());
- ASSERT_TRUE(next(rangeDeleter, 1));
+ ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kWriteOpLog, 1));
ASSERT_EQ(0u, rangeDeleter.size());
ASSERT_BSONOBJ_EQ(insertedDoc, dbclient.findOne(kNss.toString(), QUERY(kPattern << 25)));
- ASSERT_FALSE(next(rangeDeleter, 1));
+ ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 1));
}
// Tests the case that there is a single document within a range to clean.
@@ -166,23 +182,24 @@ TEST_F(CollectionRangeDeleterTest, OneDocumentInOneRangeToClean) {
std::list<Deletion> ranges;
Deletion deletion{ChunkRange(BSON(kPattern << 0), BSON(kPattern << 10))};
ranges.emplace_back(std::move(deletion));
- rangeDeleter.add(std::move(ranges));
+ ASSERT_TRUE(rangeDeleter.add(std::move(ranges)));
ASSERT_TRUE(ranges.empty()); // spliced elements out of it
auto optNotifn = rangeDeleter.overlaps(ChunkRange(BSON(kPattern << 0), BSON(kPattern << 10)));
ASSERT(optNotifn);
auto notifn = *optNotifn;
ASSERT(!notifn.ready());
- ASSERT_TRUE(next(rangeDeleter, 1)); // actually delete one
+ ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kWriteOpLog, 1)); // actually delete one
ASSERT(!notifn.ready());
ASSERT_EQ(rangeDeleter.size(), 1u);
- ASSERT_TRUE(next(rangeDeleter, 1)); // range empty, pop range, notify
+ // range empty, pop range, notify
+ ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kMore, 1));
ASSERT_TRUE(rangeDeleter.isEmpty());
ASSERT(notifn.ready() && notifn.waitStatus(operationContext()).isOK());
ASSERT_TRUE(dbclient.findOne(kNss.toString(), QUERY(kPattern << 5)).isEmpty());
- ASSERT_FALSE(next(rangeDeleter, 1));
+ ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 1));
}
// Tests the case that there are multiple documents within a range to clean.
@@ -197,12 +214,12 @@ TEST_F(CollectionRangeDeleterTest, MultipleDocumentsInOneRangeToClean) {
std::list<Deletion> ranges;
Deletion deletion{ChunkRange(BSON(kPattern << 0), BSON(kPattern << 10))};
ranges.emplace_back(std::move(deletion));
- rangeDeleter.add(std::move(ranges));
+ ASSERT_TRUE(rangeDeleter.add(std::move(ranges)));
- ASSERT_TRUE(next(rangeDeleter, 100));
- ASSERT_TRUE(next(rangeDeleter, 100));
+ ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kWriteOpLog, 100));
+ ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kMore, 100));
ASSERT_EQUALS(0ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 5)));
- ASSERT_FALSE(next(rangeDeleter, 100));
+ ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 100));
}
// Tests the case that there are multiple documents within a range to clean, and the range deleter
@@ -218,21 +235,20 @@ TEST_F(CollectionRangeDeleterTest, MultipleCleanupNextRangeCalls) {
std::list<Deletion> ranges;
Deletion deletion{ChunkRange(BSON(kPattern << 0), BSON(kPattern << 10))};
ranges.emplace_back(std::move(deletion));
- rangeDeleter.add(std::move(ranges));
+ ASSERT_TRUE(rangeDeleter.add(std::move(ranges)));
- ASSERT_TRUE(next(rangeDeleter, 1));
+ ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kWriteOpLog, 1));
ASSERT_EQUALS(2ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 5)));
- ASSERT_TRUE(next(rangeDeleter, 1));
+ ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kMore, 1));
ASSERT_EQUALS(1ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 5)));
- ASSERT_TRUE(next(rangeDeleter, 1));
- ASSERT_TRUE(next(rangeDeleter, 1));
+ ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kMore, 1));
+ ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kMore, 1));
ASSERT_EQUALS(0ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 5)));
- ASSERT_FALSE(next(rangeDeleter, 1));
+ ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 1));
}
-
// Tests the case that there are two ranges to clean, each containing multiple documents.
TEST_F(CollectionRangeDeleterTest, MultipleDocumentsInMultipleRangesToClean) {
CollectionRangeDeleter rangeDeleter;
@@ -247,10 +263,10 @@ TEST_F(CollectionRangeDeleterTest, MultipleDocumentsInMultipleRangesToClean) {
std::list<Deletion> ranges;
ranges.emplace_back(Deletion{ChunkRange{BSON(kPattern << 0), BSON(kPattern << 4)}});
- rangeDeleter.add(std::move(ranges));
+ ASSERT_TRUE(rangeDeleter.add(std::move(ranges)));
ASSERT_TRUE(ranges.empty());
ranges.emplace_back(Deletion{ChunkRange{BSON(kPattern << 4), BSON(kPattern << 7)}});
- rangeDeleter.add(std::move(ranges));
+ ASSERT_FALSE(rangeDeleter.add(std::move(ranges)));
auto optNotifn1 = rangeDeleter.overlaps(ChunkRange{BSON(kPattern << 0), BSON(kPattern << 4)});
ASSERT_TRUE(optNotifn1);
@@ -261,29 +277,68 @@ TEST_F(CollectionRangeDeleterTest, MultipleDocumentsInMultipleRangesToClean) {
auto& notifn2 = *optNotifn2;
ASSERT_FALSE(notifn2.ready());
- ASSERT_TRUE(next(rangeDeleter, 100));
+ // test op== on notifications
+ ASSERT_TRUE(notifn1 == *optNotifn1);
+ ASSERT_FALSE(notifn1 == *optNotifn2);
+ ASSERT_TRUE(notifn1 != *optNotifn2);
+ ASSERT_FALSE(notifn1 != *optNotifn1);
+
+ ASSERT_EQUALS(0ULL,
+ dbclient.count(kAdminSystemVersion.ns(), BSON(kPattern << "startRangeDeletion")));
+
+ ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kWriteOpLog, 100));
ASSERT_FALSE(notifn1.ready()); // no trigger yet
ASSERT_FALSE(notifn2.ready()); // no trigger yet
- ASSERT_EQUALS(0ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 4)));
- ASSERT_EQUALS(3ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 10)));
-
- ASSERT_TRUE(next(rangeDeleter, 100)); // discover there are no more < 4, pop range 1
+ ASSERT_EQUALS(1ULL,
+ dbclient.count(kAdminSystemVersion.ns(), BSON(kPattern << "startRangeDeletion")));
+ // clang-format off
+ ASSERT_BSONOBJ_EQ(
+ BSON("_id" << "startRangeDeletion" << "ns" << kNss.ns()
+ << "epoch" << epoch() << "min" << BSON("_id" << 0) << "max" << BSON("_id" << 4)),
+ dbclient.findOne(kAdminSystemVersion.ns(), QUERY("_id" << "startRangeDeletion")));
+ // clang-format on
+
+ ASSERT_EQUALS(0ULL, dbclient.count(kNss.ns(), BSON(kPattern << LT << 4)));
+ ASSERT_EQUALS(3ULL, dbclient.count(kNss.ns(), BSON(kPattern << LT << 10)));
+
+ // discover there are no more < 4, pop range 1
+ ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kMore, 100));
+
+ ASSERT_EQUALS(1ULL,
+ dbclient.count(kAdminSystemVersion.ns(), BSON(kPattern << "startRangeDeletion")));
+ // clang-format off
+ ASSERT_BSONOBJ_EQ(
+ BSON("_id" << "startRangeDeletion" << "ns" << kNss.ns()
+ << "epoch" << epoch() << "min" << BSON("_id" << 0) << "max" << BSON("_id" << 4)),
+ dbclient.findOne(kAdminSystemVersion.ns(), QUERY("_id" << "startRangeDeletion")));
+ // clang-format on
ASSERT_TRUE(notifn1.ready() && notifn1.waitStatus(operationContext()).isOK());
ASSERT_FALSE(notifn2.ready());
- ASSERT_EQUALS(3ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 10)));
+ ASSERT_EQUALS(3ULL, dbclient.count(kNss.ns(), BSON(kPattern << LT << 10)));
- ASSERT_TRUE(next(rangeDeleter, 100)); // delete the remaining documents
+ // delete the remaining documents
+ ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kWriteOpLog, 100));
ASSERT_FALSE(notifn2.ready());
- ASSERT_EQUALS(0ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 10)));
+ // clang-format off
+ ASSERT_BSONOBJ_EQ(
+ BSON("_id" << "startRangeDeletion" << "ns" << kNss.ns()
+ << "epoch" << epoch() << "min" << BSON("_id" << 4) << "max" << BSON("_id" << 7)),
+ dbclient.findOne(kAdminSystemVersion.ns(), QUERY("_id" << "startRangeDeletion")));
+ // clang-format on
+
+ ASSERT_EQUALS(0ULL, dbclient.count(kNss.ns(), BSON(kPattern << LT << 10)));
+
+ // discover there are no more, pop range 2
+ ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kMore, 1));
- ASSERT_TRUE(next(rangeDeleter, 1)); // discover there are no more, pop range 2
ASSERT_TRUE(notifn2.ready() && notifn2.waitStatus(operationContext()).isOK());
- ASSERT_FALSE(next(rangeDeleter, 1)); // discover there are no more ranges
+ // discover there are no more ranges
+ ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 1));
}
} // unnamed namespace
diff --git a/src/mongo/db/s/collection_sharding_state.cpp b/src/mongo/db/s/collection_sharding_state.cpp
index 6440415d511..e4a04ed2aaf 100644
--- a/src/mongo/db/s/collection_sharding_state.cpp
+++ b/src/mongo/db/s/collection_sharding_state.cpp
@@ -134,6 +134,12 @@ auto CollectionShardingState::cleanUpRange(ChunkRange const& range) -> CleanupNo
return _metadataManager->cleanUpRange(range);
}
+auto CollectionShardingState::overlappingMetadata(ChunkRange const& range) const
+ -> std::vector<ScopedCollectionMetadata> {
+ return _metadataManager->overlappingMetadata(_metadataManager, range);
+}
+
+
MigrationSourceManager* CollectionShardingState::getMigrationSourceManager() {
return _sourceMgr;
}
diff --git a/src/mongo/db/s/collection_sharding_state.h b/src/mongo/db/s/collection_sharding_state.h
index ff0489f4d9a..28ad2ad12d1 100644
--- a/src/mongo/db/s/collection_sharding_state.h
+++ b/src/mongo/db/s/collection_sharding_state.h
@@ -141,6 +141,14 @@ public:
auto cleanUpRange(ChunkRange const& range) -> CleanupNotification;
/**
+ * Returns a vector of ScopedCollectionMetadata objects representing metadata instances in use
+ * by running queries that overlap the argument range, suitable for identifying and invalidating
+ * those queries.
+ */
+ auto overlappingMetadata(ChunkRange const& range) const
+ -> std::vector<ScopedCollectionMetadata>;
+
+ /**
* Returns the active migration source manager, if one is available.
*/
MigrationSourceManager* getMigrationSourceManager();
@@ -243,10 +251,12 @@ private:
MigrationSourceManager* _sourceMgr{nullptr};
// for access to _metadataManager
- friend bool CollectionRangeDeleter::cleanUpNextRange(OperationContext*,
+ friend auto CollectionRangeDeleter::cleanUpNextRange(OperationContext*,
NamespaceString const&,
+ CollectionRangeDeleter::Action,
int maxToDelete,
- CollectionRangeDeleter*);
+ CollectionRangeDeleter*)
+ -> CollectionRangeDeleter::Action;
};
} // namespace mongo
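
The new overlappingMetadata() accessor above is what a caller would use to find which pinned metadata snapshots, and therefore which running queries, could still observe documents in a range about to be deleted. The sketch below shows one plausible shape for such a caller; the invalidation step itself is not part of this patch, the function is purely illustrative, and the usual db/s includes are assumed.

    // Illustrative only: the real query-invalidation logic is not in this patch.
    void logSnapshotsOverlappingRange(OperationContext* opCtx,
                                      NamespaceString const& nss,
                                      ChunkRange const& range) {
        AutoGetCollection autoColl(opCtx, nss, MODE_IS);
        auto* css = CollectionShardingState::get(opCtx, nss);

        // Each element is the active mapping or a snapshot still pinned by at least
        // one running query, restricted to those that overlap the given range.
        std::vector<ScopedCollectionMetadata> inUse = css->overlappingMetadata(range);

        for (auto const& scopedMetadata : inUse) {
            // Holding the ScopedCollectionMetadata keeps the mapping (and its
            // usageCounter) alive while the caller decides what to do about it.
            log() << "metadata with collection version "
                  << scopedMetadata->getCollVersion().toString()
                  << " may still cover range " << redact(range.toString());
        }
    }
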
diff --git a/src/mongo/db/s/metadata_manager.cpp b/src/mongo/db/s/metadata_manager.cpp
index 70cc7de5ee5..de6cfcabd96 100644
--- a/src/mongo/db/s/metadata_manager.cpp
+++ b/src/mongo/db/s/metadata_manager.cpp
@@ -44,60 +44,67 @@
#include "mongo/util/assert_util.h"
#include "mongo/util/log.h"
-// MetadataManager maintains std::shared_ptr<CollectionMetadataManager> pointers in a list
-// _metadata. It also contains a CollectionRangeDeleter that queues orphan ranges to delete in
-// a background thread, and a record of the ranges being migrated in, to avoid deleting them.
+// MetadataManager maintains pointers to CollectionMetadata objects in a member list named
+// _metadata. Each CollectionMetadata contains an immutable _chunksMap of chunks assigned to this
+// shard, along with details related to its own lifecycle in a member _tracker.
//
-// Free-floating CollectionMetadata objects are maintained by these pointers, and also by clients
-// via shared pointers in ScopedCollectionMetadata objects.
+// The current chunk mapping, used by queries starting up, is at _metadata.back(). Each query,
+// when it starts up, requests and holds a ScopedCollectionMetadata object, and destroys it on
+// termination. Each ScopedCollectionMetadata keeps a shared_ptr to its CollectionMetadata chunk
+// mapping, and to the MetadataManager itself. CollectionMetadata mappings also keep a record of
+// chunk ranges that may be deleted when it is determined that the range can no longer be in use.
//
-// The _tracker member of CollectionMetadata keeps:
-// a count of the ScopedCollectionMetadata objects that have pointers to the CollectionMetadata
-// a list of key ranges [min,max) of orphaned documents that may be deleted when the count goes
-// to zero
-// ____________________________
-// (s): std::shared_ptr<> Clients:| ScopedCollectionMetadata |
-// _________________________ +----(s) manager metadata (s)-----------------+
-// | CollectionShardingState | | |____________________________| | |
-// | _metadataManager (s) | +-------(s) manager metadata (s)-------------+ |
-// |____________________|____| | |____________________________| | | |
-// ____________________v_______ +----------(s) manager metadata (s) | | |
-// | MetadataManager | | |________________________|___| | |
-// | |<---+ | | |
-// | | ________________________ | | |
-// | /----------->| CollectionMetadata |<----+ (1 use) | |
-// | [(s),----/ | | ______________________|_ | |
-// | (s),------------------->| CollectionMetadata | (0 uses) | |
-// | _metadata: (s)]----\ | | | ______________________|_ | |
-// | \--------------->| CollectionMetadata | | |
-// | | | | | | | |
-// | _rangesToClean: | | | | _tracker: |<------------+ |
-// | ________________________ | | | | ____________________ |<--------------+
-// | | CollectionRangeDeleter | | | | | | Tracker | | (2 uses)
-// | | | | | | | | | |
-// | | _orphans [[min,max), | | | | | | usageCounter | |
-// | | [min,max), | | | | | | orphans [min,max), | |
-// | | ... ] | | | | | | ... ] | |
-// | |________________________| | |_| | |____________________| |
-// |____________________________| | | _chunksMap |
-// |_| _chunkVersion |
-// | ... |
-// |________________________|
+// ScopedCollectionMetadata's destructor decrements the CollectionMetadata's usageCounter.
+// Whenever a usageCounter drops to zero, we check whether any now-unused CollectionMetadata
+// elements can be popped off the front of _metadata. We need to keep the unused elements in the
+// middle (as seen below) because they may schedule deletions of chunks depended on by older
+// mappings.
//
-// A ScopedCollectionMetadata object is created and held during a query, and destroyed when the
-// query no longer needs access to the collection. Its destructor decrements the CollectionMetadata
-// _tracker member's usageCounter. Note that the collection may become unsharded, and even get
-// sharded again, between construction and destruction of a ScopedCollectionMetadata.
+// New chunk mappings are pushed onto the back of _metadata. Subsequently started queries use the
+// new mapping while still-running queries continue using the older "snapshot" mappings. We treat
+// _metadata.back()'s usage count differently from the snapshots because it can't reliably be
+// compared to zero; a new query may increment it at any time.
//
-// When a new chunk mapping replaces the active mapping, it is pushed onto the back of _metadata.
+// (Note that the collection may be dropped or become unsharded, and even get made and sharded
+// again, between construction and destruction of a ScopedCollectionMetadata).
//
-// A CollectionMetadata object pointed to from _metadata is maintained at least as long as any
-// query holds a ScopedCollectionMetadata object referring to it, or to any older one. In the
-// diagram above, the middle CollectionMetadata is kept until the one below it is disposed of.
+// MetadataManager also contains a CollectionRangeDeleter _rangesToClean that queues orphan ranges
+// being deleted in a background thread, and a mapping _receivingChunks of the ranges being migrated
+// in, to avoid deleting them. Each range deletion is paired with a notification object triggered
+// when the deletion is completed or abandoned.
+//
+// ____________________________
+// (s): std::shared_ptr<> Clients:| ScopedCollectionMetadata |
+// _________________________ +----(s) manager metadata (s)------------------+
+// | CollectionShardingState | | |____________________________| | |
+// | _metadataManager (s) | +-------(s) manager metadata (s)--------------+ |
+// |____________________|____| | |____________________________| | | |
+// ____________________v________ +------------(s) manager metadata (s)-----+ | |
+// | MetadataManager | | |____________________________| | | |
+// | |<--+ | | |
+// | | ___________________________ (1 use) | | |
+// | getActiveMetadata(): /---------->| CollectionMetadata |<---------+ | |
+// | back(): [(s),------/ | | _________________________|_ | |
+// | (s),-------------------->| CollectionMetadata | (0 uses) | |
+// | _metadata: (s)]------\ | | | _________________________|_ | |
+// | \-------------->| CollectionMetadata | | |
+// | _receivingChunks | | | | | (2 uses) | |
+// | _rangesToClean: | | | | _tracker: |<---------+ |
+// | _________________________ | | | | _______________________ |<-----------+
+// | | CollectionRangeDeleter | | | | | | Tracker | |
+// | | | | | | | | | |
+// | | _orphans [range,notif, | | | | | | usageCounter | |
+// | | range,notif, | | | | | | orphans [range,notif, | |
+// | | ... ] | | | | | | range,notif, | |
+// | | | | | | | | ... ] | |
+// | |_________________________| | |_| | |_______________________| |
+// |_____________________________| | | _chunksMap |
+// |_| _chunkVersion |
+// | ... |
+// |___________________________|
//
// Note that _metadata as shown here has its front() at the bottom, back() at the top. As usual,
-// new entries are pushed onto the back, popped off the front. The "active" metadata used by new
-// queries (when there is one), is _metadata.back().
+// new entries are pushed onto the back, popped off the front.
namespace mongo {
@@ -118,13 +125,17 @@ MetadataManager::~MetadataManager() {
}
void MetadataManager::_clearAllCleanups() {
+ _clearAllCleanups(
+ {ErrorCodes::InterruptedDueToReplStateChange,
+ str::stream() << "Range deletions in " << _nss.ns()
+ << " abandoned because collection was dropped or became unsharded"});
+}
+
+void MetadataManager::_clearAllCleanups(Status status) {
for (auto& metadata : _metadata) {
_pushListToClean(std::move(metadata->_tracker.orphans));
}
- _rangesToClean.clear({ErrorCodes::InterruptedDueToReplStateChange,
- str::stream() << "Range deletions in " << _nss.ns()
- << " abandoned because collection was"
- " dropped or became unsharded"});
+ _rangesToClean.clear(status);
}
ScopedCollectionMetadata MetadataManager::getActiveMetadata(std::shared_ptr<MetadataManager> self) {
@@ -241,10 +252,12 @@ void MetadataManager::_retireExpiredMetadata() {
if (!_metadata.front()->_tracker.orphans.empty()) {
log() << "Queries possibly dependent on " << _nss.ns()
<< " range(s) finished; scheduling for deletion";
+ // It is safe to push orphan ranges from _metadata.back(), even though new queries might
+ // start any time, because any request to delete a range it maps is rejected.
_pushListToClean(std::move(_metadata.front()->_tracker.orphans));
}
if (&_metadata.front() == &_metadata.back())
- break; // do not retire current chunk metadata.
+ break; // do not pop the active chunk mapping!
}
}
@@ -254,6 +267,8 @@ void MetadataManager::_retireExpiredMetadata() {
ScopedCollectionMetadata::ScopedCollectionMetadata(std::shared_ptr<MetadataManager> manager,
std::shared_ptr<CollectionMetadata> metadata)
: _metadata(std::move(metadata)), _manager(std::move(manager)) {
+ invariant(_metadata);
+ invariant(_manager);
++_metadata->_tracker.usageCounter;
}
@@ -342,15 +357,17 @@ void MetadataManager::append(BSONObjBuilder* builder) {
amrArr.done();
}
-void MetadataManager::_scheduleCleanup(executor::TaskExecutor* executor, NamespaceString nss) {
- executor->scheduleWork([executor, nss](auto&) {
+void MetadataManager::_scheduleCleanup(executor::TaskExecutor* executor,
+ NamespaceString nss,
+ CollectionRangeDeleter::Action action) {
+ executor->scheduleWork([executor, nss, action](auto&) {
const int maxToDelete = std::max(int(internalQueryExecYieldIterations.load()), 1);
Client::initThreadIfNotAlready("Collection Range Deleter");
auto UniqueOpCtx = Client::getCurrent()->makeOperationContext();
auto opCtx = UniqueOpCtx.get();
- bool again = CollectionRangeDeleter::cleanUpNextRange(opCtx, nss, maxToDelete);
- if (again) {
- _scheduleCleanup(executor, nss);
+ auto next = CollectionRangeDeleter::cleanUpNextRange(opCtx, nss, action, maxToDelete);
+ if (next != CollectionRangeDeleter::Action::kFinished) {
+ _scheduleCleanup(executor, nss, next);
}
});
}
@@ -365,9 +382,9 @@ auto MetadataManager::_pushRangeToClean(ChunkRange const& range) -> CleanupNotif
void MetadataManager::_pushListToClean(std::list<Deletion> ranges) {
if (_rangesToClean.add(std::move(ranges))) {
- _scheduleCleanup(_executor, _nss);
+ _scheduleCleanup(_executor, _nss, CollectionRangeDeleter::Action::kWriteOpLog);
}
- dassert(ranges.empty());
+ invariant(ranges.empty());
}
void MetadataManager::_addToReceiving(ChunkRange const& range) {
@@ -442,6 +459,28 @@ auto MetadataManager::cleanUpRange(ChunkRange const& range) -> CleanupNotificati
return activeMetadata->_tracker.orphans.back().notification;
}
+auto MetadataManager::overlappingMetadata(std::shared_ptr<MetadataManager> const& self,
+ ChunkRange const& range)
+ -> std::vector<ScopedCollectionMetadata> {
+ invariant(!_metadata.empty());
+ stdx::lock_guard<stdx::mutex> scopedLock(_managerLock);
+ std::vector<ScopedCollectionMetadata> result;
+ result.reserve(_metadata.size());
+ auto it = _metadata.crbegin(); // start with the current active chunk mapping
+ if ((*it)->rangeOverlapsChunk(range)) {
+ // We ignore the refcount of the active mapping; effectively, we assume it is in use.
+ result.push_back(ScopedCollectionMetadata(self, *it));
+ }
+ ++it; // step to snapshots
+ for (auto end = _metadata.crend(); it != end; ++it) {
+ // We want all the overlapping snapshot mappings still possibly in use by a query.
+ if ((*it)->_tracker.usageCounter > 0 && (*it)->rangeOverlapsChunk(range)) {
+ result.push_back(ScopedCollectionMetadata(self, *it));
+ }
+ }
+ return result;
+}
+
size_t MetadataManager::numberOfRangesToCleanStillInUse() {
stdx::lock_guard<stdx::mutex> scopedLock(_managerLock);
size_t count = 0;
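
The lifecycle described in the rewritten comment block above (each query pins a mapping, the active mapping at _metadata.back() is treated as always in use, and snapshots are only ever retired from the front once unreferenced) reduces to a small refcounting scheme. The standalone model below captures just that scheme; none of these names are the real MongoDB types.

    #include <cassert>
    #include <list>
    #include <memory>

    // Standalone model of the snapshot refcounting described above; not MongoDB code.
    struct MetadataSnapshot {
        int usageCounter = 0;  // number of ScopedCollectionMetadata objects pinning it
    };

    struct ManagerModel {
        std::list<std::shared_ptr<MetadataSnapshot>> metadata;  // front() oldest, back() active

        std::shared_ptr<MetadataSnapshot> getActive() {
            ++metadata.back()->usageCounter;  // a starting query pins the active mapping
            return metadata.back();
        }

        static void release(std::shared_ptr<MetadataSnapshot> const& snap) {
            --snap->usageCounter;  // a finishing query unpins its mapping
        }

        // Pop snapshots from the front once no query references them, but never pop
        // the active mapping: a new query may bump its counter at any time.
        void retireExpired() {
            while (metadata.size() > 1 && metadata.front()->usageCounter == 0)
                metadata.pop_front();
        }
    };

    int main() {
        ManagerModel mgr;
        mgr.metadata.push_back(std::make_shared<MetadataSnapshot>());
        auto oldQuery = mgr.getActive();                               // query pins mapping #1
        mgr.metadata.push_back(std::make_shared<MetadataSnapshot>());  // refresh: #2 is now active
        mgr.retireExpired();
        assert(mgr.metadata.size() == 2);  // #1 still pinned by the old query
        ManagerModel::release(oldQuery);
        mgr.retireExpired();
        assert(mgr.metadata.size() == 1);  // only the active mapping remains
    }

Because the model only ever pops from the front, an unused snapshot sitting behind a pinned older one stays put, which is the property the comment block relies on for scheduling orphan-range deletions safely.
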
diff --git a/src/mongo/db/s/metadata_manager.h b/src/mongo/db/s/metadata_manager.h
index 46da84db9e2..ff0770f450e 100644
--- a/src/mongo/db/s/metadata_manager.h
+++ b/src/mongo/db/s/metadata_manager.h
@@ -114,6 +114,14 @@ public:
CleanupNotification cleanUpRange(ChunkRange const& range);
/**
+ * Returns a vector of ScopedCollectionMetadata objects representing metadata instances in use
+ * by running queries that overlap the argument range, suitable for identifying and invalidating
+ * those queries.
+ */
+ auto overlappingMetadata(std::shared_ptr<MetadataManager> const& itself,
+ ChunkRange const& range) -> std::vector<ScopedCollectionMetadata>;
+
+ /**
* Returns the number of ranges scheduled to be cleaned, exclusive of such ranges that might
* still be in use by running queries. Outside of test drivers, the actual number may vary
* after it returns, so this is really only useful for unit tests.
@@ -146,12 +154,19 @@ private:
* Each time it completes cleaning up a range, it wakes up clients waiting on completion of
* that range, which may then verify their range has no more deletions scheduled, and proceed.
*/
- static void _scheduleCleanup(executor::TaskExecutor*, NamespaceString nss);
+ static void _scheduleCleanup(executor::TaskExecutor*,
+ NamespaceString nss,
+ CollectionRangeDeleter::Action);
// All of the following functions must be called while holding _managerLock.
/**
- * Cancel all scheduled deletions of orphan ranges, notifying listeners with status
+ * Cancels all scheduled deletions of orphan ranges, notifying listeners with specified status.
+ */
+ void _clearAllCleanups(Status);
+
+ /**
+ * Cancels all scheduled deletions of orphan ranges, notifying listeners with status
* InterruptedDueToReplStateChange.
*/
void _clearAllCleanups();
@@ -229,10 +244,12 @@ private:
// friends
// for access to _rangesToClean and _managerLock under task callback
- friend bool CollectionRangeDeleter::cleanUpNextRange(OperationContext*,
+ friend auto CollectionRangeDeleter::cleanUpNextRange(OperationContext*,
NamespaceString const&,
+ CollectionRangeDeleter::Action,
int maxToDelete,
- CollectionRangeDeleter*);
+ CollectionRangeDeleter*)
+ -> CollectionRangeDeleter::Action;
friend class ScopedCollectionMetadata;
};
@@ -265,6 +282,17 @@ public:
*/
operator bool() const;
+ /**
+ * Checks whether both objects refer to exactly the same metadata.
+ */
+ bool operator==(ScopedCollectionMetadata const& other) const {
+ return _metadata == other._metadata;
+ }
+ bool operator!=(ScopedCollectionMetadata const& other) const {
+ return _metadata != other._metadata;
+ }
+
+
private:
/**
* Increments the usageCounter in the specified CollectionMetadata.
@@ -285,8 +313,12 @@ private:
std::shared_ptr<MetadataManager> _manager{nullptr};
+ // These use our private ctor
friend ScopedCollectionMetadata MetadataManager::getActiveMetadata(
- std::shared_ptr<MetadataManager>); // uses our private ctor
+ std::shared_ptr<MetadataManager>);
+ friend auto MetadataManager::overlappingMetadata(std::shared_ptr<MetadataManager> const& itself,
+ ChunkRange const& range)
+ -> std::vector<ScopedCollectionMetadata>;
};
} // namespace mongo
diff --git a/src/mongo/db/s/metadata_manager_test.cpp b/src/mongo/db/s/metadata_manager_test.cpp
index a3375634d9f..5ba653ac731 100644
--- a/src/mongo/db/s/metadata_manager_test.cpp
+++ b/src/mongo/db/s/metadata_manager_test.cpp
@@ -222,17 +222,36 @@ TEST_F(MetadataManagerTest, NotificationBlocksUntilDeletion) {
ASSERT_EQ(manager->numberOfMetadataSnapshots(), 0UL);
ASSERT_EQ(manager->numberOfRangesToClean(), 0UL);
- auto scm = manager->getActiveMetadata(manager); // and increment scm's refcount
- ASSERT(bool(scm));
- addChunk(manager); // push new metadata
-
- ASSERT_EQ(manager->numberOfMetadataSnapshots(), 1UL);
+ auto scm1 = manager->getActiveMetadata(manager); // and increment refcount
+ ASSERT_TRUE(bool(scm1));
+ ASSERT_EQ(0ULL, scm1->getChunks().size());
+
+ addChunk(manager); // push new metadata
+ auto scm2 = manager->getActiveMetadata(manager); // and increment refcount
+ ASSERT_EQ(1ULL, scm2->getChunks().size());
+
+ // this is here solely to pacify an invariant in addChunk
+ manager->refreshActiveMetadata(makeEmptyMetadata());
+
+ addChunk(manager); // push new metadata
+ auto scm3 = manager->getActiveMetadata(manager); // and increment refcount
+ ASSERT_EQ(1ULL, scm3->getChunks().size());
+
+ auto overlaps =
+ manager->overlappingMetadata(manager, ChunkRange(BSON("key" << 0), BSON("key" << 10)));
+ ASSERT_EQ(2ULL, overlaps.size());
+ std::vector<ScopedCollectionMetadata> ref;
+ ref.push_back(std::move(scm3));
+ ref.push_back(std::move(scm2));
+ ASSERT(ref == overlaps);
+
+ ASSERT_EQ(manager->numberOfMetadataSnapshots(), 3UL);
ASSERT_EQ(manager->numberOfRangesToClean(), 0UL); // not yet...
optNotif = manager->cleanUpRange(cr1);
- ASSERT_EQ(manager->numberOfMetadataSnapshots(), 1UL);
+ ASSERT_EQ(manager->numberOfMetadataSnapshots(), 3UL);
ASSERT_EQ(manager->numberOfRangesToClean(), 1UL);
- } // scm destroyed, refcount of metadata goes to zero
+ } // scm1,2,3 destroyed, refcount of each metadata goes to zero
ASSERT_EQ(manager->numberOfMetadataSnapshots(), 0UL);
ASSERT_EQ(manager->numberOfRangesToClean(), 1UL);
ASSERT_FALSE(optNotif->ready());