author     Nathan Myers <nathan.myers@10gen.com>    2017-06-02 13:16:12 -0400
committer  Nathan Myers <nathan.myers@10gen.com>    2017-06-02 13:39:13 -0400
commit     20e9320f7716515d68c27324dc20080d585fd70f (patch)
tree       6c2012039b6ee5f69034f0b2b1850ae02ad59b84 /src
parent     d4eeb742d85e31b95376715dec3454870ae83064 (diff)
download   mongo-20e9320f7716515d68c27324dc20080d585fd70f.tar.gz
SERVER-29342 CollectionShardState/RangeDeleter support for safe secondary reads
Diffstat (limited to 'src')
-rw-r--r--  src/mongo/db/exec/update.cpp                       |   6
-rw-r--r--  src/mongo/db/s/collection_range_deleter.cpp        |  57
-rw-r--r--  src/mongo/db/s/collection_range_deleter.h          |  37
-rw-r--r--  src/mongo/db/s/collection_range_deleter_test.cpp   | 131
-rw-r--r--  src/mongo/db/s/collection_sharding_state.cpp       |   6
-rw-r--r--  src/mongo/db/s/collection_sharding_state.h         |  14
-rw-r--r--  src/mongo/db/s/metadata_manager.cpp                | 157
-rw-r--r--  src/mongo/db/s/metadata_manager.h                  |  42
-rw-r--r--  src/mongo/db/s/metadata_manager_test.cpp           |  33
9 files changed, 337 insertions(+), 146 deletions(-)
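The core of the patch is the new Action hand-off in CollectionRangeDeleter::cleanUpNextRange(): the caller passes in the Action returned by the previous pass, where kWriteOpLog marks the first pass over a range (and triggers the startRangeDeletion upsert that secondaries watch for), kMore means the same range still holds documents, and kFinished ends the chain. The sketch below is only a simplified, synchronous model of that hand-off over stand-in state; the real function also takes an OperationContext, the namespace, and maxToDelete, and MetadataManager::_scheduleCleanup() reschedules it on a TaskExecutor rather than looping.

```cpp
#include <iostream>
#include <list>

// The Action values introduced by this patch (collection_range_deleter.h).
enum class Action { kFinished, kMore, kWriteOpLog };

std::list<int> rangesToClean{2, 1};  // pretend two scheduled ranges, holding 2 and 1 orphans

// Stand-in for CollectionRangeDeleter::cleanUpNextRange(); only the Action
// hand-off is modelled here -- no locking, metadata, or actual deletes.
Action cleanUpNextRange(Action action) {
    if (rangesToClean.empty())
        return Action::kFinished;        // nothing scheduled
    if (action == Action::kWriteOpLog) {
        // First pass over a range: upsert the "startRangeDeletion" marker into
        // admin.system.version so secondaries can kill dependent queries.
        std::cout << "upsert startRangeDeletion marker\n";
    }
    if (rangesToClean.front() > 0) {
        --rangesToClean.front();         // delete one batch of documents
        return Action::kMore;            // call again: same range, no new marker
    }
    rangesToClean.pop_front();           // range empty: pop it, notify waiters
    return Action::kWriteOpLog;          // next pass starts a new range (if any)
}

int main() {
    // Seed the chain with kWriteOpLog and feed each returned Action back in,
    // as MetadataManager::_scheduleCleanup() does via the task executor.
    for (Action next = Action::kWriteOpLog; next != Action::kFinished;)
        next = cleanUpNextRange(next);
}
```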
diff --git a/src/mongo/db/exec/update.cpp b/src/mongo/db/exec/update.cpp index 9bcc5460290..90cd46ab423 100644 --- a/src/mongo/db/exec/update.cpp +++ b/src/mongo/db/exec/update.cpp @@ -254,12 +254,6 @@ StatusWith<std::uint32_t> storageValid(const mb::ConstElement& elem, Status status = validateDollarPrefixElement(elem, deep); if (!status.isOK()) return status; - } else if (fieldName.find(".") != string::npos) { - // Field name cannot have a "." in it. - return Status(ErrorCodes::DottedFieldName, - str::stream() << "The dotted field '" << elem.getFieldName() << "' in '" - << mb::getFullName(elem) - << "' is not valid for storage."); } } diff --git a/src/mongo/db/s/collection_range_deleter.cpp b/src/mongo/db/s/collection_range_deleter.cpp index df690affe3d..2392305fe5d 100644 --- a/src/mongo/db/s/collection_range_deleter.cpp +++ b/src/mongo/db/s/collection_range_deleter.cpp @@ -76,13 +76,16 @@ const WriteConcernOptions kMajorityWriteConcern(WriteConcernOptions::kMajority, CollectionRangeDeleter::~CollectionRangeDeleter() { // notify anybody still sleeping on orphan ranges clear(Status{ErrorCodes::InterruptedDueToReplStateChange, - "Collection sharding metadata destroyed"}); + "Collection sharding metadata discarded"}); } -bool CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx, +auto CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx, NamespaceString const& nss, + Action action, int maxToDelete, - CollectionRangeDeleter* rangeDeleterForTestOnly) { + CollectionRangeDeleter* forTestOnly) -> Action { + + invariant(action != Action::kFinished); StatusWith<int> wrote = 0; auto range = boost::optional<ChunkRange>(boost::none); auto notification = DeleteNotification(); @@ -92,28 +95,55 @@ bool CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx, auto* css = CollectionShardingState::get(opCtx, nss); { auto scopedCollectionMetadata = css->getMetadata(); - if ((!collection || !scopedCollectionMetadata) && !rangeDeleterForTestOnly) { - log() << "Abandoning range deletions in collection " << nss.ns() - << " left over from sharded state"; + if ((!collection || !scopedCollectionMetadata) && !forTestOnly) { + log() << "Abandoning range deletions left over from previously sharded collection" + << nss.ns(); stdx::lock_guard<stdx::mutex> lk(css->_metadataManager->_managerLock); css->_metadataManager->_clearAllCleanups(); - return false; // collection was unsharded + return Action::kFinished; } // We don't actually know if this is the same collection that we were originally // scheduled to do deletions on, or another one with the same name. But it doesn't - // matter: if it has deletions scheduled, now is as good a time as any to do them. - auto self = rangeDeleterForTestOnly ? rangeDeleterForTestOnly - : &css->_metadataManager->_rangesToClean; + // matter: if it has a record of deletions scheduled, now is as good a time as any + // to do them. + + auto self = forTestOnly ? 
forTestOnly : &css->_metadataManager->_rangesToClean; { stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock); if (self->isEmpty()) - return false; + return Action::kFinished; const auto& frontRange = self->_orphans.front().range; range.emplace(frontRange.getMin().getOwned(), frontRange.getMax().getOwned()); notification = self->_orphans.front().notification; } + invariant(range); + + if (action == Action::kWriteOpLog) { + // clang-format off + // Secondaries will watch for this update, and kill any queries that may depend on + // documents in the range -- excepting any queries with a read-concern option + // 'ignoreChunkMigration' + try { + auto& adminSystemVersion = NamespaceString::kConfigCollectionNamespace; + auto epoch = scopedCollectionMetadata->getCollVersion().epoch(); + AutoGetCollection autoAdmin(opCtx, adminSystemVersion, MODE_IX); + + Helpers::upsert(opCtx, adminSystemVersion.ns(), + BSON("_id" << "startRangeDeletion" << "ns" << nss.ns() << "epoch" << epoch + << "min" << range->getMin() << "max" << range->getMax())); + + } catch (DBException const& e) { + stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock); + css->_metadataManager->_clearAllCleanups( + {ErrorCodes::fromInt(e.getCode()), + str::stream() << "cannot push startRangeDeletion record to Op Log," + " abandoning scheduled range deletions: " << e.what()}); + return Action::kFinished; + } + // clang-format on + } try { auto keyPattern = scopedCollectionMetadata->getKeyPattern(); @@ -123,7 +153,6 @@ bool CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx, wrote = e.toStatus(); warning() << e.what(); } - if (!wrote.isOK() || wrote.getValue() == 0) { if (wrote.isOK()) { log() << "No documents remain to delete in " << nss << " range " @@ -131,7 +160,7 @@ bool CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx, } stdx::lock_guard<stdx::mutex> scopedLock(css->_metadataManager->_managerLock); self->_pop(wrote.getStatus()); - return true; + return Action::kWriteOpLog; } } // drop scopedCollectionMetadata } // drop autoColl @@ -174,7 +203,7 @@ bool CollectionRangeDeleter::cleanUpNextRange(OperationContext* opCtx, } notification.abandon(); - return true; + return Action::kMore; } StatusWith<int> CollectionRangeDeleter::_doDeletion(OperationContext* opCtx, diff --git a/src/mongo/db/s/collection_range_deleter.h b/src/mongo/db/s/collection_range_deleter.h index a0a6625ba02..52d3ec44256 100644 --- a/src/mongo/db/s/collection_range_deleter.h +++ b/src/mongo/db/s/collection_range_deleter.h @@ -46,7 +46,8 @@ public: /** * This is an object n that asynchronously changes state when a scheduled range deletion * completes or fails. Call n.ready() to discover if the event has already occurred. Call - * n.waitStatus(opCtx) to sleep waiting for the event, and get its result. + * n.waitStatus(opCtx) to sleep waiting for the event, and get its result. If the wait is + * interrupted, waitStatus throws. * * It is an error to destroy a returned CleanupNotification object n unless either n.ready() * is true or n.abandon() has been called. After n.abandon(), n is in a moved-from state. 
@@ -85,6 +86,9 @@ public: bool operator==(DeleteNotification const& other) const { return notification == other.notification; } + bool operator!=(DeleteNotification const& other) const { + return notification != other.notification; + } private: std::shared_ptr<Notification<Status>> notification; @@ -96,6 +100,8 @@ public: DeleteNotification notification{}; }; + enum class Action { kFinished, kMore, kWriteOpLog }; + CollectionRangeDeleter() = default; ~CollectionRangeDeleter(); @@ -128,8 +134,8 @@ public: bool isEmpty() const; /* - * Notify with the specified status anything waiting on ranges scheduled, before discarding the - * ranges and notifications. + * Notifies with the specified status anything waiting on ranges scheduled, and then discards + * the ranges and notifications. Is called in the destructor. */ void clear(Status); @@ -139,21 +145,22 @@ public: void append(BSONObjBuilder* builder) const; /** - * If any ranges are scheduled to clean, deletes up to maxToDelete documents, notifying watchers - * of ranges as they are done being deleted. It performs its own collection locking so it must - * be called without locks. + * If any range deletions are scheduled, deletes up to maxToDelete documents, notifying + * watchers of ranges as they are done being deleted. It performs its own collection locking, so + * it must be called without locks. * - * The 'rangeDeleterForTestOnly' is used as a utility for unit-tests that directly test the - * CollectionRangeDeleter class so they do not need to set up CollectionShardingState and - * MetadataManager objects. + * Returns kMore or kWriteOpLog if it should be scheduled to run again because there might be + * more documents to delete, or kFinished otherwise. When calling again, pass the value + * returned. * - * Returns true if it should be scheduled to run again because there might be more documents to - * delete, or false otherwise. + * Argument 'forTestOnly' is used in unit tests that exercise the CollectionRangeDeleter class, + * so that they do not need to set up CollectionShardingState and MetadataManager objects. 
*/ - static bool cleanUpNextRange(OperationContext*, - NamespaceString const& nss, - int maxToDelete, - CollectionRangeDeleter* rangeDeleterForTestOnly = nullptr); + static Action cleanUpNextRange(OperationContext*, + NamespaceString const& nss, + Action, + int maxToDelete, + CollectionRangeDeleter* forTestOnly = nullptr); private: /** diff --git a/src/mongo/db/s/collection_range_deleter_test.cpp b/src/mongo/db/s/collection_range_deleter_test.cpp index 6442734d927..adaf90e7d0f 100644 --- a/src/mongo/db/s/collection_range_deleter_test.cpp +++ b/src/mongo/db/s/collection_range_deleter_test.cpp @@ -60,18 +60,25 @@ const std::string kPattern = "_id"; const BSONObj kKeyPattern = BSON(kPattern << 1); const std::string kShardName{"a"}; const HostAndPort dummyHost("dummy", 123); +const NamespaceString kAdminSystemVersion = NamespaceString("admin", "system.version"); class CollectionRangeDeleterTest : public ShardingMongodTestFixture { +public: + using Deletion = CollectionRangeDeleter::Deletion; + using Action = CollectionRangeDeleter::Action; + protected: - bool next(CollectionRangeDeleter& rangeDeleter, int maxToDelete) { + auto next(CollectionRangeDeleter& rangeDeleter, Action action, int maxToDelete) + -> CollectionRangeDeleter::Action { return CollectionRangeDeleter::cleanUpNextRange( - operationContext(), kNss, maxToDelete, &rangeDeleter); + operationContext(), kNss, action, maxToDelete, &rangeDeleter); } std::shared_ptr<RemoteCommandTargeterMock> configTargeter() { return RemoteCommandTargeterMock::get(shardRegistry()->getConfigShard()->getTargeter()); } - - using Deletion = CollectionRangeDeleter::Deletion; + OID const& epoch() { + return _epoch; + } private: void setUp() override; @@ -91,9 +98,19 @@ private: std::unique_ptr<DistLockManager> distLockManager) override { return stdx::make_unique<ShardingCatalogClientMock>(std::move(distLockManager)); } + + OID _epoch; }; +bool operator==(CollectionRangeDeleter::Action a, CollectionRangeDeleter::Action b) { + return (int)a == (int)b; +} +std::ostream& operator<<(std::ostream& os, CollectionRangeDeleter::Action a) { + return os << (int)a; +} + void CollectionRangeDeleterTest::setUp() { + _epoch = OID::gen(); serverGlobalParams.clusterRole = ClusterRole::ShardServer; ShardingMongodTestFixture::setUp(); replicationCoordinator()->alwaysAllowWrites(true); @@ -108,13 +125,12 @@ void CollectionRangeDeleterTest::setUp() { { AutoGetCollection autoColl(operationContext(), kNss, MODE_IX); auto collectionShardingState = CollectionShardingState::get(operationContext(), kNss); - const OID epoch = OID::gen(); collectionShardingState->refreshMetadata( operationContext(), stdx::make_unique<CollectionMetadata>( kKeyPattern, - ChunkVersion(1, 0, epoch), - ChunkVersion(0, 0, epoch), + ChunkVersion(1, 0, epoch()), + ChunkVersion(0, 0, epoch()), SimpleBSONObjComparator::kInstance.makeBSONObjIndexedMap<CachedChunkInfo>())); } } @@ -133,7 +149,7 @@ namespace { // Tests the case that there is nothing in the database. TEST_F(CollectionRangeDeleterTest, EmptyDatabase) { CollectionRangeDeleter rangeDeleter; - ASSERT_FALSE(next(rangeDeleter, 1)); + ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 1)); } // Tests the case that there is data, but it is not in a range to clean. 
@@ -145,14 +161,14 @@ TEST_F(CollectionRangeDeleterTest, NoDataInGivenRangeToClean) { ASSERT_BSONOBJ_EQ(insertedDoc, dbclient.findOne(kNss.toString(), QUERY(kPattern << 25))); std::list<Deletion> ranges; ranges.emplace_back(Deletion(ChunkRange(BSON(kPattern << 0), BSON(kPattern << 10)))); - rangeDeleter.add(std::move(ranges)); + ASSERT_TRUE(rangeDeleter.add(std::move(ranges))); ASSERT_EQ(1u, rangeDeleter.size()); - ASSERT_TRUE(next(rangeDeleter, 1)); + ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kWriteOpLog, 1)); ASSERT_EQ(0u, rangeDeleter.size()); ASSERT_BSONOBJ_EQ(insertedDoc, dbclient.findOne(kNss.toString(), QUERY(kPattern << 25))); - ASSERT_FALSE(next(rangeDeleter, 1)); + ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 1)); } // Tests the case that there is a single document within a range to clean. @@ -166,23 +182,24 @@ TEST_F(CollectionRangeDeleterTest, OneDocumentInOneRangeToClean) { std::list<Deletion> ranges; Deletion deletion{ChunkRange(BSON(kPattern << 0), BSON(kPattern << 10))}; ranges.emplace_back(std::move(deletion)); - rangeDeleter.add(std::move(ranges)); + ASSERT_TRUE(rangeDeleter.add(std::move(ranges))); ASSERT_TRUE(ranges.empty()); // spliced elements out of it auto optNotifn = rangeDeleter.overlaps(ChunkRange(BSON(kPattern << 0), BSON(kPattern << 10))); ASSERT(optNotifn); auto notifn = *optNotifn; ASSERT(!notifn.ready()); - ASSERT_TRUE(next(rangeDeleter, 1)); // actually delete one + ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kWriteOpLog, 1)); // actually delete one ASSERT(!notifn.ready()); ASSERT_EQ(rangeDeleter.size(), 1u); - ASSERT_TRUE(next(rangeDeleter, 1)); // range empty, pop range, notify + // range empty, pop range, notify + ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kMore, 1)); ASSERT_TRUE(rangeDeleter.isEmpty()); ASSERT(notifn.ready() && notifn.waitStatus(operationContext()).isOK()); ASSERT_TRUE(dbclient.findOne(kNss.toString(), QUERY(kPattern << 5)).isEmpty()); - ASSERT_FALSE(next(rangeDeleter, 1)); + ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 1)); } // Tests the case that there are multiple documents within a range to clean. 
@@ -197,12 +214,12 @@ TEST_F(CollectionRangeDeleterTest, MultipleDocumentsInOneRangeToClean) { std::list<Deletion> ranges; Deletion deletion{ChunkRange(BSON(kPattern << 0), BSON(kPattern << 10))}; ranges.emplace_back(std::move(deletion)); - rangeDeleter.add(std::move(ranges)); + ASSERT_TRUE(rangeDeleter.add(std::move(ranges))); - ASSERT_TRUE(next(rangeDeleter, 100)); - ASSERT_TRUE(next(rangeDeleter, 100)); + ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kWriteOpLog, 100)); + ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kMore, 100)); ASSERT_EQUALS(0ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 5))); - ASSERT_FALSE(next(rangeDeleter, 100)); + ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 100)); } // Tests the case that there are multiple documents within a range to clean, and the range deleter @@ -218,21 +235,20 @@ TEST_F(CollectionRangeDeleterTest, MultipleCleanupNextRangeCalls) { std::list<Deletion> ranges; Deletion deletion{ChunkRange(BSON(kPattern << 0), BSON(kPattern << 10))}; ranges.emplace_back(std::move(deletion)); - rangeDeleter.add(std::move(ranges)); + ASSERT_TRUE(rangeDeleter.add(std::move(ranges))); - ASSERT_TRUE(next(rangeDeleter, 1)); + ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kWriteOpLog, 1)); ASSERT_EQUALS(2ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 5))); - ASSERT_TRUE(next(rangeDeleter, 1)); + ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kMore, 1)); ASSERT_EQUALS(1ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 5))); - ASSERT_TRUE(next(rangeDeleter, 1)); - ASSERT_TRUE(next(rangeDeleter, 1)); + ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kMore, 1)); + ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kMore, 1)); ASSERT_EQUALS(0ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 5))); - ASSERT_FALSE(next(rangeDeleter, 1)); + ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 1)); } - // Tests the case that there are two ranges to clean, each containing multiple documents. 
TEST_F(CollectionRangeDeleterTest, MultipleDocumentsInMultipleRangesToClean) { CollectionRangeDeleter rangeDeleter; @@ -247,10 +263,10 @@ TEST_F(CollectionRangeDeleterTest, MultipleDocumentsInMultipleRangesToClean) { std::list<Deletion> ranges; ranges.emplace_back(Deletion{ChunkRange{BSON(kPattern << 0), BSON(kPattern << 4)}}); - rangeDeleter.add(std::move(ranges)); + ASSERT_TRUE(rangeDeleter.add(std::move(ranges))); ASSERT_TRUE(ranges.empty()); ranges.emplace_back(Deletion{ChunkRange{BSON(kPattern << 4), BSON(kPattern << 7)}}); - rangeDeleter.add(std::move(ranges)); + ASSERT_FALSE(rangeDeleter.add(std::move(ranges))); auto optNotifn1 = rangeDeleter.overlaps(ChunkRange{BSON(kPattern << 0), BSON(kPattern << 4)}); ASSERT_TRUE(optNotifn1); @@ -261,29 +277,68 @@ TEST_F(CollectionRangeDeleterTest, MultipleDocumentsInMultipleRangesToClean) { auto& notifn2 = *optNotifn2; ASSERT_FALSE(notifn2.ready()); - ASSERT_TRUE(next(rangeDeleter, 100)); + // test op== on notifications + ASSERT_TRUE(notifn1 == *optNotifn1); + ASSERT_FALSE(notifn1 == *optNotifn2); + ASSERT_TRUE(notifn1 != *optNotifn2); + ASSERT_FALSE(notifn1 != *optNotifn1); + + ASSERT_EQUALS(0ULL, + dbclient.count(kAdminSystemVersion.ns(), BSON(kPattern << "startRangeDeletion"))); + + ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kWriteOpLog, 100)); ASSERT_FALSE(notifn1.ready()); // no trigger yet ASSERT_FALSE(notifn2.ready()); // no trigger yet - ASSERT_EQUALS(0ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 4))); - ASSERT_EQUALS(3ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 10))); - - ASSERT_TRUE(next(rangeDeleter, 100)); // discover there are no more < 4, pop range 1 + ASSERT_EQUALS(1ULL, + dbclient.count(kAdminSystemVersion.ns(), BSON(kPattern << "startRangeDeletion"))); + // clang-format off + ASSERT_BSONOBJ_EQ( + BSON("_id" << "startRangeDeletion" << "ns" << kNss.ns() + << "epoch" << epoch() << "min" << BSON("_id" << 0) << "max" << BSON("_id" << 4)), + dbclient.findOne(kAdminSystemVersion.ns(), QUERY("_id" << "startRangeDeletion"))); + // clang-format on + + ASSERT_EQUALS(0ULL, dbclient.count(kNss.ns(), BSON(kPattern << LT << 4))); + ASSERT_EQUALS(3ULL, dbclient.count(kNss.ns(), BSON(kPattern << LT << 10))); + + // discover there are no more < 4, pop range 1 + ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kMore, 100)); + + ASSERT_EQUALS(1ULL, + dbclient.count(kAdminSystemVersion.ns(), BSON(kPattern << "startRangeDeletion"))); + // clang-format off + ASSERT_BSONOBJ_EQ( + BSON("_id" << "startRangeDeletion" << "ns" << kNss.ns() + << "epoch" << epoch() << "min" << BSON("_id" << 0) << "max" << BSON("_id" << 4)), + dbclient.findOne(kAdminSystemVersion.ns(), QUERY("_id" << "startRangeDeletion"))); + // clang-format on ASSERT_TRUE(notifn1.ready() && notifn1.waitStatus(operationContext()).isOK()); ASSERT_FALSE(notifn2.ready()); - ASSERT_EQUALS(3ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 10))); + ASSERT_EQUALS(3ULL, dbclient.count(kNss.ns(), BSON(kPattern << LT << 10))); - ASSERT_TRUE(next(rangeDeleter, 100)); // delete the remaining documents + // delete the remaining documents + ASSERT_EQ(Action::kMore, next(rangeDeleter, Action::kWriteOpLog, 100)); ASSERT_FALSE(notifn2.ready()); - ASSERT_EQUALS(0ULL, dbclient.count(kNss.toString(), BSON(kPattern << LT << 10))); + // clang-format off + ASSERT_BSONOBJ_EQ( + BSON("_id" << "startRangeDeletion" << "ns" << kNss.ns() + << "epoch" << epoch() << "min" << BSON("_id" << 4) << "max" << BSON("_id" << 7)), + 
dbclient.findOne(kAdminSystemVersion.ns(), QUERY("_id" << "startRangeDeletion"))); + // clang-format on + + ASSERT_EQUALS(0ULL, dbclient.count(kNss.ns(), BSON(kPattern << LT << 10))); + + // discover there are no more, pop range 2 + ASSERT_EQ(Action::kWriteOpLog, next(rangeDeleter, Action::kMore, 1)); - ASSERT_TRUE(next(rangeDeleter, 1)); // discover there are no more, pop range 2 ASSERT_TRUE(notifn2.ready() && notifn2.waitStatus(operationContext()).isOK()); - ASSERT_FALSE(next(rangeDeleter, 1)); // discover there are no more ranges + // discover there are no more ranges + ASSERT_EQ(Action::kFinished, next(rangeDeleter, Action::kWriteOpLog, 1)); } } // unnamed namespace diff --git a/src/mongo/db/s/collection_sharding_state.cpp b/src/mongo/db/s/collection_sharding_state.cpp index 6440415d511..e4a04ed2aaf 100644 --- a/src/mongo/db/s/collection_sharding_state.cpp +++ b/src/mongo/db/s/collection_sharding_state.cpp @@ -134,6 +134,12 @@ auto CollectionShardingState::cleanUpRange(ChunkRange const& range) -> CleanupNo return _metadataManager->cleanUpRange(range); } +auto CollectionShardingState::overlappingMetadata(ChunkRange const& range) const + -> std::vector<ScopedCollectionMetadata> { + return _metadataManager->overlappingMetadata(_metadataManager, range); +} + + MigrationSourceManager* CollectionShardingState::getMigrationSourceManager() { return _sourceMgr; } diff --git a/src/mongo/db/s/collection_sharding_state.h b/src/mongo/db/s/collection_sharding_state.h index ff0489f4d9a..28ad2ad12d1 100644 --- a/src/mongo/db/s/collection_sharding_state.h +++ b/src/mongo/db/s/collection_sharding_state.h @@ -141,6 +141,14 @@ public: auto cleanUpRange(ChunkRange const& range) -> CleanupNotification; /** + * Returns a vector of ScopedCollectionMetadata objects representing metadata instances in use + * by running queries that overlap the argument range, suitable for identifying and invalidating + * those queries. + */ + auto overlappingMetadata(ChunkRange const& range) const + -> std::vector<ScopedCollectionMetadata>; + + /** * Returns the active migration source manager, if one is available. */ MigrationSourceManager* getMigrationSourceManager(); @@ -243,10 +251,12 @@ private: MigrationSourceManager* _sourceMgr{nullptr}; // for access to _metadataManager - friend bool CollectionRangeDeleter::cleanUpNextRange(OperationContext*, + friend auto CollectionRangeDeleter::cleanUpNextRange(OperationContext*, NamespaceString const&, + CollectionRangeDeleter::Action, int maxToDelete, - CollectionRangeDeleter*); + CollectionRangeDeleter*) + -> CollectionRangeDeleter::Action; }; } // namespace mongo diff --git a/src/mongo/db/s/metadata_manager.cpp b/src/mongo/db/s/metadata_manager.cpp index 70cc7de5ee5..de6cfcabd96 100644 --- a/src/mongo/db/s/metadata_manager.cpp +++ b/src/mongo/db/s/metadata_manager.cpp @@ -44,60 +44,67 @@ #include "mongo/util/assert_util.h" #include "mongo/util/log.h" -// MetadataManager maintains std::shared_ptr<CollectionMetadataManager> pointers in a list -// _metadata. It also contains a CollectionRangeDeleter that queues orphan ranges to delete in -// a background thread, and a record of the ranges being migrated in, to avoid deleting them. +// MetadataManager maintains pointers to CollectionMetadata objects in a member list named +// _metadata. Each CollectionMetadata contains an immutable _chunksMap of chunks assigned to this +// shard, along with details related to its own lifecycle in a member _tracker. 
// -// Free-floating CollectionMetadata objects are maintained by these pointers, and also by clients -// via shared pointers in ScopedCollectionMetadata objects. +// The current chunk mapping, used by queries starting up, is at _metadata.back(). Each query, +// when it starts up, requests and holds a ScopedCollectionMetadata object, and destroys it on +// termination. Each ScopedCollectionMetadata keeps a shared_ptr to its CollectionMetadata chunk +// mapping, and to the MetadataManager itself. CollectionMetadata mappings also keep a record of +// chunk ranges that may be deleted when it is determined that the range can no longer be in use. // -// The _tracker member of CollectionMetadata keeps: -// a count of the ScopedCollectionMetadata objects that have pointers to the CollectionMetadata -// a list of key ranges [min,max) of orphaned documents that may be deleted when the count goes -// to zero -// ____________________________ -// (s): std::shared_ptr<> Clients:| ScopedCollectionMetadata | -// _________________________ +----(s) manager metadata (s)-----------------+ -// | CollectionShardingState | | |____________________________| | | -// | _metadataManager (s) | +-------(s) manager metadata (s)-------------+ | -// |____________________|____| | |____________________________| | | | -// ____________________v_______ +----------(s) manager metadata (s) | | | -// | MetadataManager | | |________________________|___| | | -// | |<---+ | | | -// | | ________________________ | | | -// | /----------->| CollectionMetadata |<----+ (1 use) | | -// | [(s),----/ | | ______________________|_ | | -// | (s),------------------->| CollectionMetadata | (0 uses) | | -// | _metadata: (s)]----\ | | | ______________________|_ | | -// | \--------------->| CollectionMetadata | | | -// | | | | | | | | -// | _rangesToClean: | | | | _tracker: |<------------+ | -// | ________________________ | | | | ____________________ |<--------------+ -// | | CollectionRangeDeleter | | | | | | Tracker | | (2 uses) -// | | | | | | | | | | -// | | _orphans [[min,max), | | | | | | usageCounter | | -// | | [min,max), | | | | | | orphans [min,max), | | -// | | ... ] | | | | | | ... ] | | -// | |________________________| | |_| | |____________________| | -// |____________________________| | | _chunksMap | -// |_| _chunkVersion | -// | ... | -// |________________________| +// ScopedCollectionMetadata's destructor decrements the CollectionMetadata's usageCounter. +// Whenever a usageCounter drops to zero, we check whether any now-unused CollectionMetadata +// elements can be popped off the front of _metadata. We need to keep the unused elements in the +// middle (as seen below) because they may schedule deletions of chunks depended on by older +// mappings. // -// A ScopedCollectionMetadata object is created and held during a query, and destroyed when the -// query no longer needs access to the collection. Its destructor decrements the CollectionMetadata -// _tracker member's usageCounter. Note that the collection may become unsharded, and even get -// sharded again, between construction and destruction of a ScopedCollectionMetadata. +// New chunk mappings are pushed onto the back of _metadata. Subsequently started queries use the +// new mapping while still-running queries continue using the older "snapshot" mappings. We treat +// _metadata.back()'s usage count differently from the snapshots because it can't reliably be +// compared to zero; a new query may increment it at any time. 
// -// When a new chunk mapping replaces the active mapping, it is pushed onto the back of _metadata. +// (Note that the collection may be dropped or become unsharded, and even get made and sharded +// again, between construction and destruction of a ScopedCollectionMetadata). // -// A CollectionMetadata object pointed to from _metadata is maintained at least as long as any -// query holds a ScopedCollectionMetadata object referring to it, or to any older one. In the -// diagram above, the middle CollectionMetadata is kept until the one below it is disposed of. +// MetadataManager also contains a CollectionRangeDeleter _rangesToClean that queues orphan ranges +// being deleted in a background thread, and a mapping _receivingChunks of the ranges being migrated +// in, to avoid deleting them. Each range deletion is paired with a notification object triggered +// when the deletion is completed or abandoned. +// +// ____________________________ +// (s): std::shared_ptr<> Clients:| ScopedCollectionMetadata | +// _________________________ +----(s) manager metadata (s)------------------+ +// | CollectionShardingState | | |____________________________| | | +// | _metadataManager (s) | +-------(s) manager metadata (s)--------------+ | +// |____________________|____| | |____________________________| | | | +// ____________________v________ +------------(s) manager metadata (s)-----+ | | +// | MetadataManager | | |____________________________| | | | +// | |<--+ | | | +// | | ___________________________ (1 use) | | | +// | getActiveMetadata(): /---------->| CollectionMetadata |<---------+ | | +// | back(): [(s),------/ | | _________________________|_ | | +// | (s),-------------------->| CollectionMetadata | (0 uses) | | +// | _metadata: (s)]------\ | | | _________________________|_ | | +// | \-------------->| CollectionMetadata | | | +// | _receivingChunks | | | | | (2 uses) | | +// | _rangesToClean: | | | | _tracker: |<---------+ | +// | _________________________ | | | | _______________________ |<-----------+ +// | | CollectionRangeDeleter | | | | | | Tracker | | +// | | | | | | | | | | +// | | _orphans [range,notif, | | | | | | usageCounter | | +// | | range,notif, | | | | | | orphans [range,notif, | | +// | | ... ] | | | | | | range,notif, | | +// | | | | | | | | ... ] | | +// | |_________________________| | |_| | |_______________________| | +// |_____________________________| | | _chunksMap | +// |_| _chunkVersion | +// | ... | +// |___________________________| // // Note that _metadata as shown here has its front() at the bottom, back() at the top. As usual, -// new entries are pushed onto the back, popped off the front. The "active" metadata used by new -// queries (when there is one), is _metadata.back(). +// new entries are pushed onto the back, popped off the front. 
namespace mongo { @@ -118,13 +125,17 @@ MetadataManager::~MetadataManager() { } void MetadataManager::_clearAllCleanups() { + _clearAllCleanups( + {ErrorCodes::InterruptedDueToReplStateChange, + str::stream() << "Range deletions in " << _nss.ns() + << " abandoned because collection was dropped or became unsharded"}); +} + +void MetadataManager::_clearAllCleanups(Status status) { for (auto& metadata : _metadata) { _pushListToClean(std::move(metadata->_tracker.orphans)); } - _rangesToClean.clear({ErrorCodes::InterruptedDueToReplStateChange, - str::stream() << "Range deletions in " << _nss.ns() - << " abandoned because collection was" - " dropped or became unsharded"}); + _rangesToClean.clear(status); } ScopedCollectionMetadata MetadataManager::getActiveMetadata(std::shared_ptr<MetadataManager> self) { @@ -241,10 +252,12 @@ void MetadataManager::_retireExpiredMetadata() { if (!_metadata.front()->_tracker.orphans.empty()) { log() << "Queries possibly dependent on " << _nss.ns() << " range(s) finished; scheduling for deletion"; + // It is safe to push orphan ranges from _metadata.back(), even though new queries might + // start any time, because any request to delete a range it maps is rejected. _pushListToClean(std::move(_metadata.front()->_tracker.orphans)); } if (&_metadata.front() == &_metadata.back()) - break; // do not retire current chunk metadata. + break; // do not pop the active chunk mapping! } } @@ -254,6 +267,8 @@ void MetadataManager::_retireExpiredMetadata() { ScopedCollectionMetadata::ScopedCollectionMetadata(std::shared_ptr<MetadataManager> manager, std::shared_ptr<CollectionMetadata> metadata) : _metadata(std::move(metadata)), _manager(std::move(manager)) { + invariant(_metadata); + invariant(_manager); ++_metadata->_tracker.usageCounter; } @@ -342,15 +357,17 @@ void MetadataManager::append(BSONObjBuilder* builder) { amrArr.done(); } -void MetadataManager::_scheduleCleanup(executor::TaskExecutor* executor, NamespaceString nss) { - executor->scheduleWork([executor, nss](auto&) { +void MetadataManager::_scheduleCleanup(executor::TaskExecutor* executor, + NamespaceString nss, + CollectionRangeDeleter::Action action) { + executor->scheduleWork([executor, nss, action](auto&) { const int maxToDelete = std::max(int(internalQueryExecYieldIterations.load()), 1); Client::initThreadIfNotAlready("Collection Range Deleter"); auto UniqueOpCtx = Client::getCurrent()->makeOperationContext(); auto opCtx = UniqueOpCtx.get(); - bool again = CollectionRangeDeleter::cleanUpNextRange(opCtx, nss, maxToDelete); - if (again) { - _scheduleCleanup(executor, nss); + auto next = CollectionRangeDeleter::cleanUpNextRange(opCtx, nss, action, maxToDelete); + if (next != CollectionRangeDeleter::Action::kFinished) { + _scheduleCleanup(executor, nss, next); } }); } @@ -365,9 +382,9 @@ auto MetadataManager::_pushRangeToClean(ChunkRange const& range) -> CleanupNotif void MetadataManager::_pushListToClean(std::list<Deletion> ranges) { if (_rangesToClean.add(std::move(ranges))) { - _scheduleCleanup(_executor, _nss); + _scheduleCleanup(_executor, _nss, CollectionRangeDeleter::Action::kWriteOpLog); } - dassert(ranges.empty()); + invariant(ranges.empty()); } void MetadataManager::_addToReceiving(ChunkRange const& range) { @@ -442,6 +459,28 @@ auto MetadataManager::cleanUpRange(ChunkRange const& range) -> CleanupNotificati return activeMetadata->_tracker.orphans.back().notification; } +auto MetadataManager::overlappingMetadata(std::shared_ptr<MetadataManager> const& self, + ChunkRange const& range) + -> 
std::vector<ScopedCollectionMetadata> { + invariant(!_metadata.empty()); + stdx::lock_guard<stdx::mutex> scopedLock(_managerLock); + std::vector<ScopedCollectionMetadata> result; + result.reserve(_metadata.size()); + auto it = _metadata.crbegin(); // start with the current active chunk mapping + if ((*it)->rangeOverlapsChunk(range)) { + // We ignore the refcount of the active mapping; effectively, we assume it is in use. + result.push_back(ScopedCollectionMetadata(self, *it)); + } + ++it; // step to snapshots + for (auto end = _metadata.crend(); it != end; ++it) { + // We want all the overlapping snapshot mappings still possibly in use by a query. + if ((*it)->_tracker.usageCounter > 0 && (*it)->rangeOverlapsChunk(range)) { + result.push_back(ScopedCollectionMetadata(self, *it)); + } + } + return result; +} + size_t MetadataManager::numberOfRangesToCleanStillInUse() { stdx::lock_guard<stdx::mutex> scopedLock(_managerLock); size_t count = 0; diff --git a/src/mongo/db/s/metadata_manager.h b/src/mongo/db/s/metadata_manager.h index 46da84db9e2..ff0770f450e 100644 --- a/src/mongo/db/s/metadata_manager.h +++ b/src/mongo/db/s/metadata_manager.h @@ -114,6 +114,14 @@ public: CleanupNotification cleanUpRange(ChunkRange const& range); /** + * Returns a vector of ScopedCollectionMetadata objects representing metadata instances in use + * by running queries that overlap the argument range, suitable for identifying and invalidating + * those queries. + */ + auto overlappingMetadata(std::shared_ptr<MetadataManager> const& itself, + ChunkRange const& range) -> std::vector<ScopedCollectionMetadata>; + + /** * Returns the number of ranges scheduled to be cleaned, exclusive of such ranges that might * still be in use by running queries. Outside of test drivers, the actual number may vary * after it returns, so this is really only useful for unit tests. @@ -146,12 +154,19 @@ private: * Each time it completes cleaning up a range, it wakes up clients waiting on completion of * that range, which may then verify their range has no more deletions scheduled, and proceed. */ - static void _scheduleCleanup(executor::TaskExecutor*, NamespaceString nss); + static void _scheduleCleanup(executor::TaskExecutor*, + NamespaceString nss, + CollectionRangeDeleter::Action); // All of the following functions must be called while holding _managerLock. /** - * Cancel all scheduled deletions of orphan ranges, notifying listeners with status + * Cancels all scheduled deletions of orphan ranges, notifying listeners with specified status. + */ + void _clearAllCleanups(Status); + + /** + * Cancels all scheduled deletions of orphan ranges, notifying listeners with status * InterruptedDueToReplStateChange. */ void _clearAllCleanups(); @@ -229,10 +244,12 @@ private: // friends // for access to _rangesToClean and _managerLock under task callback - friend bool CollectionRangeDeleter::cleanUpNextRange(OperationContext*, + friend auto CollectionRangeDeleter::cleanUpNextRange(OperationContext*, NamespaceString const&, + CollectionRangeDeleter::Action, int maxToDelete, - CollectionRangeDeleter*); + CollectionRangeDeleter*) + -> CollectionRangeDeleter::Action; friend class ScopedCollectionMetadata; }; @@ -265,6 +282,17 @@ public: */ operator bool() const; + /** + * Checks whether both objects refer to the identically the same metadata. 
+ */ + bool operator==(ScopedCollectionMetadata const& other) const { + return _metadata == other._metadata; + } + bool operator!=(ScopedCollectionMetadata const& other) const { + return _metadata != other._metadata; + } + + private: /** * Increments the usageCounter in the specified CollectionMetadata. @@ -285,8 +313,12 @@ private: std::shared_ptr<MetadataManager> _manager{nullptr}; + // These use our private ctor friend ScopedCollectionMetadata MetadataManager::getActiveMetadata( - std::shared_ptr<MetadataManager>); // uses our private ctor + std::shared_ptr<MetadataManager>); + friend auto MetadataManager::overlappingMetadata(std::shared_ptr<MetadataManager> const& itself, + ChunkRange const& range) + -> std::vector<ScopedCollectionMetadata>; }; } // namespace mongo diff --git a/src/mongo/db/s/metadata_manager_test.cpp b/src/mongo/db/s/metadata_manager_test.cpp index a3375634d9f..5ba653ac731 100644 --- a/src/mongo/db/s/metadata_manager_test.cpp +++ b/src/mongo/db/s/metadata_manager_test.cpp @@ -222,17 +222,36 @@ TEST_F(MetadataManagerTest, NotificationBlocksUntilDeletion) { ASSERT_EQ(manager->numberOfMetadataSnapshots(), 0UL); ASSERT_EQ(manager->numberOfRangesToClean(), 0UL); - auto scm = manager->getActiveMetadata(manager); // and increment scm's refcount - ASSERT(bool(scm)); - addChunk(manager); // push new metadata - - ASSERT_EQ(manager->numberOfMetadataSnapshots(), 1UL); + auto scm1 = manager->getActiveMetadata(manager); // and increment refcount + ASSERT_TRUE(bool(scm1)); + ASSERT_EQ(0ULL, scm1->getChunks().size()); + + addChunk(manager); // push new metadata + auto scm2 = manager->getActiveMetadata(manager); // and increment refcount + ASSERT_EQ(1ULL, scm2->getChunks().size()); + + // this is here solely to pacify an invariant in addChunk + manager->refreshActiveMetadata(makeEmptyMetadata()); + + addChunk(manager); // push new metadata + auto scm3 = manager->getActiveMetadata(manager); // and increment refcount + ASSERT_EQ(1ULL, scm3->getChunks().size()); + + auto overlaps = + manager->overlappingMetadata(manager, ChunkRange(BSON("key" << 0), BSON("key" << 10))); + ASSERT_EQ(2ULL, overlaps.size()); + std::vector<ScopedCollectionMetadata> ref; + ref.push_back(std::move(scm3)); + ref.push_back(std::move(scm2)); + ASSERT(ref == overlaps); + + ASSERT_EQ(manager->numberOfMetadataSnapshots(), 3UL); ASSERT_EQ(manager->numberOfRangesToClean(), 0UL); // not yet... optNotif = manager->cleanUpRange(cr1); - ASSERT_EQ(manager->numberOfMetadataSnapshots(), 1UL); + ASSERT_EQ(manager->numberOfMetadataSnapshots(), 3UL); ASSERT_EQ(manager->numberOfRangesToClean(), 1UL); - } // scm destroyed, refcount of metadata goes to zero + } // scm1,2,3 destroyed, refcount of each metadata goes to zero ASSERT_EQ(manager->numberOfMetadataSnapshots(), 0UL); ASSERT_EQ(manager->numberOfRangesToClean(), 1UL); ASSERT_FALSE(optNotif->ready()); |
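For the bigger picture of how safe secondary reads are meant to work here: before deleting a range, the primary upserts a startRangeDeletion document into admin.system.version, and the new CollectionShardingState::overlappingMetadata() returns the metadata snapshots still held by running queries that overlap that range, "suitable for identifying and invalidating those queries". The sketch below is only a rough, self-contained model of that flow; StartRangeDeletion, MetadataSnapshot, the integer shard-key bounds, and the query-invalidation step are all stand-ins, and the secondary-side hook itself is not part of this commit.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Stand-in for the startRangeDeletion document (the real one carries the
// namespace, the collection epoch, and BSON shard-key bounds).
struct StartRangeDeletion {
    std::string ns;
    int min, max;
};

// Stand-in for a metadata snapshot pinned by a running query
// (ScopedCollectionMetadata in the patch).
struct MetadataSnapshot {
    int queryId;
    int chunkMin, chunkMax;  // chunk owned under this mapping
    bool overlaps(const StartRangeDeletion& r) const {
        return chunkMin < r.max && r.min < chunkMax;
    }
};

// Rough model of CollectionShardingState::overlappingMetadata(): collect the
// snapshots in use by running queries that overlap the range being deleted.
std::vector<MetadataSnapshot> overlappingMetadata(
    const std::vector<MetadataSnapshot>& inUse, const StartRangeDeletion& range) {
    std::vector<MetadataSnapshot> out;
    for (const auto& snap : inUse)
        if (snap.overlaps(range))
            out.push_back(snap);
    return out;
}

int main() {
    StartRangeDeletion observed{"db.coll", 0, 10};  // illustrative marker seen on a secondary
    std::vector<MetadataSnapshot> inUse{{1, 0, 5}, {2, 20, 30}};
    for (const auto& snap : overlappingMetadata(inUse, observed)) {
        // A secondary would invalidate (kill) queries pinned to these snapshots,
        // unless they opt out, e.g. via the 'ignoreChunkMigration' read-concern
        // option mentioned in the patch.
        std::cout << "invalidate query " << snap.queryId << "\n";
    }
}
```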