From 88c39edf9275e47e5db4b1dbffb0bf51f9187731 Mon Sep 17 00:00:00 2001 From: Dianna Hohensee Date: Tue, 17 Oct 2017 15:59:28 -0400 Subject: SERVER-31521 Make OperationContextGroup have a non-sticky interrupt and update ShardServerCatalogCacheLoader accordingly --- src/mongo/db/operation_context_group.cpp | 9 -------- src/mongo/db/operation_context_group.h | 19 ++++------------ src/mongo/db/operation_context_test.cpp | 13 ----------- .../db/s/shard_server_catalog_cache_loader.cpp | 25 ++++++++++++++++++++-- src/mongo/db/s/shard_server_catalog_cache_loader.h | 5 ++--- 5 files changed, 29 insertions(+), 42 deletions(-) (limited to 'src/mongo') diff --git a/src/mongo/db/operation_context_group.cpp b/src/mongo/db/operation_context_group.cpp index 30305044b23..df106f1df08 100644 --- a/src/mongo/db/operation_context_group.cpp +++ b/src/mongo/db/operation_context_group.cpp @@ -78,9 +78,6 @@ auto OperationContextGroup::adopt(UniqueOperationContext opCtx) -> Context { invariant(cp); stdx::lock_guard lk(_lock); _contexts.emplace_back(std::move(opCtx)); - if (_interrupted) { - interruptOne(cp, _interrupted); - } return Context(*cp, *this); } @@ -101,17 +98,11 @@ auto OperationContextGroup::take(Context ctx) -> Context { void OperationContextGroup::interrupt(ErrorCodes::Error code) { invariant(code); stdx::lock_guard lk(_lock); - _interrupted = code; for (auto&& uniqueOperationContext : _contexts) { interruptOne(uniqueOperationContext.get(), code); } } -void OperationContextGroup::resetInterrupt() { - stdx::lock_guard lk(_lock); - _interrupted = ErrorCodes::Error{}; -} - bool OperationContextGroup::isEmpty() { stdx::lock_guard lk(_lock); return _contexts.empty(); diff --git a/src/mongo/db/operation_context_group.h b/src/mongo/db/operation_context_group.h index c2a0eded364..e610c1c6058 100644 --- a/src/mongo/db/operation_context_group.h +++ b/src/mongo/db/operation_context_group.h @@ -60,7 +60,7 @@ public: * Makes an OperationContext on `client` and returns a Context object to track it. On * destruction of the returned Context, the OperationContext is destroyed and its corresponding * entry in *this is erased. If *this has been interrupted already, the new context will be - * interrupted immediately (taking and releasing the client lock). + * interrupted immediately. */ Context makeOperationContext(Client& client); @@ -68,7 +68,7 @@ public: * Takes ownership of the OperationContext from `ctx`, and returns a Context object to track it. * On destruction of the Context, its entry in *this is erased and its corresponding * OperationContext is destroyed. If *this has been interrupted already, `ctx` will be - * interrupted immediately (taking and releasing the client lock). + * interrupted immediately. */ Context adopt(UniqueOperationContext ctx); @@ -77,25 +77,15 @@ public: * Do this to protect an OperationContext from being interrupted along with the rest of its * group, or to expose `ctx` to this->interrupt(). Taking from a Context already in *this is * equivalent to moving from `ctx`. Taking a moved-from Context yields another moved-from - * Context. If *this has been interrupted already, `ctx` will be interrupted immediately - * (taking and releasing the client lock). + * Context. */ Context take(Context ctx); /* - * Interrupts all the OperationContexts maintained in *this. Contexts subsequently added to the - * group will be interrupted immediately until resetInterrupt() is called. The lock is taken on - * each affected client while interrupting its operation. - * - * Note: Takes and releases each context's client lock. + * Interrupts all the OperationContexts maintained in *this. */ void interrupt(ErrorCodes::Error); - /* - * Unsticks the interrupting state of *this. Subsequently added contexts are not interrupted. - */ - void resetInterrupt(); - /** * Reports whether the group has any OperationContexts. This must be true before the destructor * is called. Its usefulness is typically limited to invariants. @@ -107,7 +97,6 @@ private: stdx::mutex _lock; std::vector _contexts; - ErrorCodes::Error _interrupted{}; }; // class OperationContextGroup diff --git a/src/mongo/db/operation_context_test.cpp b/src/mongo/db/operation_context_test.cpp index b5c8fe2c91d..ac14497fa25 100644 --- a/src/mongo/db/operation_context_test.cpp +++ b/src/mongo/db/operation_context_test.cpp @@ -136,22 +136,9 @@ TEST(OperationContextTest, OpCtxGroup) { group1.interrupt(ErrorCodes::InternalError); ASSERT_FALSE(opCtx3->checkForInterruptNoAssert().isOK()); ASSERT_FALSE((*opCtx4).checkForInterruptNoAssert().isOK()); - - auto serviceCtx3 = stdx::make_unique(); - auto client3 = serviceCtx3->makeClient("OperationContextTest3"); - auto opCtx5 = group1.makeOperationContext(*client3); - ASSERT_FALSE(opCtx5->checkForInterruptNoAssert().isOK()); // interrupt is sticky } ASSERT_TRUE(group1.isEmpty()); - { - group1.resetInterrupt(); - auto serviceCtx1 = stdx::make_unique(); - auto client1 = serviceCtx1->makeClient("OperationContextTest3"); - auto opCtx1 = group1.makeOperationContext(*client1); - ASSERT_TRUE(opCtx1->checkForInterruptNoAssert().isOK()); // interrupt unstuck - } - OperationContextGroup group2; { auto serviceCtx = stdx::make_unique(); diff --git a/src/mongo/db/s/shard_server_catalog_cache_loader.cpp b/src/mongo/db/s/shard_server_catalog_cache_loader.cpp index 70d1a50968c..eb3168ea4d7 100644 --- a/src/mongo/db/s/shard_server_catalog_cache_loader.cpp +++ b/src/mongo/db/s/shard_server_catalog_cache_loader.cpp @@ -283,8 +283,14 @@ ShardServerCatalogCacheLoader::ShardServerCatalogCacheLoader( } ShardServerCatalogCacheLoader::~ShardServerCatalogCacheLoader() { - _contexts.interrupt(ErrorCodes::InterruptedAtShutdown); + // Prevent further scheduling, then interrupt ongoing tasks. _threadPool.shutdown(); + { + stdx::lock_guard lock(_mutex); + _contexts.interrupt(ErrorCodes::InterruptedAtShutdown); + ++_term; + } + _threadPool.join(); invariant(_contexts.isEmpty()); } @@ -315,7 +321,6 @@ void ShardServerCatalogCacheLoader::onStepDown() { void ShardServerCatalogCacheLoader::onStepUp() { stdx::lock_guard lock(_mutex); invariant(_role != ReplicaSetRole::None); - _contexts.resetInterrupt(); ++_term; _role = ReplicaSetRole::Primary; } @@ -341,6 +346,22 @@ std::shared_ptr> ShardServerCatalogCacheLoader::getChunksSinc uassertStatusOK(_threadPool.schedule( [ this, nss, version, callbackFn, notify, isPrimary, currentTerm ]() noexcept { auto context = _contexts.makeOperationContext(*Client::getCurrent()); + + { + stdx::lock_guard lock(_mutex); + // We may have missed an OperationContextGroup interrupt since this operation began + // but before the OperationContext was added to the group. So we'll check that + // we're still in the same _term. + if (_term != currentTerm) { + callbackFn(context.opCtx(), + Status{ErrorCodes::Interrupted, + "Unable to refresh routing table because replica set state " + "changed or node is shutting down."}); + notify->set(); + return; + } + } + try { if (isPrimary) { _schedulePrimaryGetChunksSince( diff --git a/src/mongo/db/s/shard_server_catalog_cache_loader.h b/src/mongo/db/s/shard_server_catalog_cache_loader.h index b06147fe01b..4a6debf8326 100644 --- a/src/mongo/db/s/shard_server_catalog_cache_loader.h +++ b/src/mongo/db/s/shard_server_catalog_cache_loader.h @@ -351,9 +351,8 @@ private: // Map to track in progress persisted cache updates on the shard primary. TaskLists _taskLists; - // This value is increment every time this server changes from primary to secondary and vice - // versa. In this way, if a task is scheduled with one term value and then execution is - // attempted during another term, we can skip the operation because it is no longer valid. + // This value is bumped every time the set of currently scheduled tasks should no longer be + // running. This includes, replica set state transitions and shutdown. long long _term{0}; // Indicates whether this server is the primary or not, so that the appropriate loading action -- cgit v1.2.1