SERVER-31521 Make OperationContextGroup have a non-sticky interrupt and update ShardServerCatalogCacheLoader accordingly

author: Dianna Hohensee <dianna.hohensee@10gen.com> 2017-10-17 15:59:28 -0400
committer: Dianna Hohensee <dianna.hohensee@10gen.com> 2017-10-19 16:29:11 -0400
commit: 88c39edf9275e47e5db4b1dbffb0bf51f9187731 (patch)
tree: c4cb3f2f181d0aa820d2b663a246543e92f4338f /src/mongo
parent: 6459867cc36f379d01f2f99642f6edcabdd3c1ce (diff)
download: mongo-88c39edf9275e47e5db4b1dbffb0bf51f9187731.tar.gz
5 files changed, 29 insertions, 42 deletions
diff --git a/src/mongo/db/operation_context_group.cpp b/src/mongo/db/operation_context_group.cpp
index 30305044b23..df106f1df08 100644
--- a/src/mongo/db/operation_context_group.cpp
+++ b/src/mongo/db/operation_context_group.cpp
@@ -78,9 +78,6 @@ auto OperationContextGroup::adopt(UniqueOperationContext opCtx) -> Context {
     invariant(cp);
     stdx::lock_guard<stdx::mutex> lk(_lock);
     _contexts.emplace_back(std::move(opCtx));
-    if (_interrupted) {
-        interruptOne(cp, _interrupted);
-    }
     return Context(*cp, *this);
 }
 
@@ -101,17 +98,11 @@ auto OperationContextGroup::take(Context ctx) -> Context {
 void OperationContextGroup::interrupt(ErrorCodes::Error code) {
     invariant(code);
     stdx::lock_guard<stdx::mutex> lk(_lock);
-    _interrupted = code;
     for (auto&& uniqueOperationContext : _contexts) {
         interruptOne(uniqueOperationContext.get(), code);
     }
 }
 
-void OperationContextGroup::resetInterrupt() {
-    stdx::lock_guard<stdx::mutex> lk(_lock);
-    _interrupted = ErrorCodes::Error{};
-}
-
 bool OperationContextGroup::isEmpty() {
     stdx::lock_guard<stdx::mutex> lk(_lock);
     return _contexts.empty();
diff --git a/src/mongo/db/operation_context_group.h b/src/mongo/db/operation_context_group.h
index c2a0eded364..e610c1c6058 100644
--- a/src/mongo/db/operation_context_group.h
+++ b/src/mongo/db/operation_context_group.h
@@ -60,7 +60,7 @@ public:
      * Makes an OperationContext on `client` and returns a Context object to track it.  On
      * destruction of the returned Context, the OperationContext is destroyed and its corresponding
      * entry in *this is erased.  If *this has been interrupted already, the new context will be
-     * interrupted immediately (taking and releasing the client lock).
+     * interrupted immediately.
      */
     Context makeOperationContext(Client& client);
 
@@ -68,7 +68,7 @@ public:
      * Takes ownership of the OperationContext from `ctx`, and returns a Context object to track it.
      * On destruction of the Context, its entry in *this is erased and its corresponding
      * OperationContext is destroyed. If *this has been interrupted already, `ctx` will be
-     * interrupted immediately (taking and releasing the client lock).
+     * interrupted immediately.
      */
     Context adopt(UniqueOperationContext ctx);
 
@@ -77,25 +77,15 @@ public:
      * Do this to protect an OperationContext from being interrupted along with the rest of its
      * group, or to expose `ctx` to this->interrupt().  Taking from a Context already in *this is
      * equivalent to moving from `ctx`. Taking a moved-from Context yields another moved-from
-     * Context.  If *this has been interrupted already, `ctx` will be interrupted immediately
-     * (taking and releasing the client lock).
+     * Context.
      */
     Context take(Context ctx);
 
     /*
-     * Interrupts all the OperationContexts maintained in *this. Contexts subsequently added to the
-     * group will be interrupted immediately until resetInterrupt() is called.  The lock is taken on
-     * each affected client while interrupting its operation.
-     *
-     * Note: Takes and releases each context's client lock.
+     * Interrupts all the OperationContexts maintained in *this.
      */
     void interrupt(ErrorCodes::Error);
 
-    /*
-     * Unsticks the interrupting state of *this.  Subsequently added contexts are not interrupted.
-     */
-    void resetInterrupt();
-
     /**
      * Reports whether the group has any OperationContexts.  This must be true before the destructor
      * is called.  Its usefulness is typically limited to invariants.
@@ -107,7 +97,6 @@ private:
 
     stdx::mutex _lock;
     std::vector<UniqueOperationContext> _contexts;
-    ErrorCodes::Error _interrupted{};
 
 };  // class OperationContextGroup
 
diff --git a/src/mongo/db/operation_context_test.cpp b/src/mongo/db/operation_context_test.cpp
index b5c8fe2c91d..ac14497fa25 100644
--- a/src/mongo/db/operation_context_test.cpp
+++ b/src/mongo/db/operation_context_test.cpp
@@ -136,22 +136,9 @@ TEST(OperationContextTest, OpCtxGroup) {
         group1.interrupt(ErrorCodes::InternalError);
         ASSERT_FALSE(opCtx3->checkForInterruptNoAssert().isOK());
         ASSERT_FALSE((*opCtx4).checkForInterruptNoAssert().isOK());
-
-        auto serviceCtx3 = stdx::make_unique<ServiceContextNoop>();
-        auto client3 = serviceCtx3->makeClient("OperationContextTest3");
-        auto opCtx5 = group1.makeOperationContext(*client3);
-        ASSERT_FALSE(opCtx5->checkForInterruptNoAssert().isOK());  // interrupt is sticky
     }
     ASSERT_TRUE(group1.isEmpty());
 
-    {
-        group1.resetInterrupt();
-        auto serviceCtx1 = stdx::make_unique<ServiceContextNoop>();
-        auto client1 = serviceCtx1->makeClient("OperationContextTest3");
-        auto opCtx1 = group1.makeOperationContext(*client1);
-        ASSERT_TRUE(opCtx1->checkForInterruptNoAssert().isOK());  // interrupt unstuck
-    }
-
     OperationContextGroup group2;
     {
         auto serviceCtx = stdx::make_unique<ServiceContextNoop>();
diff --git a/src/mongo/db/s/shard_server_catalog_cache_loader.cpp b/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
index 70d1a50968c..eb3168ea4d7 100644
--- a/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
+++ b/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
@@ -283,8 +283,14 @@ ShardServerCatalogCacheLoader::ShardServerCatalogCacheLoader(
 }
 
 ShardServerCatalogCacheLoader::~ShardServerCatalogCacheLoader() {
-    _contexts.interrupt(ErrorCodes::InterruptedAtShutdown);
+    // Prevent further scheduling, then interrupt ongoing tasks.
     _threadPool.shutdown();
+    {
+        stdx::lock_guard<stdx::mutex> lock(_mutex);
+        _contexts.interrupt(ErrorCodes::InterruptedAtShutdown);
+        ++_term;
+    }
+
     _threadPool.join();
     invariant(_contexts.isEmpty());
 }
@@ -315,7 +321,6 @@ void ShardServerCatalogCacheLoader::onStepDown() {
 void ShardServerCatalogCacheLoader::onStepUp() {
     stdx::lock_guard<stdx::mutex> lock(_mutex);
     invariant(_role != ReplicaSetRole::None);
-    _contexts.resetInterrupt();
     ++_term;
     _role = ReplicaSetRole::Primary;
 }
@@ -341,6 +346,22 @@ std::shared_ptr<Notification<void>> ShardServerCatalogCacheLoader::getChunksSinc
     uassertStatusOK(_threadPool.schedule(
         [ this, nss, version, callbackFn, notify, isPrimary, currentTerm ]() noexcept {
             auto context = _contexts.makeOperationContext(*Client::getCurrent());
+
+            {
+                stdx::lock_guard<stdx::mutex> lock(_mutex);
+                // We may have missed an OperationContextGroup interrupt since this operation began
+                // but before the OperationContext was added to the group. So we'll check that
+                // we're still in the same _term.
+                if (_term != currentTerm) {
+                    callbackFn(context.opCtx(),
+                               Status{ErrorCodes::Interrupted,
+                                      "Unable to refresh routing table because replica set state "
+                                      "changed or node is shutting down."});
+                    notify->set();
+                    return;
+                }
+            }
+
             try {
                 if (isPrimary) {
                     _schedulePrimaryGetChunksSince(
diff --git a/src/mongo/db/s/shard_server_catalog_cache_loader.h b/src/mongo/db/s/shard_server_catalog_cache_loader.h
index b06147fe01b..4a6debf8326 100644
--- a/src/mongo/db/s/shard_server_catalog_cache_loader.h
+++ b/src/mongo/db/s/shard_server_catalog_cache_loader.h
@@ -351,9 +351,8 @@ private:
     // Map to track in progress persisted cache updates on the shard primary.
     TaskLists _taskLists;
 
-    // This value is increment every time this server changes from primary to secondary and vice
-    // versa. In this way, if a task is scheduled with one term value and then execution is
-    // attempted during another term, we can skip the operation because it is no longer valid.
+    // This value is bumped every time the set of currently scheduled tasks should no longer be
+    // running. This includes, replica set state transitions and shutdown.
     long long _term{0};
 
     // Indicates whether this server is the primary or not, so that the appropriate loading action
author	Dianna Hohensee <dianna.hohensee@10gen.com>	2017-10-17 15:59:28 -0400
committer	Dianna Hohensee <dianna.hohensee@10gen.com>	2017-10-19 16:29:11 -0400
commit	88c39edf9275e47e5db4b1dbffb0bf51f9187731 (patch)
tree	c4cb3f2f181d0aa820d2b663a246543e92f4338f /src/mongo
parent	6459867cc36f379d01f2f99642f6edcabdd3c1ce (diff)
download	mongo-88c39edf9275e47e5db4b1dbffb0bf51f9187731.tar.gz