summaryrefslogtreecommitdiff
path: root/src/mongo
diff options
context:
space:
mode:
authorDianna Hohensee <dianna.hohensee@10gen.com>2017-10-17 15:59:28 -0400
committerDianna Hohensee <dianna.hohensee@10gen.com>2017-10-19 16:29:11 -0400
commit88c39edf9275e47e5db4b1dbffb0bf51f9187731 (patch)
treec4cb3f2f181d0aa820d2b663a246543e92f4338f /src/mongo
parent6459867cc36f379d01f2f99642f6edcabdd3c1ce (diff)
downloadmongo-88c39edf9275e47e5db4b1dbffb0bf51f9187731.tar.gz
SERVER-31521 Make OperationContextGroup have a non-sticky interrupt and update ShardServerCatalogCacheLoader accordingly
Diffstat (limited to 'src/mongo')
-rw-r--r--src/mongo/db/operation_context_group.cpp9
-rw-r--r--src/mongo/db/operation_context_group.h19
-rw-r--r--src/mongo/db/operation_context_test.cpp13
-rw-r--r--src/mongo/db/s/shard_server_catalog_cache_loader.cpp25
-rw-r--r--src/mongo/db/s/shard_server_catalog_cache_loader.h5
5 files changed, 29 insertions, 42 deletions
diff --git a/src/mongo/db/operation_context_group.cpp b/src/mongo/db/operation_context_group.cpp
index 30305044b23..df106f1df08 100644
--- a/src/mongo/db/operation_context_group.cpp
+++ b/src/mongo/db/operation_context_group.cpp
@@ -78,9 +78,6 @@ auto OperationContextGroup::adopt(UniqueOperationContext opCtx) -> Context {
invariant(cp);
stdx::lock_guard<stdx::mutex> lk(_lock);
_contexts.emplace_back(std::move(opCtx));
- if (_interrupted) {
- interruptOne(cp, _interrupted);
- }
return Context(*cp, *this);
}
@@ -101,17 +98,11 @@ auto OperationContextGroup::take(Context ctx) -> Context {
void OperationContextGroup::interrupt(ErrorCodes::Error code) {
invariant(code);
stdx::lock_guard<stdx::mutex> lk(_lock);
- _interrupted = code;
for (auto&& uniqueOperationContext : _contexts) {
interruptOne(uniqueOperationContext.get(), code);
}
}
-void OperationContextGroup::resetInterrupt() {
- stdx::lock_guard<stdx::mutex> lk(_lock);
- _interrupted = ErrorCodes::Error{};
-}
-
bool OperationContextGroup::isEmpty() {
stdx::lock_guard<stdx::mutex> lk(_lock);
return _contexts.empty();
diff --git a/src/mongo/db/operation_context_group.h b/src/mongo/db/operation_context_group.h
index c2a0eded364..e610c1c6058 100644
--- a/src/mongo/db/operation_context_group.h
+++ b/src/mongo/db/operation_context_group.h
@@ -60,7 +60,7 @@ public:
* Makes an OperationContext on `client` and returns a Context object to track it. On
* destruction of the returned Context, the OperationContext is destroyed and its corresponding
* entry in *this is erased. If *this has been interrupted already, the new context will be
- * interrupted immediately (taking and releasing the client lock).
+ * interrupted immediately.
*/
Context makeOperationContext(Client& client);
@@ -68,7 +68,7 @@ public:
* Takes ownership of the OperationContext from `ctx`, and returns a Context object to track it.
* On destruction of the Context, its entry in *this is erased and its corresponding
* OperationContext is destroyed. If *this has been interrupted already, `ctx` will be
- * interrupted immediately (taking and releasing the client lock).
+ * interrupted immediately.
*/
Context adopt(UniqueOperationContext ctx);
@@ -77,25 +77,15 @@ public:
* Do this to protect an OperationContext from being interrupted along with the rest of its
* group, or to expose `ctx` to this->interrupt(). Taking from a Context already in *this is
* equivalent to moving from `ctx`. Taking a moved-from Context yields another moved-from
- * Context. If *this has been interrupted already, `ctx` will be interrupted immediately
- * (taking and releasing the client lock).
+ * Context.
*/
Context take(Context ctx);
/*
- * Interrupts all the OperationContexts maintained in *this. Contexts subsequently added to the
- * group will be interrupted immediately until resetInterrupt() is called. The lock is taken on
- * each affected client while interrupting its operation.
- *
- * Note: Takes and releases each context's client lock.
+ * Interrupts all the OperationContexts maintained in *this.
*/
void interrupt(ErrorCodes::Error);
- /*
- * Unsticks the interrupting state of *this. Subsequently added contexts are not interrupted.
- */
- void resetInterrupt();
-
/**
* Reports whether the group has any OperationContexts. This must be true before the destructor
* is called. Its usefulness is typically limited to invariants.
@@ -107,7 +97,6 @@ private:
stdx::mutex _lock;
std::vector<UniqueOperationContext> _contexts;
- ErrorCodes::Error _interrupted{};
}; // class OperationContextGroup
diff --git a/src/mongo/db/operation_context_test.cpp b/src/mongo/db/operation_context_test.cpp
index b5c8fe2c91d..ac14497fa25 100644
--- a/src/mongo/db/operation_context_test.cpp
+++ b/src/mongo/db/operation_context_test.cpp
@@ -136,22 +136,9 @@ TEST(OperationContextTest, OpCtxGroup) {
group1.interrupt(ErrorCodes::InternalError);
ASSERT_FALSE(opCtx3->checkForInterruptNoAssert().isOK());
ASSERT_FALSE((*opCtx4).checkForInterruptNoAssert().isOK());
-
- auto serviceCtx3 = stdx::make_unique<ServiceContextNoop>();
- auto client3 = serviceCtx3->makeClient("OperationContextTest3");
- auto opCtx5 = group1.makeOperationContext(*client3);
- ASSERT_FALSE(opCtx5->checkForInterruptNoAssert().isOK()); // interrupt is sticky
}
ASSERT_TRUE(group1.isEmpty());
- {
- group1.resetInterrupt();
- auto serviceCtx1 = stdx::make_unique<ServiceContextNoop>();
- auto client1 = serviceCtx1->makeClient("OperationContextTest3");
- auto opCtx1 = group1.makeOperationContext(*client1);
- ASSERT_TRUE(opCtx1->checkForInterruptNoAssert().isOK()); // interrupt unstuck
- }
-
OperationContextGroup group2;
{
auto serviceCtx = stdx::make_unique<ServiceContextNoop>();
diff --git a/src/mongo/db/s/shard_server_catalog_cache_loader.cpp b/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
index 70d1a50968c..eb3168ea4d7 100644
--- a/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
+++ b/src/mongo/db/s/shard_server_catalog_cache_loader.cpp
@@ -283,8 +283,14 @@ ShardServerCatalogCacheLoader::ShardServerCatalogCacheLoader(
}
ShardServerCatalogCacheLoader::~ShardServerCatalogCacheLoader() {
- _contexts.interrupt(ErrorCodes::InterruptedAtShutdown);
+ // Prevent further scheduling, then interrupt ongoing tasks.
_threadPool.shutdown();
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ _contexts.interrupt(ErrorCodes::InterruptedAtShutdown);
+ ++_term;
+ }
+
_threadPool.join();
invariant(_contexts.isEmpty());
}
@@ -315,7 +321,6 @@ void ShardServerCatalogCacheLoader::onStepDown() {
void ShardServerCatalogCacheLoader::onStepUp() {
stdx::lock_guard<stdx::mutex> lock(_mutex);
invariant(_role != ReplicaSetRole::None);
- _contexts.resetInterrupt();
++_term;
_role = ReplicaSetRole::Primary;
}
@@ -341,6 +346,22 @@ std::shared_ptr<Notification<void>> ShardServerCatalogCacheLoader::getChunksSinc
uassertStatusOK(_threadPool.schedule(
[ this, nss, version, callbackFn, notify, isPrimary, currentTerm ]() noexcept {
auto context = _contexts.makeOperationContext(*Client::getCurrent());
+
+ {
+ stdx::lock_guard<stdx::mutex> lock(_mutex);
+ // We may have missed an OperationContextGroup interrupt since this operation began
+ // but before the OperationContext was added to the group. So we'll check that
+ // we're still in the same _term.
+ if (_term != currentTerm) {
+ callbackFn(context.opCtx(),
+ Status{ErrorCodes::Interrupted,
+ "Unable to refresh routing table because replica set state "
+ "changed or node is shutting down."});
+ notify->set();
+ return;
+ }
+ }
+
try {
if (isPrimary) {
_schedulePrimaryGetChunksSince(
diff --git a/src/mongo/db/s/shard_server_catalog_cache_loader.h b/src/mongo/db/s/shard_server_catalog_cache_loader.h
index b06147fe01b..4a6debf8326 100644
--- a/src/mongo/db/s/shard_server_catalog_cache_loader.h
+++ b/src/mongo/db/s/shard_server_catalog_cache_loader.h
@@ -351,9 +351,8 @@ private:
// Map to track in progress persisted cache updates on the shard primary.
TaskLists _taskLists;
- // This value is increment every time this server changes from primary to secondary and vice
- // versa. In this way, if a task is scheduled with one term value and then execution is
- // attempted during another term, we can skip the operation because it is no longer valid.
+ // This value is bumped every time the set of currently scheduled tasks should no longer be
+ // running. This includes, replica set state transitions and shutdown.
long long _term{0};
// Indicates whether this server is the primary or not, so that the appropriate loading action