author     Jason Carey <jcarey@argv.me>  2016-11-30 15:31:19 -0500
committer  Jason Carey <jcarey@argv.me>  2016-12-08 17:34:59 -0500
commit     743aaabc8aa4600599a79f6ef056a8e9e02e0fc6 (patch)
tree       fc6c4d343548506dcedfea8a73a461c61cde86ed
parent     a094b2d8d72d29ebb8197dbc6f549fde2f309508 (diff)
download   mongo-743aaabc8aa4600599a79f6ef056a8e9e02e0fc6.tar.gz
SERVER-27232 Fix early timeout in ASIO connpool
Initial connects and later refreshes have a timeout associated with them in ASIO that isn't linked to any user-generated timeout. When these timeouts trigger, however, they are registered as general failures, and general failures cause us to dump all connections from the pool (propagating that error to all consumers currently waiting for a connection). That scheme is sound for actual I/O errors (a failure to RPC on one connection almost certainly means something is badly wrong with all the other open connections), but it causes us to fail early and often when applied to timeouts.

The fix is to treat timeouts on connect and refresh lightly (start connecting a new connection on timeout) and let the general request timeouts handle timing out user requests.

(cherry picked from commit 78f62c485a390f79c84baea51d840aaa8fb9c999)
-rw-r--r--  src/mongo/executor/connection_pool.cpp       45
-rw-r--r--  src/mongo/executor/connection_pool_test.cpp   97
2 files changed, 129 insertions(+), 13 deletions(-)
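
To make the policy described in the commit message concrete, here is a minimal standalone sketch, not the actual ConnectionPool code: a setup/refresh result handler that treats ExceededTimeLimit as "spawn a replacement connection" while every other error still poisons the pool. The Status type and the spawn/fail hooks below are simplified, hypothetical stand-ins for the real SpecificPool members (spawnConnections, processFailure).

    #include <functional>
    #include <iostream>
    #include <string>

    // Simplified stand-ins for MongoDB's Status/ErrorCodes.
    enum class ErrorCode { OK, ExceededTimeLimit, HostUnreachable };

    struct Status {
        ErrorCode code;
        std::string reason;
        bool isOK() const { return code == ErrorCode::OK; }
    };

    // Hypothetical analogue of the setup/refresh callback in SpecificPool:
    // decide what to do with the result of an internal connect or refresh.
    void handleSetupResult(const Status& status,
                           const std::function<void()>& spawnReplacement,
                           const std::function<void(const Status&)>& failPool) {
        if (status.isOK()) {
            std::cout << "connection is ready for requests\n";
        } else if (status.code == ErrorCode::ExceededTimeLimit) {
            // The internal connect/refresh timer fired. Callers carry their
            // own deadlines, so retry quietly instead of failing every waiter.
            spawnReplacement();
        } else {
            // Genuine I/O errors still cascade: drop the whole pool.
            failPool(status);
        }
    }

    int main() {
        auto spawn = [] { std::cout << "spawning a replacement connection\n"; };
        auto fail = [](const Status& s) {
            std::cout << "dropping all pooled connections: " << s.reason << "\n";
        };

        handleSetupResult({ErrorCode::ExceededTimeLimit, "connect timed out"}, spawn, fail);
        handleSetupResult({ErrorCode::HostUnreachable, "no route to host"}, spawn, fail);
    }

Under this policy the only thing that can time out a caller is the per-request expiration pushed into _requests, which is what the new tests in connection_pool_test.cpp assert.
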
diff --git a/src/mongo/executor/connection_pool.cpp b/src/mongo/executor/connection_pool.cpp
index eb4b9effb97..92fa55aeaf3 100644
--- a/src/mongo/executor/connection_pool.cpp
+++ b/src/mongo/executor/connection_pool.cpp
@@ -118,7 +118,7 @@ private:
void fulfillRequests(stdx::unique_lock<stdx::mutex>& lk);
- void spawnConnections(stdx::unique_lock<stdx::mutex>& lk, const HostAndPort& hostAndPort);
+ void spawnConnections(stdx::unique_lock<stdx::mutex>& lk);
void shutdown();
@@ -143,6 +143,7 @@ private:
Date_t _requestTimerExpiration;
size_t _generation;
bool _inFulfillRequests;
+ bool _inSpawnConnections;
size_t _created;
@@ -253,6 +254,7 @@ ConnectionPool::SpecificPool::SpecificPool(ConnectionPool* parent, const HostAnd
_requestTimer(parent->_factory->makeTimer()),
_generation(0),
_inFulfillRequests(false),
+ _inSpawnConnections(false),
_created(0),
_state(State::kRunning) {}
@@ -282,18 +284,17 @@ void ConnectionPool::SpecificPool::getConnection(const HostAndPort& hostAndPort,
Milliseconds timeout,
stdx::unique_lock<stdx::mutex> lk,
GetConnectionCallback cb) {
- // We need some logic here to handle kNoTimeout, which is defined as -1 Milliseconds. If we just
- // added the timeout, we would get a time 1MS in the past, which would immediately timeout - the
- // exact opposite of what we want.
- auto expiration = (timeout == RemoteCommandRequest::kNoTimeout)
- ? RemoteCommandRequest::kNoExpirationDate
- : _parent->_factory->now() + timeout;
+ if (timeout < Milliseconds(0) || timeout > _parent->_options.refreshTimeout) {
+ timeout = _parent->_options.refreshTimeout;
+ }
+
+ const auto expiration = _parent->_factory->now() + timeout;
_requests.push(make_pair(expiration, std::move(cb)));
updateStateInLock();
- spawnConnections(lk, hostAndPort);
+ spawnConnections(lk);
fulfillRequests(lk);
}
@@ -356,6 +357,14 @@ void ConnectionPool::SpecificPool::returnConnection(ConnectionInterface* connPtr
return;
}
+ // If we've exceeded the time limit, start a new connect, rather than
+ // failing all operations. We do this because the various callers have
+ // their own time limit which is unrelated to our internal one.
+ if (status.code() == ErrorCodes::ExceededTimeLimit) {
+ spawnConnections(lk);
+ return;
+ }
+
// Otherwise pass the failure on through
processFailure(status, std::move(lk));
});
@@ -469,7 +478,7 @@ void ConnectionPool::SpecificPool::fulfillRequests(stdx::unique_lock<stdx::mutex
if (_readyPool.empty()) {
log() << "after drop, pool was empty, going to spawn some connections";
// Spawn some more connections to the bad host if we're all out.
- spawnConnections(lk, conn->getHostAndPort());
+ spawnConnections(lk);
}
// Drop the bad connection.
@@ -499,8 +508,15 @@ void ConnectionPool::SpecificPool::fulfillRequests(stdx::unique_lock<stdx::mutex
// spawn enough connections to satisfy open requests and minpool, while
// honoring maxpool
-void ConnectionPool::SpecificPool::spawnConnections(stdx::unique_lock<stdx::mutex>& lk,
- const HostAndPort& hostAndPort) {
+void ConnectionPool::SpecificPool::spawnConnections(stdx::unique_lock<stdx::mutex>& lk) {
+ // If some other thread (possibly this thread) is spawning connections,
+ // don't keep padding the callstack.
+ if (_inSpawnConnections)
+ return;
+
+ _inSpawnConnections = true;
+ auto guard = MakeGuard([&] { _inSpawnConnections = false; });
+
// We want minConnections <= outstanding requests <= maxConnections
auto target = [&] {
return std::max(
@@ -513,7 +529,7 @@ void ConnectionPool::SpecificPool::spawnConnections(stdx::unique_lock<stdx::mute
std::unique_ptr<ConnectionPool::ConnectionInterface> handle;
try {
// make a new connection and put it in processing
- handle = _parent->_factory->makeConnection(hostAndPort, _generation);
+ handle = _parent->_factory->makeConnection(_hostAndPort, _generation);
} catch (std::system_error& e) {
severe() << "Failed to construct a new connection object: " << e.what();
fassertFailed(40336);
@@ -539,6 +555,11 @@ void ConnectionPool::SpecificPool::spawnConnections(stdx::unique_lock<stdx::mute
// connection lapse
} else if (status.isOK()) {
addToReady(lk, std::move(conn));
+ } else if (status.code() == ErrorCodes::ExceededTimeLimit) {
+ // If we've exceeded the time limit, restart the connect, rather than
+ // failing all operations. We do this because the various callers
+ // have their own time limit which is unrelated to our internal one.
+ spawnConnections(lk);
} else {
// If the setup failed, cascade the failure edge
processFailure(status, std::move(lk));
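
The _inSpawnConnections flag added above is a re-entrancy guard: the ExceededTimeLimit branch can call spawnConnections again from inside the setup callback, and the flag keeps a synchronous failure from padding the call stack. Below is a minimal sketch of the same pattern, with a tiny RAII helper standing in for MongoDB's MakeGuard; the Spawner class is purely illustrative, not the real SpecificPool.

    #include <iostream>

    class Spawner {
    public:
        void spawn() {
            // Already spawning further up this call stack: the outer call
            // will finish the work, so bail out instead of recursing deeper.
            if (_inSpawn)
                return;

            _inSpawn = true;
            ResetOnExit guard{_inSpawn};  // clears the flag on every return path

            std::cout << "spawning a connection\n";

            // A synchronous setup failure could call right back into spawn();
            // the flag above turns that nested call into a no-op.
            spawn();
        }

    private:
        // Small RAII helper standing in for MongoDB's MakeGuard.
        struct ResetOnExit {
            bool& flag;
            ~ResetOnExit() { flag = false; }
        };

        bool _inSpawn = false;
    };

    int main() {
        Spawner s;
        s.spawn();  // prints once; the nested call short-circuits
        s.spawn();  // runs again because the guard reset the flag on exit
    }
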
diff --git a/src/mongo/executor/connection_pool_test.cpp b/src/mongo/executor/connection_pool_test.cpp
index 2528d561e85..8b7a492db5e 100644
--- a/src/mongo/executor/connection_pool_test.cpp
+++ b/src/mongo/executor/connection_pool_test.cpp
@@ -308,13 +308,14 @@ TEST_F(ConnectionPoolTest, refreshTimeoutHappens) {
// see if that pans out. In this case, we'll get a failure on timeout.
ConnectionImpl::pushSetup(Status::OK());
pool.get(HostAndPort(),
- Milliseconds(10000),
+ Milliseconds(1000),
[&](StatusWith<ConnectionPool::ConnectionHandle> swConn) {
ASSERT(!swConn.isOK());
reachedA = true;
});
ASSERT(!reachedA);
+ PoolImpl::setNow(now + Milliseconds(3000));
// Let the refresh timeout
PoolImpl::setNow(now + Milliseconds(4000));
@@ -830,6 +831,100 @@ TEST_F(ConnectionPoolTest, dropConnections) {
ASSERT(reachedB);
}
+/**
+ * Verify that timeouts during setup don't prematurely time out unrelated requests
+ */
+TEST_F(ConnectionPoolTest, SetupTimeoutsDontTimeoutUnrelatedRequests) {
+ ConnectionPool::Options options;
+
+ options.maxConnections = 1;
+ options.refreshTimeout = Seconds(2);
+ ConnectionPool pool(stdx::make_unique<PoolImpl>(), "test pool", options);
+
+ auto now = Date_t::now();
+ PoolImpl::setNow(now);
+
+ boost::optional<StatusWith<ConnectionPool::ConnectionHandle>> conn1;
+ pool.get(HostAndPort(), Seconds(10), [&](StatusWith<ConnectionPool::ConnectionHandle> swConn) {
+ conn1 = std::move(swConn);
+ });
+
+ // initially we haven't called our callback
+ ASSERT(!conn1);
+
+ PoolImpl::setNow(now + Seconds(1));
+
+ // Still haven't fired on conn1
+ ASSERT(!conn1);
+
+ // Get conn2 (which should have an extra second before the timeout)
+ boost::optional<StatusWith<ConnectionPool::ConnectionHandle>> conn2;
+ pool.get(HostAndPort(), Seconds(10), [&](StatusWith<ConnectionPool::ConnectionHandle> swConn) {
+ conn2 = std::move(swConn);
+ });
+
+ PoolImpl::setNow(now + Seconds(2));
+
+ ASSERT(conn1);
+ ASSERT(!conn1->isOK());
+ ASSERT(conn1->getStatus().code() == ErrorCodes::ExceededTimeLimit);
+
+ ASSERT(!conn2);
+}
+
+/**
+ * Verify that timeouts during refresh don't prematurely time out unrelated requests
+ */
+TEST_F(ConnectionPoolTest, RefreshTimeoutsDontTimeoutRequests) {
+ ConnectionPool::Options options;
+
+ options.maxConnections = 1;
+ options.refreshTimeout = Seconds(2);
+ options.refreshRequirement = Seconds(3);
+ ConnectionPool pool(stdx::make_unique<PoolImpl>(), "test pool", options);
+
+ auto now = Date_t::now();
+ PoolImpl::setNow(now);
+
+ // Successfully get a new connection
+ size_t conn1Id = 0;
+ ConnectionImpl::pushSetup(Status::OK());
+ pool.get(HostAndPort(), Seconds(1), [&](StatusWith<ConnectionPool::ConnectionHandle> swConn) {
+ conn1Id = CONN2ID(swConn);
+ doneWith(swConn.getValue());
+ });
+ ASSERT(conn1Id);
+
+ // Force it into refresh
+ PoolImpl::setNow(now + Seconds(3));
+
+ boost::optional<StatusWith<ConnectionPool::ConnectionHandle>> conn1;
+ pool.get(HostAndPort(), Seconds(10), [&](StatusWith<ConnectionPool::ConnectionHandle> swConn) {
+ conn1 = std::move(swConn);
+ });
+
+ // initially we haven't called our callback
+ ASSERT(!conn1);
+
+ // 1 second later we've triggered a refresh and still haven't called the callback
+ PoolImpl::setNow(now + Seconds(4));
+ ASSERT(!conn1);
+
+ // Get conn2 (which should have an extra second before the timeout)
+ boost::optional<StatusWith<ConnectionPool::ConnectionHandle>> conn2;
+ pool.get(HostAndPort(), Seconds(10), [&](StatusWith<ConnectionPool::ConnectionHandle> swConn) {
+ conn2 = std::move(swConn);
+ });
+
+ PoolImpl::setNow(now + Seconds(5));
+
+ ASSERT(conn1);
+ ASSERT(!conn1->isOK());
+ ASSERT(conn1->getStatus().code() == ErrorCodes::ExceededTimeLimit);
+
+ ASSERT(!conn2);
+}
+
} // namespace connection_pool_test_details
} // namespace executor
} // namespace mongo