summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Alan Conway <aconway@apache.org> 2011-03-30 19:30:16 +0000
committer: Alan Conway <aconway@apache.org> 2011-03-30 19:30:16 +0000
commit: 61dc74c19dd6248e4f467804b4c8f00c8cf4ad12 (patch)
tree: 62a2bfe04a02014f8d43563d76fac434a2061ae2
parent: cfbe48cd9b8432600864e89465c321020a8940cd (diff)
download: qpid-python-61dc74c19dd6248e4f467804b4c8f00c8cf4ad12.tar.gz
QPID-3129: cluster_tests.LongTests.test_failover hangs
The fix addresses a race condition in Socket::connect (posix/Socket.cpp). When connecting to a port on the same host which no longer has a process associated with it, the OS occasionally chooses the remote port (which is unoccupied) as the port to bind the local end of the socket, resulting in a "circular" connection. This seems like something the OS should prevent, but I have confirmed that the sporadic hangs in cluster_tests.LongTests.test_failover on RHEL5 are caused by such a circular connection. The fix is to detect circular connections and raise an error.
git-svn-id: https://svn.apache.org/repos/asf/qpid/trunk@1087052 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- qpid/cpp/src/qpid/sys/posix/Socket.cpp 37
-rw-r--r-- qpid/cpp/src/tests/brokertest.py 2
2 files changed, 28 insertions, 11 deletions
diff --git a/qpid/cpp/src/qpid/sys/posix/Socket.cpp b/qpid/cpp/src/qpid/sys/posix/Socket.cpp
index 7b906f33e8..3449a753e3 100644
--- a/qpid/cpp/src/qpid/sys/posix/Socket.cpp
+++ b/qpid/cpp/src/qpid/sys/posix/Socket.cpp
@@ -7,9 +7,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -45,9 +45,9 @@ namespace sys {
namespace {
std::string getName(int fd, bool local, bool includeService = false)
{
- ::sockaddr_storage name; // big enough for any socket address
+ ::sockaddr_storage name; // big enough for any socket address
::socklen_t namelen = sizeof(name);
-
+
int result = -1;
if (local) {
result = ::getsockname(fd, (::sockaddr*)&name, &namelen);
@@ -60,8 +60,8 @@ std::string getName(int fd, bool local, bool includeService = false)
char servName[NI_MAXSERV];
char dispName[NI_MAXHOST];
if (includeService) {
- if (int rc=::getnameinfo((::sockaddr*)&name, namelen, dispName, sizeof(dispName),
- servName, sizeof(servName),
+ if (int rc=::getnameinfo((::sockaddr*)&name, namelen, dispName, sizeof(dispName),
+ servName, sizeof(servName),
NI_NUMERICHOST | NI_NUMERICSERV) != 0)
throw QPID_POSIX_ERROR(rc);
return std::string(dispName) + ":" + std::string(servName);
@@ -75,9 +75,9 @@ std::string getName(int fd, bool local, bool includeService = false)
std::string getService(int fd, bool local)
{
- ::sockaddr_storage name; // big enough for any socket address
+ ::sockaddr_storage name; // big enough for any socket address
::socklen_t namelen = sizeof(name);
-
+
int result = -1;
if (local) {
result = ::getsockname(fd, (::sockaddr*)&name, &namelen);
@@ -88,8 +88,8 @@ std::string getService(int fd, bool local)
QPID_POSIX_CHECK(result);
char servName[NI_MAXSERV];
- if (int rc=::getnameinfo((::sockaddr*)&name, namelen, 0, 0,
- servName, sizeof(servName),
+ if (int rc=::getnameinfo((::sockaddr*)&name, namelen, 0, 0,
+ servName, sizeof(servName),
NI_NUMERICHOST | NI_NUMERICSERV) != 0)
throw QPID_POSIX_ERROR(rc);
return servName;
@@ -172,6 +172,23 @@ void Socket::connect(const SocketAddress& addr) const
(errno != EINPROGRESS)) {
throw Exception(QPID_MSG(strError(errno) << ": " << connectname));
}
+ // When connecting to a port on the same host which no longer has
+ // a process associated with it, the OS occasionally chooses the
+ // remote port (which is unoccupied) as the port to bind the local
+ // end of the socket, resulting in a "circular" connection.
+ //
+ // This seems like something the OS should prevent but I have
+ // confirmed that sporadic hangs in
+ // cluster_tests.LongTests.test_failover on RHEL5 are caused by
+ // such a circular connection.
+ //
+ // Raise an error if we see such a connection, since we know there is
+ // no listener on the peer address.
+ //
+ if (getLocalAddress() == getPeerAddress()) {
+ close();
+ throw Exception(QPID_MSG("Connection refused: " << connectname));
+ }
}
void
diff --git a/qpid/cpp/src/tests/brokertest.py b/qpid/cpp/src/tests/brokertest.py
index 1023f7152f..4abe4c2cbe 100644
--- a/qpid/cpp/src/tests/brokertest.py
+++ b/qpid/cpp/src/tests/brokertest.py
@@ -498,7 +498,7 @@ class BrokerTest(TestCase):
r.close()
self.assertEqual(expect_contents, actual_contents)
-def join(thread, timeout=1):
+def join(thread, timeout=10):
thread.join(timeout)
if thread.isAlive(): raise Exception("Timed out joining thread %s"%thread)