diff options
author | Alan Conway <aconway@apache.org> | 2011-03-30 19:30:16 +0000 |
---|---|---|
committer | Alan Conway <aconway@apache.org> | 2011-03-30 19:30:16 +0000 |
commit | 61dc74c19dd6248e4f467804b4c8f00c8cf4ad12 (patch) | |
tree | 62a2bfe04a02014f8d43563d76fac434a2061ae2 | |
parent | cfbe48cd9b8432600864e89465c321020a8940cd (diff) | |
download | qpid-python-61dc74c19dd6248e4f467804b4c8f00c8cf4ad12.tar.gz |
QPID-3129: cluster_tests.LongTests.test_failover hangs
Fixes a race condition in Socket::connect (posix/Socket.cpp).
When connecting to a port on the same host which no longer has a
process associated with it, the OS occasionally chooses the remote
port (which is unoccupied) as the port to bind the local end of the
socket, resulting in a "circular" connection.
This seems like something the OS should prevent but I have confirmed
that the sporadic hangs in cluster_tests.LongTests.test_failover on
RHEL5 are caused by such a circular connection.
The fix is to detect circular connections and raise an error.
git-svn-id: https://svn.apache.org/repos/asf/qpid/trunk@1087052 13f79535-47bb-0310-9956-ffa450edef68
-rw-r--r-- | qpid/cpp/src/qpid/sys/posix/Socket.cpp | 37 | ||||
-rw-r--r-- | qpid/cpp/src/tests/brokertest.py | 2 |
2 files changed, 28 insertions, 11 deletions
diff --git a/qpid/cpp/src/qpid/sys/posix/Socket.cpp b/qpid/cpp/src/qpid/sys/posix/Socket.cpp index 7b906f33e8..3449a753e3 100644 --- a/qpid/cpp/src/qpid/sys/posix/Socket.cpp +++ b/qpid/cpp/src/qpid/sys/posix/Socket.cpp @@ -7,9 +7,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -45,9 +45,9 @@ namespace sys { namespace { std::string getName(int fd, bool local, bool includeService = false) { - ::sockaddr_storage name; // big enough for any socket address + ::sockaddr_storage name; // big enough for any socket address ::socklen_t namelen = sizeof(name); - + int result = -1; if (local) { result = ::getsockname(fd, (::sockaddr*)&name, &namelen); @@ -60,8 +60,8 @@ std::string getName(int fd, bool local, bool includeService = false) char servName[NI_MAXSERV]; char dispName[NI_MAXHOST]; if (includeService) { - if (int rc=::getnameinfo((::sockaddr*)&name, namelen, dispName, sizeof(dispName), - servName, sizeof(servName), + if (int rc=::getnameinfo((::sockaddr*)&name, namelen, dispName, sizeof(dispName), + servName, sizeof(servName), NI_NUMERICHOST | NI_NUMERICSERV) != 0) throw QPID_POSIX_ERROR(rc); return std::string(dispName) + ":" + std::string(servName); @@ -75,9 +75,9 @@ std::string getName(int fd, bool local, bool includeService = false) std::string getService(int fd, bool local) { - ::sockaddr_storage name; // big enough for any socket address + ::sockaddr_storage name; // big enough for any socket address ::socklen_t namelen = sizeof(name); - + int result = -1; if (local) { result = ::getsockname(fd, (::sockaddr*)&name, &namelen); @@ -88,8 +88,8 @@ std::string getService(int fd, bool local) 
QPID_POSIX_CHECK(result); char servName[NI_MAXSERV]; - if (int rc=::getnameinfo((::sockaddr*)&name, namelen, 0, 0, - servName, sizeof(servName), + if (int rc=::getnameinfo((::sockaddr*)&name, namelen, 0, 0, + servName, sizeof(servName), NI_NUMERICHOST | NI_NUMERICSERV) != 0) throw QPID_POSIX_ERROR(rc); return servName; @@ -172,6 +172,23 @@ void Socket::connect(const SocketAddress& addr) const (errno != EINPROGRESS)) { throw Exception(QPID_MSG(strError(errno) << ": " << connectname)); } + // When connecting to a port on the same host which no longer has + // a process associated with it, the OS occasionally chooses the + // remote port (which is unoccupied) as the port to bind the local + // end of the socket, resulting in a "circular" connection. + // + // This seems like something the OS should prevent but I have + // confirmed that sporadic hangs in + // cluster_tests.LongTests.test_failover on RHEL5 are caused by + // such a circular connection. + // + // Raise an error if we see such a connection, since we know there is + // no listener on the peer address. + // + if (getLocalAddress() == getPeerAddress()) { + close(); + throw Exception(QPID_MSG("Connection refused: " << connectname)); + } } void diff --git a/qpid/cpp/src/tests/brokertest.py b/qpid/cpp/src/tests/brokertest.py index 1023f7152f..4abe4c2cbe 100644 --- a/qpid/cpp/src/tests/brokertest.py +++ b/qpid/cpp/src/tests/brokertest.py @@ -498,7 +498,7 @@ class BrokerTest(TestCase): r.close() self.assertEqual(expect_contents, actual_contents) -def join(thread, timeout=1): +def join(thread, timeout=10): thread.join(timeout) if thread.isAlive(): raise Exception("Timed out joining thread %s"%thread) |