summaryrefslogtreecommitdiff
path: root/ndb/src
diff options
context:
space:
mode:
authortomas@poseidon.ndb.mysql.com <>2006-04-26 16:57:45 +0200
committertomas@poseidon.ndb.mysql.com <>2006-04-26 16:57:45 +0200
commitee3bdf586b128b4985730df9ff4c598b4423d0d9 (patch)
treed43661c653b11d5f2a61900dba2839a1a65251b4 /ndb/src
parentedba345e568ec5b4234a1dc8e9a39f2655307e69 (diff)
downloadmariadb-git-ee3bdf586b128b4985730df9ff4c598b4423d0d9.tar.gz
Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
- added more retries to wait for node failure to complete. Bug #19039 multi node failure causes node failure handling not to complete - patch to avoid this scenario when the management server is used to perform the stop - wait for NF_COMPLETE_REP in management server before returning. ndb: allocate nodeid - only retry on retryable error
Diffstat (limited to 'ndb/src')
-rw-r--r--ndb/src/common/mgmcommon/ConfigRetriever.cpp6
-rw-r--r--ndb/src/kernel/vm/Configuration.cpp3
-rw-r--r--ndb/src/mgmapi/mgmapi.cpp13
-rw-r--r--ndb/src/mgmsrv/MgmtSrvr.cpp146
-rw-r--r--ndb/src/mgmsrv/MgmtSrvr.hpp6
-rw-r--r--ndb/src/mgmsrv/Services.cpp17
6 files changed, 141 insertions, 50 deletions
diff --git a/ndb/src/common/mgmcommon/ConfigRetriever.cpp b/ndb/src/common/mgmcommon/ConfigRetriever.cpp
index fb3531d81f6..c2b3e8235eb 100644
--- a/ndb/src/common/mgmcommon/ConfigRetriever.cpp
+++ b/ndb/src/common/mgmcommon/ConfigRetriever.cpp
@@ -349,12 +349,14 @@ ConfigRetriever::allocNodeId(int no_retries, int retry_delay_in_seconds)
if(!ndb_mgm_connect(m_handle, 0, 0, 0))
goto next;
- res= ndb_mgm_alloc_nodeid(m_handle, m_version, m_node_type);
+ res= ndb_mgm_alloc_nodeid(m_handle, m_version, m_node_type,
+ no_retries == 0 /* only log last retry */);
if(res >= 0)
return _ownNodeId= (Uint32)res;
next:
- if (no_retries == 0)
+ int error = ndb_mgm_get_latest_error(m_handle);
+ if (no_retries == 0 || error == NDB_MGM_ALLOCID_CONFIG_MISMATCH)
break;
no_retries--;
NdbSleep_SecSleep(retry_delay_in_seconds);
diff --git a/ndb/src/kernel/vm/Configuration.cpp b/ndb/src/kernel/vm/Configuration.cpp
index 425907e24c6..b73a82df66b 100644
--- a/ndb/src/kernel/vm/Configuration.cpp
+++ b/ndb/src/kernel/vm/Configuration.cpp
@@ -286,7 +286,8 @@ Configuration::fetch_configuration(){
if (globalData.ownId)
cr.setNodeId(globalData.ownId);
- globalData.ownId = cr.allocNodeId(2 /*retry*/,3 /*delay*/);
+ globalData.ownId = cr.allocNodeId(globalData.ownId ? 10 : 2 /*retry*/,
+ 3 /*delay*/);
if(globalData.ownId == 0){
ERROR_SET(fatal, NDBD_EXIT_INVALID_CONFIG,
diff --git a/ndb/src/mgmapi/mgmapi.cpp b/ndb/src/mgmapi/mgmapi.cpp
index b02367a8870..f99478a8cea 100644
--- a/ndb/src/mgmapi/mgmapi.cpp
+++ b/ndb/src/mgmapi/mgmapi.cpp
@@ -1868,7 +1868,8 @@ const char *ndb_mgm_get_connectstring(NdbMgmHandle handle, char *buf, int buf_sz
extern "C"
int
-ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype)
+ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype,
+ int log_event)
{
CHECK_HANDLE(handle, 0);
CHECK_CONNECTED(handle, 0);
@@ -1888,9 +1889,11 @@ ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype)
args.put("endian", (endian_check.c[sizeof(long)-1])?"big":"little");
if (handle->m_name)
args.put("name", handle->m_name);
+ args.put("log_event", log_event);
const ParserRow<ParserDummy> reply[]= {
MGM_CMD("get nodeid reply", NULL, ""),
+ MGM_ARG("error_code", Int, Optional, "Error code"),
MGM_ARG("nodeid", Int, Optional, "Error message"),
MGM_ARG("result", String, Mandatory, "Error message"),
MGM_END()
@@ -1903,14 +1906,16 @@ ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype)
nodeid= -1;
do {
const char * buf;
- if(!prop->get("result", &buf) || strcmp(buf, "Ok") != 0){
+ if (!prop->get("result", &buf) || strcmp(buf, "Ok") != 0)
+ {
const char *hostname= ndb_mgm_get_connected_host(handle);
unsigned port= ndb_mgm_get_connected_port(handle);
BaseString err;
+ Uint32 error_code= NDB_MGM_ALLOCID_ERROR;
err.assfmt("Could not alloc node id at %s port %d: %s",
hostname, port, buf);
- setError(handle, NDB_MGM_COULD_NOT_CONNECT_TO_SOCKET, __LINE__,
- err.c_str());
+ prop->get("error_code", &error_code);
+ setError(handle, error_code, __LINE__, err.c_str());
break;
}
Uint32 _nodeid;
diff --git a/ndb/src/mgmsrv/MgmtSrvr.cpp b/ndb/src/mgmsrv/MgmtSrvr.cpp
index c7d0c11eec4..d40eaab7bd5 100644
--- a/ndb/src/mgmsrv/MgmtSrvr.cpp
+++ b/ndb/src/mgmsrv/MgmtSrvr.cpp
@@ -507,9 +507,10 @@ MgmtSrvr::MgmtSrvr(SocketServer *socket_server,
if (_ownNodeId == 0) // we did not get node id from other server
{
NodeId tmp= m_config_retriever->get_configuration_nodeid();
+ int error_code;
if (!alloc_node_id(&tmp, NDB_MGM_NODE_TYPE_MGM,
- 0, 0, error_string)){
+ 0, 0, error_code, error_string)){
ndbout << "Unable to obtain requested nodeid: "
<< error_string.c_str() << endl;
require(false);
@@ -1118,31 +1119,16 @@ int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids,
const NFCompleteRep * const rep =
CAST_CONSTPTR(NFCompleteRep, signal->getDataPtr());
#ifdef VM_TRACE
- ndbout_c("Node %d fail completed", rep->failedNodeId);
+ ndbout_c("sendSTOP_REQ Node %d fail completed", rep->failedNodeId);
#endif
+ nodes.clear(rep->failedNodeId); // clear the failed node
+ if (singleUserNodeId == 0)
+ stoppedNodes.set(rep->failedNodeId);
break;
}
case GSN_NODE_FAILREP:{
const NodeFailRep * const rep =
CAST_CONSTPTR(NodeFailRep, signal->getDataPtr());
- NodeBitmask failedNodes;
- failedNodes.assign(NodeBitmask::Size, rep->theNodes);
-#ifdef VM_TRACE
- {
- ndbout << "Failed nodes:";
- for (unsigned i = 0; i < 32*NodeBitmask::Size; i++)
- if(failedNodes.get(i))
- ndbout << " " << i;
- ndbout << endl;
- }
-#endif
- failedNodes.bitAND(nodes);
- if (!failedNodes.isclear())
- {
- nodes.bitANDC(failedNodes); // clear the failed nodes
- if (singleUserNodeId == 0)
- stoppedNodes.bitOR(failedNodes);
- }
break;
}
default:
@@ -1263,11 +1249,47 @@ int MgmtSrvr::restartNodes(const Vector<NodeId> &node_ids,
abort,
false,
true,
- nostart,
+ true,
initialStart);
+
+ if (ret)
+ return ret;
+
if (stopCount)
*stopCount = nodes.count();
- return ret;
+
+ // start up the nodes again
+ int waitTime = 12000;
+ NDB_TICKS maxTime = NdbTick_CurrentMillisecond() + waitTime;
+ for (unsigned i = 0; i < node_ids.size(); i++)
+ {
+ NodeId nodeId= node_ids[i];
+ enum ndb_mgm_node_status s;
+ s = NDB_MGM_NODE_STATUS_NO_CONTACT;
+#ifdef VM_TRACE
+ ndbout_c("Waiting for %d not started", nodeId);
+#endif
+ while (s != NDB_MGM_NODE_STATUS_NOT_STARTED && waitTime > 0)
+ {
+ Uint32 startPhase = 0, version = 0, dynamicId = 0, nodeGroup = 0;
+ Uint32 connectCount = 0;
+ bool system;
+ const char *address;
+ status(nodeId, &s, &version, &startPhase,
+ &system, &dynamicId, &nodeGroup, &connectCount, &address);
+ NdbSleep_MilliSleep(100);
+ waitTime = (maxTime - NdbTick_CurrentMillisecond());
+ }
+ }
+
+ if (nostart)
+ return 0;
+
+ for (unsigned i = 0; i < node_ids.size(); i++)
+ {
+ int result = start(node_ids[i]);
+ }
+ return 0;
}
/*
@@ -1918,7 +1940,8 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
enum ndb_mgm_node_type type,
struct sockaddr *client_addr,
SOCKET_SIZE_TYPE *client_addr_len,
- BaseString &error_string)
+ int &error_code, BaseString &error_string,
+ int log_event)
{
DBUG_ENTER("MgmtSrvr::alloc_node_id");
DBUG_PRINT("enter", ("nodeid=%d, type=%d, client_addr=%d",
@@ -1927,6 +1950,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
if (*nodeId == 0) {
error_string.appfmt("no-nodeid-checks set in management server.\n"
"node id must be set explicitly in connectstring");
+ error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
DBUG_RETURN(false);
}
DBUG_RETURN(true);
@@ -1951,8 +1975,10 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
if(NdbMutex_Lock(m_configMutex))
{
+ // should not happen
error_string.appfmt("unable to lock configuration mutex");
- return false;
+ error_code = NDB_MGM_ALLOCID_ERROR;
+ DBUG_RETURN(false);
}
ndb_mgm_configuration_iterator
iter(* _config->m_configValues, CFG_SECTION_NODE);
@@ -2023,6 +2049,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
"or specifying unique host names in config file.",
id_found, tmp);
NdbMutex_Unlock(m_configMutex);
+ error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
DBUG_RETURN(false);
}
if (config_hostname == 0) {
@@ -2031,6 +2058,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
"or specifying unique host names in config file,\n"
"or specifying just one mgmt server in config file.",
tmp);
+ error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
DBUG_RETURN(false);
}
id_found= tmp; // mgmt server matched, check for more matches
@@ -2072,8 +2100,9 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
char tmp_str[128];
m_reserved_nodes.getText(tmp_str);
- g_eventLogger.info("Mgmt server state: nodeid %d reserved for ip %s, m_reserved_nodes %s.",
- id_found, get_connect_address(id_found), tmp_str);
+ g_eventLogger.info("Mgmt server state: nodeid %d reserved for ip %s, "
+ "m_reserved_nodes %s.",
+ id_found, get_connect_address(id_found), tmp_str);
DBUG_RETURN(true);
}
@@ -2093,26 +2122,48 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
type_c_string.assfmt("%s(%s)", alias, str);
}
- if (*nodeId == 0) {
+ if (*nodeId == 0)
+ {
if (found_matching_id)
+ {
if (found_matching_type)
+ {
if (found_free_node)
+ {
error_string.appfmt("Connection done from wrong host ip %s.",
(client_addr)?
- inet_ntoa(((struct sockaddr_in *)
+ inet_ntoa(((struct sockaddr_in *)
(client_addr))->sin_addr):"");
+ error_code = NDB_MGM_ALLOCID_ERROR;
+ }
else
+ {
error_string.appfmt("No free node id found for %s.",
type_string.c_str());
+ error_code = NDB_MGM_ALLOCID_ERROR;
+ }
+ }
else
+ {
error_string.appfmt("No %s node defined in config file.",
type_string.c_str());
+ error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
+ }
+ }
else
+ {
error_string.append("No nodes defined in config file.");
- } else {
+ error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
+ }
+ }
+ else
+ {
if (found_matching_id)
+ {
if (found_matching_type)
- if (found_free_node) {
+ {
+ if (found_free_node)
+ {
// have to split these into two since inet_ntoa overwrites itself
error_string.appfmt("Connection with id %d done from wrong host ip %s,",
*nodeId, inet_ntoa(((struct sockaddr_in *)
@@ -2120,27 +2171,44 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId,
error_string.appfmt(" expected %s(%s).", config_hostname,
r_config_addr ?
"lookup failed" : inet_ntoa(config_addr));
- } else
+ error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
+ }
+ else
+ {
error_string.appfmt("Id %d already allocated by another node.",
*nodeId);
+ error_code = NDB_MGM_ALLOCID_ERROR;
+ }
+ }
else
+ {
error_string.appfmt("Id %d configured as %s, connect attempted as %s.",
*nodeId, type_c_string.c_str(),
type_string.c_str());
+ error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
+ }
+ }
else
+ {
error_string.appfmt("No node defined with id=%d in config file.",
*nodeId);
+ error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH;
+ }
}
- g_eventLogger.warning("Allocate nodeid (%d) failed. Connection from ip %s. "
- "Returned error string \"%s\"",
- *nodeId,
- client_addr != 0 ? inet_ntoa(((struct sockaddr_in *)(client_addr))->sin_addr) : "<none>",
- error_string.c_str());
-
- NodeBitmask connected_nodes2;
- get_connected_nodes(connected_nodes2);
+ if (log_event || error_code == NDB_MGM_ALLOCID_CONFIG_MISMATCH)
{
+ g_eventLogger.warning("Allocate nodeid (%d) failed. Connection from ip %s."
+ " Returned error string \"%s\"",
+ *nodeId,
+ client_addr != 0
+ ? inet_ntoa(((struct sockaddr_in *)
+ (client_addr))->sin_addr)
+ : "<none>",
+ error_string.c_str());
+
+ NodeBitmask connected_nodes2;
+ get_connected_nodes(connected_nodes2);
BaseString tmp_connected, tmp_not_connected;
for(Uint32 i = 0; i < MAX_NODES; i++)
{
diff --git a/ndb/src/mgmsrv/MgmtSrvr.hpp b/ndb/src/mgmsrv/MgmtSrvr.hpp
index e2eb33d1198..007494a277d 100644
--- a/ndb/src/mgmsrv/MgmtSrvr.hpp
+++ b/ndb/src/mgmsrv/MgmtSrvr.hpp
@@ -434,8 +434,10 @@ public:
*/
bool getNextNodeId(NodeId * _nodeId, enum ndb_mgm_node_type type) const ;
bool alloc_node_id(NodeId * _nodeId, enum ndb_mgm_node_type type,
- struct sockaddr *client_addr, SOCKET_SIZE_TYPE *client_addr_len,
- BaseString &error_string);
+ struct sockaddr *client_addr,
+ SOCKET_SIZE_TYPE *client_addr_len,
+ int &error_code, BaseString &error_string,
+ int log_event = 1);
/**
*
diff --git a/ndb/src/mgmsrv/Services.cpp b/ndb/src/mgmsrv/Services.cpp
index ee97235912a..555c2082480 100644
--- a/ndb/src/mgmsrv/Services.cpp
+++ b/ndb/src/mgmsrv/Services.cpp
@@ -138,6 +138,7 @@ ParserRow<MgmApiSession> commands[] = {
MGM_ARG("endian", String, Optional, "Endianness"),
MGM_ARG("name", String, Optional, "Name of connection"),
MGM_ARG("timeout", Int, Optional, "Timeout in seconds"),
+ MGM_ARG("log_event", Int, Optional, "Log failure in cluster log"),
MGM_CMD("get version", &MgmApiSession::getVersion, ""),
@@ -425,6 +426,8 @@ MgmApiSession::get_nodeid(Parser_t::Context &,
const char * public_key;
const char * endian= NULL;
const char * name= NULL;
+ Uint32 log_event= 1;
+ bool log_event_version;
union { long l; char c[sizeof(long)]; } endian_check;
args.get("version", &version);
@@ -437,6 +440,8 @@ MgmApiSession::get_nodeid(Parser_t::Context &,
args.get("endian", &endian);
args.get("name", &name);
args.get("timeout", &timeout);
+ /* for backwards compatability keep track if client uses new protocol */
+ log_event_version= args.get("log_event", &log_event);
endian_check.l = 1;
if(endian
@@ -476,11 +481,15 @@ MgmApiSession::get_nodeid(Parser_t::Context &,
NodeId tmp= nodeid;
if(tmp == 0 || !m_allocated_resources->is_reserved(tmp)){
BaseString error_string;
+ int error_code;
NDB_TICKS tick= 0;
+ /* only report error on second attempt as not to clog the cluster log */
while (!m_mgmsrv.alloc_node_id(&tmp, (enum ndb_mgm_node_type)nodetype,
- &addr, &addrlen, error_string))
+ &addr, &addrlen, error_code, error_string,
+ tick == 0 ? 0 : log_event))
{
- if (tick == 0)
+ /* NDB_MGM_ALLOCID_CONFIG_MISMATCH is a non retriable error */
+ if (tick == 0 && error_code != NDB_MGM_ALLOCID_CONFIG_MISMATCH)
{
// attempt to free any timed out reservations
tick= NdbTick_CurrentMillisecond();
@@ -492,6 +501,7 @@ MgmApiSession::get_nodeid(Parser_t::Context &,
ps.tick= tick;
m_mgmsrv.get_socket_server()->
foreachSession(stop_session_if_timed_out,&ps);
+ error_string = "";
continue;
}
const char *alias;
@@ -500,6 +510,9 @@ MgmApiSession::get_nodeid(Parser_t::Context &,
nodetype, &str);
m_output->println(cmd);
m_output->println("result: %s", error_string.c_str());
+ /* only use error_code protocol if client knows about it */
+ if (log_event_version)
+ m_output->println("error_code: %d", error_code);
m_output->println("");
return;
}