diff options
author | tomas@poseidon.ndb.mysql.com <> | 2006-04-26 16:57:45 +0200 |
---|---|---|
committer | tomas@poseidon.ndb.mysql.com <> | 2006-04-26 16:57:45 +0200 |
commit | ee3bdf586b128b4985730df9ff4c598b4423d0d9 (patch) | |
tree | d43661c653b11d5f2a61900dba2839a1a65251b4 /ndb/src | |
parent | edba345e568ec5b4234a1dc8e9a39f2655307e69 (diff) | |
download | mariadb-git-ee3bdf586b128b4985730df9ff4c598b4423d0d9.tar.gz |
Bug #18550 ndbd getting "node failure handling not complete..." after graceful restart
- added more retries to wait for node failure to complete
Bug #19039 multi node failure causes node failure handling not to complete
- patch to avoid this scenario when the management server is used to perform the stop
- wait for NF_COMPLETE_REP in management server before returning
ndb: allocate nodeid
- only retry on retryable error
Diffstat (limited to 'ndb/src')
-rw-r--r-- | ndb/src/common/mgmcommon/ConfigRetriever.cpp | 6 | ||||
-rw-r--r-- | ndb/src/kernel/vm/Configuration.cpp | 3 | ||||
-rw-r--r-- | ndb/src/mgmapi/mgmapi.cpp | 13 | ||||
-rw-r--r-- | ndb/src/mgmsrv/MgmtSrvr.cpp | 146 | ||||
-rw-r--r-- | ndb/src/mgmsrv/MgmtSrvr.hpp | 6 | ||||
-rw-r--r-- | ndb/src/mgmsrv/Services.cpp | 17 |
6 files changed, 141 insertions, 50 deletions
diff --git a/ndb/src/common/mgmcommon/ConfigRetriever.cpp b/ndb/src/common/mgmcommon/ConfigRetriever.cpp index fb3531d81f6..c2b3e8235eb 100644 --- a/ndb/src/common/mgmcommon/ConfigRetriever.cpp +++ b/ndb/src/common/mgmcommon/ConfigRetriever.cpp @@ -349,12 +349,14 @@ ConfigRetriever::allocNodeId(int no_retries, int retry_delay_in_seconds) if(!ndb_mgm_connect(m_handle, 0, 0, 0)) goto next; - res= ndb_mgm_alloc_nodeid(m_handle, m_version, m_node_type); + res= ndb_mgm_alloc_nodeid(m_handle, m_version, m_node_type, + no_retries == 0 /* only log last retry */); if(res >= 0) return _ownNodeId= (Uint32)res; next: - if (no_retries == 0) + int error = ndb_mgm_get_latest_error(m_handle); + if (no_retries == 0 || error == NDB_MGM_ALLOCID_CONFIG_MISMATCH) break; no_retries--; NdbSleep_SecSleep(retry_delay_in_seconds); diff --git a/ndb/src/kernel/vm/Configuration.cpp b/ndb/src/kernel/vm/Configuration.cpp index 425907e24c6..b73a82df66b 100644 --- a/ndb/src/kernel/vm/Configuration.cpp +++ b/ndb/src/kernel/vm/Configuration.cpp @@ -286,7 +286,8 @@ Configuration::fetch_configuration(){ if (globalData.ownId) cr.setNodeId(globalData.ownId); - globalData.ownId = cr.allocNodeId(2 /*retry*/,3 /*delay*/); + globalData.ownId = cr.allocNodeId(globalData.ownId ? 
10 : 2 /*retry*/, + 3 /*delay*/); if(globalData.ownId == 0){ ERROR_SET(fatal, NDBD_EXIT_INVALID_CONFIG, diff --git a/ndb/src/mgmapi/mgmapi.cpp b/ndb/src/mgmapi/mgmapi.cpp index b02367a8870..f99478a8cea 100644 --- a/ndb/src/mgmapi/mgmapi.cpp +++ b/ndb/src/mgmapi/mgmapi.cpp @@ -1868,7 +1868,8 @@ const char *ndb_mgm_get_connectstring(NdbMgmHandle handle, char *buf, int buf_sz extern "C" int -ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype) +ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype, + int log_event) { CHECK_HANDLE(handle, 0); CHECK_CONNECTED(handle, 0); @@ -1888,9 +1889,11 @@ ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype) args.put("endian", (endian_check.c[sizeof(long)-1])?"big":"little"); if (handle->m_name) args.put("name", handle->m_name); + args.put("log_event", log_event); const ParserRow<ParserDummy> reply[]= { MGM_CMD("get nodeid reply", NULL, ""), + MGM_ARG("error_code", Int, Optional, "Error code"), MGM_ARG("nodeid", Int, Optional, "Error message"), MGM_ARG("result", String, Mandatory, "Error message"), MGM_END() @@ -1903,14 +1906,16 @@ ndb_mgm_alloc_nodeid(NdbMgmHandle handle, unsigned int version, int nodetype) nodeid= -1; do { const char * buf; - if(!prop->get("result", &buf) || strcmp(buf, "Ok") != 0){ + if (!prop->get("result", &buf) || strcmp(buf, "Ok") != 0) + { const char *hostname= ndb_mgm_get_connected_host(handle); unsigned port= ndb_mgm_get_connected_port(handle); BaseString err; + Uint32 error_code= NDB_MGM_ALLOCID_ERROR; err.assfmt("Could not alloc node id at %s port %d: %s", hostname, port, buf); - setError(handle, NDB_MGM_COULD_NOT_CONNECT_TO_SOCKET, __LINE__, - err.c_str()); + prop->get("error_code", &error_code); + setError(handle, error_code, __LINE__, err.c_str()); break; } Uint32 _nodeid; diff --git a/ndb/src/mgmsrv/MgmtSrvr.cpp b/ndb/src/mgmsrv/MgmtSrvr.cpp index c7d0c11eec4..d40eaab7bd5 100644 --- a/ndb/src/mgmsrv/MgmtSrvr.cpp +++ 
b/ndb/src/mgmsrv/MgmtSrvr.cpp @@ -507,9 +507,10 @@ MgmtSrvr::MgmtSrvr(SocketServer *socket_server, if (_ownNodeId == 0) // we did not get node id from other server { NodeId tmp= m_config_retriever->get_configuration_nodeid(); + int error_code; if (!alloc_node_id(&tmp, NDB_MGM_NODE_TYPE_MGM, - 0, 0, error_string)){ + 0, 0, error_code, error_string)){ ndbout << "Unable to obtain requested nodeid: " << error_string.c_str() << endl; require(false); @@ -1118,31 +1119,16 @@ int MgmtSrvr::sendSTOP_REQ(const Vector<NodeId> &node_ids, const NFCompleteRep * const rep = CAST_CONSTPTR(NFCompleteRep, signal->getDataPtr()); #ifdef VM_TRACE - ndbout_c("Node %d fail completed", rep->failedNodeId); + ndbout_c("sendSTOP_REQ Node %d fail completed", rep->failedNodeId); #endif + nodes.clear(rep->failedNodeId); // clear the failed node + if (singleUserNodeId == 0) + stoppedNodes.set(rep->failedNodeId); break; } case GSN_NODE_FAILREP:{ const NodeFailRep * const rep = CAST_CONSTPTR(NodeFailRep, signal->getDataPtr()); - NodeBitmask failedNodes; - failedNodes.assign(NodeBitmask::Size, rep->theNodes); -#ifdef VM_TRACE - { - ndbout << "Failed nodes:"; - for (unsigned i = 0; i < 32*NodeBitmask::Size; i++) - if(failedNodes.get(i)) - ndbout << " " << i; - ndbout << endl; - } -#endif - failedNodes.bitAND(nodes); - if (!failedNodes.isclear()) - { - nodes.bitANDC(failedNodes); // clear the failed nodes - if (singleUserNodeId == 0) - stoppedNodes.bitOR(failedNodes); - } break; } default: @@ -1263,11 +1249,47 @@ int MgmtSrvr::restartNodes(const Vector<NodeId> &node_ids, abort, false, true, - nostart, + true, initialStart); + + if (ret) + return ret; + if (stopCount) *stopCount = nodes.count(); - return ret; + + // start up the nodes again + int waitTime = 12000; + NDB_TICKS maxTime = NdbTick_CurrentMillisecond() + waitTime; + for (unsigned i = 0; i < node_ids.size(); i++) + { + NodeId nodeId= node_ids[i]; + enum ndb_mgm_node_status s; + s = NDB_MGM_NODE_STATUS_NO_CONTACT; +#ifdef VM_TRACE + 
ndbout_c("Waiting for %d not started", nodeId); +#endif + while (s != NDB_MGM_NODE_STATUS_NOT_STARTED && waitTime > 0) + { + Uint32 startPhase = 0, version = 0, dynamicId = 0, nodeGroup = 0; + Uint32 connectCount = 0; + bool system; + const char *address; + status(nodeId, &s, &version, &startPhase, + &system, &dynamicId, &nodeGroup, &connectCount, &address); + NdbSleep_MilliSleep(100); + waitTime = (maxTime - NdbTick_CurrentMillisecond()); + } + } + + if (nostart) + return 0; + + for (unsigned i = 0; i < node_ids.size(); i++) + { + int result = start(node_ids[i]); + } + return 0; } /* @@ -1918,7 +1940,8 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId, enum ndb_mgm_node_type type, struct sockaddr *client_addr, SOCKET_SIZE_TYPE *client_addr_len, - BaseString &error_string) + int &error_code, BaseString &error_string, + int log_event) { DBUG_ENTER("MgmtSrvr::alloc_node_id"); DBUG_PRINT("enter", ("nodeid=%d, type=%d, client_addr=%d", @@ -1927,6 +1950,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId, if (*nodeId == 0) { error_string.appfmt("no-nodeid-checks set in management server.\n" "node id must be set explicitly in connectstring"); + error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH; DBUG_RETURN(false); } DBUG_RETURN(true); @@ -1951,8 +1975,10 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId, if(NdbMutex_Lock(m_configMutex)) { + // should not happen error_string.appfmt("unable to lock configuration mutex"); - return false; + error_code = NDB_MGM_ALLOCID_ERROR; + DBUG_RETURN(false); } ndb_mgm_configuration_iterator iter(* _config->m_configValues, CFG_SECTION_NODE); @@ -2023,6 +2049,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId, "or specifying unique host names in config file.", id_found, tmp); NdbMutex_Unlock(m_configMutex); + error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH; DBUG_RETURN(false); } if (config_hostname == 0) { @@ -2031,6 +2058,7 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId, "or specifying unique host names in config file,\n" "or specifying just one mgmt server in config file.", 
tmp); + error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH; DBUG_RETURN(false); } id_found= tmp; // mgmt server matched, check for more matches @@ -2072,8 +2100,9 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId, char tmp_str[128]; m_reserved_nodes.getText(tmp_str); - g_eventLogger.info("Mgmt server state: nodeid %d reserved for ip %s, m_reserved_nodes %s.", - id_found, get_connect_address(id_found), tmp_str); + g_eventLogger.info("Mgmt server state: nodeid %d reserved for ip %s, " + "m_reserved_nodes %s.", + id_found, get_connect_address(id_found), tmp_str); DBUG_RETURN(true); } @@ -2093,26 +2122,48 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId, type_c_string.assfmt("%s(%s)", alias, str); } - if (*nodeId == 0) { + if (*nodeId == 0) + { if (found_matching_id) + { if (found_matching_type) + { if (found_free_node) + { error_string.appfmt("Connection done from wrong host ip %s.", (client_addr)? - inet_ntoa(((struct sockaddr_in *) + inet_ntoa(((struct sockaddr_in *) (client_addr))->sin_addr):""); + error_code = NDB_MGM_ALLOCID_ERROR; + } else + { error_string.appfmt("No free node id found for %s.", type_string.c_str()); + error_code = NDB_MGM_ALLOCID_ERROR; + } + } else + { error_string.appfmt("No %s node defined in config file.", type_string.c_str()); + error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH; + } + } else + { error_string.append("No nodes defined in config file."); - } else { + error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH; + } + } + else + { if (found_matching_id) + { if (found_matching_type) - if (found_free_node) { + { + if (found_free_node) + { // have to split these into two since inet_ntoa overwrites itself error_string.appfmt("Connection with id %d done from wrong host ip %s,", *nodeId, inet_ntoa(((struct sockaddr_in *) @@ -2120,27 +2171,44 @@ MgmtSrvr::alloc_node_id(NodeId * nodeId, error_string.appfmt(" expected %s(%s).", config_hostname, r_config_addr ? 
"lookup failed" : inet_ntoa(config_addr)); - } else + error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH; + } + else + { error_string.appfmt("Id %d already allocated by another node.", *nodeId); + error_code = NDB_MGM_ALLOCID_ERROR; + } + } else + { error_string.appfmt("Id %d configured as %s, connect attempted as %s.", *nodeId, type_c_string.c_str(), type_string.c_str()); + error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH; + } + } else + { error_string.appfmt("No node defined with id=%d in config file.", *nodeId); + error_code = NDB_MGM_ALLOCID_CONFIG_MISMATCH; + } } - g_eventLogger.warning("Allocate nodeid (%d) failed. Connection from ip %s. " - "Returned error string \"%s\"", - *nodeId, - client_addr != 0 ? inet_ntoa(((struct sockaddr_in *)(client_addr))->sin_addr) : "<none>", - error_string.c_str()); - - NodeBitmask connected_nodes2; - get_connected_nodes(connected_nodes2); + if (log_event || error_code == NDB_MGM_ALLOCID_CONFIG_MISMATCH) { + g_eventLogger.warning("Allocate nodeid (%d) failed. Connection from ip %s." + " Returned error string \"%s\"", + *nodeId, + client_addr != 0 + ? 
inet_ntoa(((struct sockaddr_in *) + (client_addr))->sin_addr) + : "<none>", + error_string.c_str()); + + NodeBitmask connected_nodes2; + get_connected_nodes(connected_nodes2); BaseString tmp_connected, tmp_not_connected; for(Uint32 i = 0; i < MAX_NODES; i++) { diff --git a/ndb/src/mgmsrv/MgmtSrvr.hpp b/ndb/src/mgmsrv/MgmtSrvr.hpp index e2eb33d1198..007494a277d 100644 --- a/ndb/src/mgmsrv/MgmtSrvr.hpp +++ b/ndb/src/mgmsrv/MgmtSrvr.hpp @@ -434,8 +434,10 @@ public: */ bool getNextNodeId(NodeId * _nodeId, enum ndb_mgm_node_type type) const ; bool alloc_node_id(NodeId * _nodeId, enum ndb_mgm_node_type type, - struct sockaddr *client_addr, SOCKET_SIZE_TYPE *client_addr_len, - BaseString &error_string); + struct sockaddr *client_addr, + SOCKET_SIZE_TYPE *client_addr_len, + int &error_code, BaseString &error_string, + int log_event = 1); /** * diff --git a/ndb/src/mgmsrv/Services.cpp b/ndb/src/mgmsrv/Services.cpp index ee97235912a..555c2082480 100644 --- a/ndb/src/mgmsrv/Services.cpp +++ b/ndb/src/mgmsrv/Services.cpp @@ -138,6 +138,7 @@ ParserRow<MgmApiSession> commands[] = { MGM_ARG("endian", String, Optional, "Endianness"), MGM_ARG("name", String, Optional, "Name of connection"), MGM_ARG("timeout", Int, Optional, "Timeout in seconds"), + MGM_ARG("log_event", Int, Optional, "Log failure in cluster log"), MGM_CMD("get version", &MgmApiSession::getVersion, ""), @@ -425,6 +426,8 @@ MgmApiSession::get_nodeid(Parser_t::Context &, const char * public_key; const char * endian= NULL; const char * name= NULL; + Uint32 log_event= 1; + bool log_event_version; union { long l; char c[sizeof(long)]; } endian_check; args.get("version", &version); @@ -437,6 +440,8 @@ MgmApiSession::get_nodeid(Parser_t::Context &, args.get("endian", &endian); args.get("name", &name); args.get("timeout", &timeout); + /* for backwards compatability keep track if client uses new protocol */ + log_event_version= args.get("log_event", &log_event); endian_check.l = 1; if(endian @@ -476,11 +481,15 @@ 
MgmApiSession::get_nodeid(Parser_t::Context &, NodeId tmp= nodeid; if(tmp == 0 || !m_allocated_resources->is_reserved(tmp)){ BaseString error_string; + int error_code; NDB_TICKS tick= 0; + /* only report error on second attempt as not to clog the cluster log */ while (!m_mgmsrv.alloc_node_id(&tmp, (enum ndb_mgm_node_type)nodetype, - &addr, &addrlen, error_string)) + &addr, &addrlen, error_code, error_string, + tick == 0 ? 0 : log_event)) { - if (tick == 0) + /* NDB_MGM_ALLOCID_CONFIG_MISMATCH is a non retriable error */ + if (tick == 0 && error_code != NDB_MGM_ALLOCID_CONFIG_MISMATCH) { // attempt to free any timed out reservations tick= NdbTick_CurrentMillisecond(); @@ -492,6 +501,7 @@ MgmApiSession::get_nodeid(Parser_t::Context &, ps.tick= tick; m_mgmsrv.get_socket_server()-> foreachSession(stop_session_if_timed_out,&ps); + error_string = ""; continue; } const char *alias; @@ -500,6 +510,9 @@ MgmApiSession::get_nodeid(Parser_t::Context &, nodetype, &str); m_output->println(cmd); m_output->println("result: %s", error_string.c_str()); + /* only use error_code protocol if client knows about it */ + if (log_event_version) + m_output->println("error_code: %d", error_code); m_output->println(""); return; } |