diff options
author | jonas@perch.ndb.mysql.com <> | 2006-04-03 11:26:29 +0200 |
---|---|---|
committer | jonas@perch.ndb.mysql.com <> | 2006-04-03 11:26:29 +0200 |
commit | 1aa9a95065cad59795076c17fe35edfd6f86deef (patch) | |
tree | a100191301a8840180873fe714ef71b05a7f7909 /ndb | |
parent | d367f635e9310365c8d5893f2dc9c8816953672d (diff) | |
download | mariadb-git-1aa9a95065cad59795076c17fe35edfd6f86deef.tar.gz |
ndb - bug#18612
post weekend fixes :-)
change impl. to use READ_NODESREQ to query the state of the other qmgr (partition),
since it has no (current) side effects; this makes it possible to kill only the
starting cluster (if one is started and one is starting)
Diffstat (limited to 'ndb')
-rw-r--r-- | ndb/include/kernel/signaldata/FailRep.hpp | 11 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/qmgr/Qmgr.hpp | 7 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/qmgr/QmgrInit.cpp | 3 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/qmgr/QmgrMain.cpp | 226 | ||||
-rw-r--r-- | ndb/test/ndbapi/testNodeRestart.cpp | 4 |
5 files changed, 124 insertions, 127 deletions
diff --git a/ndb/include/kernel/signaldata/FailRep.hpp b/ndb/include/kernel/signaldata/FailRep.hpp index b1c16294e70..f575d99e865 100644 --- a/ndb/include/kernel/signaldata/FailRep.hpp +++ b/ndb/include/kernel/signaldata/FailRep.hpp @@ -36,7 +36,8 @@ class FailRep { public: STATIC_CONST( SignalLength = 2 ); - + STATIC_CONST( ExtraLength = 1 + NdbNodeBitmask::Size ); + enum FailCause { ZOWN_FAILURE=0, ZOTHER_NODE_WHEN_WE_START=1, @@ -45,13 +46,19 @@ public: ZHEARTBEAT_FAILURE=4, ZLINK_FAILURE=5, ZOTHERNODE_FAILED_DURING_START=6, - ZMULTI_NODE_SHUTDOWN = 7 + ZMULTI_NODE_SHUTDOWN = 7, + ZPARTITIONED_CLUSTER = 8 }; private: Uint32 failNodeId; Uint32 failCause; + /** + * Used when failCause == ZPARTITIONED_CLUSTER + */ + Uint32 president; + Uint32 partition[NdbNodeBitmask::Size]; }; diff --git a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp index 3b623b36206..07e6a2a10c1 100644 --- a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp +++ b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp @@ -124,7 +124,7 @@ public: * * i.e. 
nodes that connect to use, when we already have elected president */ - NdbNodeBitmask c_cmregreq_nodes; + NdbNodeBitmask c_readnodes_nodes; Uint32 c_maxDynamicId; @@ -233,6 +233,8 @@ private: void execREAD_NODESREQ(Signal* signal); void execSET_VAR_REQ(Signal* signal); + void execREAD_NODESREF(Signal* signal); + void execREAD_NODESCONF(Signal* signal); void execAPI_VERSION_REQ(Signal* signal); void execAPI_BROADCAST_REP(Signal* signal); @@ -249,6 +251,8 @@ private: void execARBIT_STOPREP(Signal* signal); // Statement blocks + void check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn); + void node_failed(Signal* signal, Uint16 aFailedNode); void checkStartInterface(Signal* signal); void failReport(Signal* signal, @@ -268,7 +272,6 @@ private: void startphase1(Signal* signal); void electionWon(Signal* signal); void cmInfoconf010Lab(Signal* signal); - bool check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn); void apiHbHandlingLab(Signal* signal); void timerHandlingLab(Signal* signal); diff --git a/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp b/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp index ade880b7e4a..a8fe30d8cfa 100644 --- a/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp +++ b/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp @@ -94,6 +94,9 @@ Qmgr::Qmgr(const class Configuration & conf) addRecSignal(GSN_ARBIT_CHOOSEREF, &Qmgr::execARBIT_CHOOSEREF); addRecSignal(GSN_ARBIT_STOPREP, &Qmgr::execARBIT_STOPREP); + addRecSignal(GSN_READ_NODESREF, &Qmgr::execREAD_NODESREF); + addRecSignal(GSN_READ_NODESCONF, &Qmgr::execREAD_NODESCONF); + initData(); }//Qmgr::Qmgr() diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp index 03f6fa2ae87..c17922dff48 100644 --- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp +++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp @@ -369,13 +369,29 @@ void Qmgr::execCONNECT_REP(Signal* signal) } ndbrequire(!c_start.m_nodes.isWaitingFor(nodeId)); - ndbrequire(!c_cmregreq_nodes.get(nodeId)); - 
c_cmregreq_nodes.set(nodeId); - sendCmRegReq(signal, nodeId); - c_regReqReqSent--; + ndbrequire(!c_readnodes_nodes.get(nodeId)); + c_readnodes_nodes.set(nodeId); + signal->theData[0] = reference(); + sendSignal(calcQmgrBlockRef(nodeId), GSN_READ_NODESREQ, signal, 1, JBA); return; }//Qmgr::execCONNECT_REP() +void +Qmgr::execREAD_NODESCONF(Signal* signal) +{ + check_readnodes_reply(signal, + refToNode(signal->getSendersBlockRef()), + GSN_READ_NODESCONF); +} + +void +Qmgr::execREAD_NODESREF(Signal* signal) +{ + check_readnodes_reply(signal, + refToNode(signal->getSendersBlockRef()), + GSN_READ_NODESREF); +} + /*******************************/ /* CM_INFOCONF */ /*******************************/ @@ -668,12 +684,6 @@ void Qmgr::execCM_REGCONF(Signal* signal) const CmRegConf * const cmRegConf = (CmRegConf *)&signal->theData[0]; Uint32 presidentNodeId = cmRegConf->presidentNodeId; - if (check_cmregreq_reply(signal, presidentNodeId, GSN_CM_REGCONF)) - { - jam(); - return; - } - if (!ndbCompatible_ndb_ndb(NDB_VERSION, cmRegConf->presidentVersion)) { jam(); char buf[128]; @@ -731,8 +741,8 @@ void Qmgr::execCM_REGCONF(Signal* signal) return; }//Qmgr::execCM_REGCONF() -bool -Qmgr::check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn) +void +Qmgr::check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn) { NodeRecPtr myNodePtr; myNodePtr.i = getOwnNodeId(); @@ -741,117 +751,65 @@ Qmgr::check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn) NodeRecPtr nodePtr; nodePtr.i = nodeId; ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec); - - /** - * Try to decide if replying node - * knows who is president - */ - Uint32 president_reply = RNIL; - switch(gsn){ - case GSN_CM_REGREF:{ - jam(); - CmRegRef* ref = (CmRegRef*)signal->getDataPtr(); - switch(ref->errorCode){ - case CmRegRef::ZBUSY: - case CmRegRef::ZBUSY_PRESIDENT: - case CmRegRef::ZBUSY_TO_PRES: - jam(); - /** - * Only president replies this - */ - ndbrequire(nodeId == ref->presidentCandidate); - 
president_reply = nodeId; - break; - case CmRegRef::ZNOT_PRESIDENT: - jam(); - president_reply = ref->presidentCandidate; - break; - case CmRegRef::ZNOT_IN_CFG: - case CmRegRef::ZNOT_DEAD: - case CmRegRef::ZELECTION: - // Neither of these replies give certain president knowledge - jam(); - } - break; - } - case GSN_CM_REGCONF: - jam(); - president_reply = nodeId; - break; - } - - char buf[256]; - switch(c_start.m_gsn){ - case GSN_CM_REGREQ: - jam(); - ndbrequire(c_start.m_nodes.isWaitingFor(nodeId)); - ndbrequire(c_cmregreq_nodes.isclear()); - ndbrequire(myNodePtr.p->phase == ZSTARTING); - return false; - case GSN_CM_NODEINFOREQ: - jam(); - ndbrequire(myNodePtr.p->phase == ZSTARTING); - if (c_start.m_nodes.isWaitingFor(nodeId)) - { - jam(); - /** - * We're waiting for CM_NODEINFO - */ - if (gsn == GSN_CM_REGREF) - { - jam(); - return false; - } - - jam(); - BaseString::snprintf(buf, sizeof(buf), - "Partitioned cluster! check StartPartialTimeout, " - " received CM_REGCONF from %d" - " while waiting for GSN_CM_NODEINFOCONF." - " president=%d", - nodeId, cpresident); - goto die_direct; - } - - goto check_reply; - default: - case GSN_CM_NODEINFOCONF: + ndbrequire(c_readnodes_nodes.get(nodeId)); + ReadNodesConf* conf = (ReadNodesConf*)signal->getDataPtr(); + if (gsn == GSN_READ_NODESREF) + { jam(); - ndbrequire(myNodePtr.p->phase == ZRUNNING); - goto check_reply; +retry: + signal->theData[0] = reference(); + sendSignal(calcQmgrBlockRef(nodeId), GSN_READ_NODESREQ, signal, 1, JBA); + return; } -check_reply: - jam(); - c_cmregreq_nodes.clear(nodeId); - - if (gsn == GSN_CM_REGCONF) + if (conf->masterNodeId == ZNIL) { jam(); - BaseString::snprintf(buf, sizeof(buf), - "Partitioned cluster! 
check StartPartialTimeout, " - " received CM_REGCONF" - " from %d I think president: %d", - nodeId, cpresident); - goto die_direct; + goto retry; } - if (president_reply != RNIL && president_reply != cpresident) + Uint32 president = conf->masterNodeId; + if (president == cpresident) { jam(); - BaseString::snprintf(buf, sizeof(buf), - "Partitioned cluster! check StartPartialTimeout, " - " received CM_REGREF from %d specifying president as" - " %d, president: %d", - nodeId, president_reply, cpresident); - goto die_direct; + c_readnodes_nodes.clear(nodeId); + return; } - - return true; -die_direct: + char buf[255]; + BaseString::snprintf(buf, sizeof(buf), + "Partitioned cluster! check StartPartialTimeout, " + " node %d thinks %d is president, " + " I think president is: %d", + nodeId, president, cpresident); + ndbout_c(buf); + CRASH_INSERTION(933); + + if (getNodeState().startLevel == NodeState::SL_STARTED) + { + jam(); + NdbNodeBitmask part; + part.assign(NdbNodeBitmask::Size, conf->clusterNodes); + FailRep* rep = (FailRep*)signal->getDataPtrSend(); + rep->failCause = FailRep::ZPARTITIONED_CLUSTER; + rep->president = cpresident; + c_clusterNodes.copyto(NdbNodeBitmask::Size, rep->partition); + Uint32 ref = calcQmgrBlockRef(nodeId); + Uint32 i = 0; + while((i = part.find(i + 1)) != NdbNodeBitmask::NotFound) + { + if (i == nodeId) + continue; + rep->failNodeId = i; + sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength, JBA); + } + rep->failNodeId = nodeId; + sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength, JBB); + return; + } + CRASH_INSERTION(932); progError(__LINE__, @@ -899,12 +857,6 @@ void Qmgr::execCM_REGREF(Signal* signal) Uint32 candidate = signal->theData[3]; DEBUG_START3(signal, TrefuseReason); - if (check_cmregreq_reply(signal, TaddNodeno, GSN_CM_REGREF)) - { - jam(); - return; - } - c_regReqReqRecv++; // Ignore block reference in data[0] @@ -2069,7 +2021,7 @@ void Qmgr::execDISCONNECT_REP(Signal* signal) const DisconnectRep * const rep = 
(DisconnectRep *)&signal->theData[0]; const Uint32 nodeId = rep->nodeId; c_connectedNodes.clear(nodeId); - c_cmregreq_nodes.clear(nodeId); + c_readnodes_nodes.clear(nodeId); NodeRecPtr nodePtr; nodePtr.i = getOwnNodeId(); @@ -2342,13 +2294,16 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode, failedNodePtr.i = aFailedNode; ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); + FailRep* rep = (FailRep*)signal->getDataPtr(); check_multi_node_shutdown(signal); if (failedNodePtr.i == getOwnNodeId()) { jam(); + Uint32 code = 0; const char * msg = 0; + char extra[100]; switch(aFailCause){ case FailRep::ZOWN_FAILURE: msg = "Own failure"; @@ -2369,17 +2324,46 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode, case FailRep::ZLINK_FAILURE: msg = "Connection failure"; break; + case FailRep::ZPARTITIONED_CLUSTER: + { + code = ERR_ARBIT_SHUTDOWN; + char buf1[100], buf2[100]; + c_clusterNodes.getText(buf1); + if (signal->getLength()== FailRep::SignalLength + FailRep::ExtraLength && + signal->header.theVerId_signalNumber == GSN_FAIL_REP) + { + jam(); + NdbNodeBitmask part; + part.assign(NdbNodeBitmask::Size, rep->partition); + part.getText(buf2); + BaseString::snprintf(extra, sizeof(extra), + "Partitioned cluster!" + " Our cluster: %s other cluster: %s", + buf1, buf2); + } + else + { + jam(); + BaseString::snprintf(extra, sizeof(extra), + "Partitioned cluster!" + " Our cluster: %s ", buf1); + } + msg = extra; + break; + } } - char buf[100]; - BaseString::snprintf(buf, 100, + CRASH_INSERTION(932); + + char buf[255]; + BaseString::snprintf(buf, sizeof(buf), "We(%u) have been declared dead by %u reason: %s(%u)", getOwnNodeId(), refToNode(signal->getSendersBlockRef()), aFailCause, msg ? 
msg : "<Unknown>"); - - progError(__LINE__, 0, buf); + + progError(__LINE__, code, buf); return; }//if diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp index 5f577b77f34..bdf0069aa26 100644 --- a/ndb/test/ndbapi/testNodeRestart.cpp +++ b/ndb/test/ndbapi/testNodeRestart.cpp @@ -753,13 +753,13 @@ runBug18612(NDBT_Context* ctx, NDBT_Step* step){ if (restarter.dumpStateAllNodes(dump, 2)) return NDBT_FAILED; - if (restarter.waitClusterNoStart()) + if (restarter.waitNodesNoStart(partition0, cnt/2)) return NDBT_FAILED; for (Uint32 i = 0; i<cnt/2; i++) if (restarter.restartOneDbNode(partition0[i], true, true, true)) return NDBT_FAILED; - + if (restarter.waitNodesNoStart(partition0, cnt/2)) return NDBT_FAILED; |