summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- ndb/include/kernel/signaldata/FailRep.hpp 11
-rw-r--r-- ndb/src/kernel/blocks/qmgr/Qmgr.hpp 7
-rw-r--r-- ndb/src/kernel/blocks/qmgr/QmgrInit.cpp 3
-rw-r--r-- ndb/src/kernel/blocks/qmgr/QmgrMain.cpp 226
-rw-r--r-- ndb/test/ndbapi/testNodeRestart.cpp 4
5 files changed, 124 insertions, 127 deletions
diff --git a/ndb/include/kernel/signaldata/FailRep.hpp b/ndb/include/kernel/signaldata/FailRep.hpp
index b1c16294e70..f575d99e865 100644
--- a/ndb/include/kernel/signaldata/FailRep.hpp
+++ b/ndb/include/kernel/signaldata/FailRep.hpp
@@ -36,7 +36,8 @@ class FailRep {
public:
STATIC_CONST( SignalLength = 2 );
-
+ STATIC_CONST( ExtraLength = 1 + NdbNodeBitmask::Size );
+
enum FailCause {
ZOWN_FAILURE=0,
ZOTHER_NODE_WHEN_WE_START=1,
@@ -45,13 +46,19 @@ public:
ZHEARTBEAT_FAILURE=4,
ZLINK_FAILURE=5,
ZOTHERNODE_FAILED_DURING_START=6,
- ZMULTI_NODE_SHUTDOWN = 7
+ ZMULTI_NODE_SHUTDOWN = 7,
+ ZPARTITIONED_CLUSTER = 8
};
private:
Uint32 failNodeId;
Uint32 failCause;
+ /**
+ * Used when failCause == ZPARTITIONED_CLUSTER
+ */
+ Uint32 president;
+ Uint32 partition[NdbNodeBitmask::Size];
};
diff --git a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
index 3b623b36206..07e6a2a10c1 100644
--- a/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
+++ b/ndb/src/kernel/blocks/qmgr/Qmgr.hpp
@@ -124,7 +124,7 @@ public:
*
* i.e. nodes that connect to use, when we already have elected president
*/
- NdbNodeBitmask c_cmregreq_nodes;
+ NdbNodeBitmask c_readnodes_nodes;
Uint32 c_maxDynamicId;
@@ -233,6 +233,8 @@ private:
void execREAD_NODESREQ(Signal* signal);
void execSET_VAR_REQ(Signal* signal);
+ void execREAD_NODESREF(Signal* signal);
+ void execREAD_NODESCONF(Signal* signal);
void execAPI_VERSION_REQ(Signal* signal);
void execAPI_BROADCAST_REP(Signal* signal);
@@ -249,6 +251,8 @@ private:
void execARBIT_STOPREP(Signal* signal);
// Statement blocks
+ void check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
+
void node_failed(Signal* signal, Uint16 aFailedNode);
void checkStartInterface(Signal* signal);
void failReport(Signal* signal,
@@ -268,7 +272,6 @@ private:
void startphase1(Signal* signal);
void electionWon(Signal* signal);
void cmInfoconf010Lab(Signal* signal);
- bool check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn);
void apiHbHandlingLab(Signal* signal);
void timerHandlingLab(Signal* signal);
diff --git a/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp b/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
index ade880b7e4a..a8fe30d8cfa 100644
--- a/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
+++ b/ndb/src/kernel/blocks/qmgr/QmgrInit.cpp
@@ -94,6 +94,9 @@ Qmgr::Qmgr(const class Configuration & conf)
addRecSignal(GSN_ARBIT_CHOOSEREF, &Qmgr::execARBIT_CHOOSEREF);
addRecSignal(GSN_ARBIT_STOPREP, &Qmgr::execARBIT_STOPREP);
+ addRecSignal(GSN_READ_NODESREF, &Qmgr::execREAD_NODESREF);
+ addRecSignal(GSN_READ_NODESCONF, &Qmgr::execREAD_NODESCONF);
+
initData();
}//Qmgr::Qmgr()
diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
index 03f6fa2ae87..c17922dff48 100644
--- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
+++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp
@@ -369,13 +369,29 @@ void Qmgr::execCONNECT_REP(Signal* signal)
}
ndbrequire(!c_start.m_nodes.isWaitingFor(nodeId));
- ndbrequire(!c_cmregreq_nodes.get(nodeId));
- c_cmregreq_nodes.set(nodeId);
- sendCmRegReq(signal, nodeId);
- c_regReqReqSent--;
+ ndbrequire(!c_readnodes_nodes.get(nodeId));
+ c_readnodes_nodes.set(nodeId);
+ signal->theData[0] = reference();
+ sendSignal(calcQmgrBlockRef(nodeId), GSN_READ_NODESREQ, signal, 1, JBA);
return;
}//Qmgr::execCONNECT_REP()
+void
+Qmgr::execREAD_NODESCONF(Signal* signal)
+{
+  // READ_NODESCONF handler: pass the sender's node id to the shared reply check.
+  const Uint32 senderNodeId = refToNode(signal->getSendersBlockRef());
+  check_readnodes_reply(signal, senderNodeId, GSN_READ_NODESCONF);
+}
+
+void
+Qmgr::execREAD_NODESREF(Signal* signal)
+{
+  // READ_NODESREF handler: pass the sender's node id to the shared reply check.
+  const Uint32 senderNodeId = refToNode(signal->getSendersBlockRef());
+  check_readnodes_reply(signal, senderNodeId, GSN_READ_NODESREF);
+}
+
/*******************************/
/* CM_INFOCONF */
/*******************************/
@@ -668,12 +684,6 @@ void Qmgr::execCM_REGCONF(Signal* signal)
const CmRegConf * const cmRegConf = (CmRegConf *)&signal->theData[0];
Uint32 presidentNodeId = cmRegConf->presidentNodeId;
- if (check_cmregreq_reply(signal, presidentNodeId, GSN_CM_REGCONF))
- {
- jam();
- return;
- }
-
if (!ndbCompatible_ndb_ndb(NDB_VERSION, cmRegConf->presidentVersion)) {
jam();
char buf[128];
@@ -731,8 +741,8 @@ void Qmgr::execCM_REGCONF(Signal* signal)
return;
}//Qmgr::execCM_REGCONF()
-bool
-Qmgr::check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn)
+void
+Qmgr::check_readnodes_reply(Signal* signal, Uint32 nodeId, Uint32 gsn)
{
NodeRecPtr myNodePtr;
myNodePtr.i = getOwnNodeId();
@@ -741,117 +751,65 @@ Qmgr::check_cmregreq_reply(Signal* signal, Uint32 nodeId, Uint32 gsn)
NodeRecPtr nodePtr;
nodePtr.i = nodeId;
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRec);
-
- /**
- * Try to decide if replying node
- * knows who is president
- */
- Uint32 president_reply = RNIL;
- switch(gsn){
- case GSN_CM_REGREF:{
- jam();
- CmRegRef* ref = (CmRegRef*)signal->getDataPtr();
- switch(ref->errorCode){
- case CmRegRef::ZBUSY:
- case CmRegRef::ZBUSY_PRESIDENT:
- case CmRegRef::ZBUSY_TO_PRES:
- jam();
- /**
- * Only president replies this
- */
- ndbrequire(nodeId == ref->presidentCandidate);
- president_reply = nodeId;
- break;
- case CmRegRef::ZNOT_PRESIDENT:
- jam();
- president_reply = ref->presidentCandidate;
- break;
- case CmRegRef::ZNOT_IN_CFG:
- case CmRegRef::ZNOT_DEAD:
- case CmRegRef::ZELECTION:
- // Neither of these replies give certain president knowledge
- jam();
- }
- break;
- }
- case GSN_CM_REGCONF:
- jam();
- president_reply = nodeId;
- break;
- }
-
- char buf[256];
- switch(c_start.m_gsn){
- case GSN_CM_REGREQ:
- jam();
- ndbrequire(c_start.m_nodes.isWaitingFor(nodeId));
- ndbrequire(c_cmregreq_nodes.isclear());
- ndbrequire(myNodePtr.p->phase == ZSTARTING);
- return false;
- case GSN_CM_NODEINFOREQ:
- jam();
- ndbrequire(myNodePtr.p->phase == ZSTARTING);
- if (c_start.m_nodes.isWaitingFor(nodeId))
- {
- jam();
- /**
- * We're waiting for CM_NODEINFO
- */
- if (gsn == GSN_CM_REGREF)
- {
- jam();
- return false;
- }
-
- jam();
- BaseString::snprintf(buf, sizeof(buf),
- "Partitioned cluster! check StartPartialTimeout, "
- " received CM_REGCONF from %d"
- " while waiting for GSN_CM_NODEINFOCONF."
- " president=%d",
- nodeId, cpresident);
- goto die_direct;
- }
-
- goto check_reply;
- default:
- case GSN_CM_NODEINFOCONF:
+ ndbrequire(c_readnodes_nodes.get(nodeId));
+ ReadNodesConf* conf = (ReadNodesConf*)signal->getDataPtr();
+ if (gsn == GSN_READ_NODESREF)
+ {
jam();
- ndbrequire(myNodePtr.p->phase == ZRUNNING);
- goto check_reply;
+retry:
+ signal->theData[0] = reference();
+ sendSignal(calcQmgrBlockRef(nodeId), GSN_READ_NODESREQ, signal, 1, JBA);
+ return;
}
-check_reply:
- jam();
- c_cmregreq_nodes.clear(nodeId);
-
- if (gsn == GSN_CM_REGCONF)
+ if (conf->masterNodeId == ZNIL)
{
jam();
- BaseString::snprintf(buf, sizeof(buf),
- "Partitioned cluster! check StartPartialTimeout, "
- " received CM_REGCONF"
- " from %d I think president: %d",
- nodeId, cpresident);
- goto die_direct;
+ goto retry;
}
- if (president_reply != RNIL && president_reply != cpresident)
+ Uint32 president = conf->masterNodeId;
+ if (president == cpresident)
{
jam();
- BaseString::snprintf(buf, sizeof(buf),
- "Partitioned cluster! check StartPartialTimeout, "
- " received CM_REGREF from %d specifying president as"
- " %d, president: %d",
- nodeId, president_reply, cpresident);
- goto die_direct;
+ c_readnodes_nodes.clear(nodeId);
+ return;
}
-
- return true;
-die_direct:
+ char buf[255];
+ BaseString::snprintf(buf, sizeof(buf),
+ "Partitioned cluster! check StartPartialTimeout, "
+ " node %d thinks %d is president, "
+ " I think president is: %d",
+ nodeId, president, cpresident);
+
ndbout_c(buf);
+ CRASH_INSERTION(933);
+
+  if (getNodeState().startLevel == NodeState::SL_STARTED)
+  {
+    jam();
+    NdbNodeBitmask part; // node set reported by the replying node in READ_NODESCONF
+    part.assign(NdbNodeBitmask::Size, conf->clusterNodes);
+    FailRep* rep = (FailRep*)signal->getDataPtrSend();
+    rep->failCause = FailRep::ZPARTITIONED_CLUSTER;
+    rep->president = cpresident;
+    c_clusterNodes.copyto(NdbNodeBitmask::Size, rep->partition);
+    Uint32 ref = calcQmgrBlockRef(nodeId);
+    Uint32 i = 0;
+    while((i = part.find(i + 1)) != NdbNodeBitmask::NotFound)
+    {
+      if (i == nodeId)
+        continue; // the replying node itself is reported after the loop
+      rep->failNodeId = i;
+      sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength + FailRep::ExtraLength, JBA);
+    }
+    rep->failNodeId = nodeId; // ExtraLength carries president + partition; receiver checks for it
+    sendSignal(ref, GSN_FAIL_REP, signal, FailRep::SignalLength + FailRep::ExtraLength, JBB);
+    return;
+  }
+
CRASH_INSERTION(932);
progError(__LINE__,
@@ -899,12 +857,6 @@ void Qmgr::execCM_REGREF(Signal* signal)
Uint32 candidate = signal->theData[3];
DEBUG_START3(signal, TrefuseReason);
- if (check_cmregreq_reply(signal, TaddNodeno, GSN_CM_REGREF))
- {
- jam();
- return;
- }
-
c_regReqReqRecv++;
// Ignore block reference in data[0]
@@ -2069,7 +2021,7 @@ void Qmgr::execDISCONNECT_REP(Signal* signal)
const DisconnectRep * const rep = (DisconnectRep *)&signal->theData[0];
const Uint32 nodeId = rep->nodeId;
c_connectedNodes.clear(nodeId);
- c_cmregreq_nodes.clear(nodeId);
+ c_readnodes_nodes.clear(nodeId);
NodeRecPtr nodePtr;
nodePtr.i = getOwnNodeId();
@@ -2342,13 +2294,16 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
failedNodePtr.i = aFailedNode;
ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec);
+ FailRep* rep = (FailRep*)signal->getDataPtr();
check_multi_node_shutdown(signal);
if (failedNodePtr.i == getOwnNodeId()) {
jam();
+ Uint32 code = 0;
const char * msg = 0;
+ char extra[100];
switch(aFailCause){
case FailRep::ZOWN_FAILURE:
msg = "Own failure";
@@ -2369,17 +2324,46 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode,
case FailRep::ZLINK_FAILURE:
msg = "Connection failure";
break;
+ case FailRep::ZPARTITIONED_CLUSTER:
+ {
+ code = ERR_ARBIT_SHUTDOWN;
+ char buf1[100], buf2[100];
+ c_clusterNodes.getText(buf1);
+ if (signal->getLength()== FailRep::SignalLength + FailRep::ExtraLength &&
+ signal->header.theVerId_signalNumber == GSN_FAIL_REP)
+ {
+ jam();
+ NdbNodeBitmask part;
+ part.assign(NdbNodeBitmask::Size, rep->partition);
+ part.getText(buf2);
+ BaseString::snprintf(extra, sizeof(extra),
+ "Partitioned cluster!"
+ " Our cluster: %s other cluster: %s",
+ buf1, buf2);
+ }
+ else
+ {
+ jam();
+ BaseString::snprintf(extra, sizeof(extra),
+ "Partitioned cluster!"
+ " Our cluster: %s ", buf1);
+ }
+ msg = extra;
+ break;
+ }
}
- char buf[100];
- BaseString::snprintf(buf, 100,
+ CRASH_INSERTION(932);
+
+ char buf[255];
+ BaseString::snprintf(buf, sizeof(buf),
"We(%u) have been declared dead by %u reason: %s(%u)",
getOwnNodeId(),
refToNode(signal->getSendersBlockRef()),
- aFailCause,
- msg ? msg : "<Unknown>");
+ msg ? msg : "<Unknown>", // consumed by %s: textual reason
+ aFailCause);             // consumed by %u: numeric FailRep cause
-
- progError(__LINE__, 0, buf);
+
+ progError(__LINE__, code, buf);
return;
}//if
diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp
index 5f577b77f34..bdf0069aa26 100644
--- a/ndb/test/ndbapi/testNodeRestart.cpp
+++ b/ndb/test/ndbapi/testNodeRestart.cpp
@@ -753,13 +753,13 @@ runBug18612(NDBT_Context* ctx, NDBT_Step* step){
if (restarter.dumpStateAllNodes(dump, 2))
return NDBT_FAILED;
- if (restarter.waitClusterNoStart())
+ if (restarter.waitNodesNoStart(partition0, cnt/2))
return NDBT_FAILED;
for (Uint32 i = 0; i<cnt/2; i++)
if (restarter.restartOneDbNode(partition0[i], true, true, true))
return NDBT_FAILED;
-
+
if (restarter.waitNodesNoStart(partition0, cnt/2))
return NDBT_FAILED;