summaryrefslogtreecommitdiff
path: root/ndb
diff options
context:
space:
mode:
authorunknown <tomas@poseidon.mysql.com>2007-02-14 11:05:38 +0700
committerunknown <tomas@poseidon.mysql.com>2007-02-14 11:05:38 +0700
commit9ed1b8434603d69eb0646013f721a402a624833c (patch)
tree65e7394b7a4c7108107b12deca6d998f6bb8712c /ndb
parent2ed7eaf56424485e475ef7f1c430b1881ac02fc2 (diff)
downloadmariadb-git-9ed1b8434603d69eb0646013f721a402a624833c.tar.gz
Bug#26293 cluster mgmt node sometimes doesn't receive events from all nodes on restart
- signals were sometimes sent too early when setting up subscriptions ndb/include/kernel/signaldata/DumpStateOrd.hpp: added dump for active subscriptions in cmvmi ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp: added dump for active subscriptions in cmvmi ndb/src/mgmsrv/MgmtSrvr.cpp: fixed bug where signals were sent before API_REGCONF had arrived, causing signals to be thrown away and subsequent hangs in the management server; also added a retry if a node is connected but API_REGCONF has not yet been received ndb/src/ndbapi/ClusterMgr.cpp: added status variable m_api_reg_conf in cluster manager to be able to correctly determine whether a node is sendable ndb/src/ndbapi/ClusterMgr.hpp: added status variable m_api_reg_conf in cluster manager to be able to correctly determine whether a node is sendable ndb/src/ndbapi/SignalSender.cpp: added assert to check that the node is sendable when a signal is sent ndb/src/ndbapi/SignalSender.hpp: made method const
Diffstat (limited to 'ndb')
-rw-r--r--ndb/include/kernel/signaldata/DumpStateOrd.hpp4
-rw-r--r--ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp20
-rw-r--r--ndb/src/mgmsrv/MgmtSrvr.cpp94
-rw-r--r--ndb/src/ndbapi/ClusterMgr.cpp5
-rw-r--r--ndb/src/ndbapi/ClusterMgr.hpp1
-rw-r--r--ndb/src/ndbapi/SignalSender.cpp2
-rw-r--r--ndb/src/ndbapi/SignalSender.hpp2
7 files changed, 105 insertions, 23 deletions
diff --git a/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/ndb/include/kernel/signaldata/DumpStateOrd.hpp
index 8d0961d1c27..5a1d9ece9cf 100644
--- a/ndb/include/kernel/signaldata/DumpStateOrd.hpp
+++ b/ndb/include/kernel/signaldata/DumpStateOrd.hpp
@@ -107,6 +107,10 @@ public:
CmvmiDumpLongSignalMemory = 2601,
CmvmiSetRestartOnErrorInsert = 2602,
CmvmiTestLongSigWithDelay = 2603,
+ CmvmiDumpSubscriptions = 2604, /* note: done to respective outfile
+ to be able to debug if events
+ for some reason do not end up
+ in clusterlog */
// 7000 DIH
// 7001 DIH
// 7002 DIH
diff --git a/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp b/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp
index 5dd1e527dd2..a9d9c991ca3 100644
--- a/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp
+++ b/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp
@@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* signal)
case TimeToWaitAlive:
// QMGR
- case HeartbeatIntervalDbDb: // TODO ev till Ndbcnt också
+ case HeartbeatIntervalDbDb: // TODO possibly Ndbcnt too
case HeartbeatIntervalDbApi:
case ArbitTimeout:
sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB);
@@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal)
}
}
+ if (arg == DumpStateOrd::CmvmiDumpSubscriptions)
+ {
+ SubscriberPtr ptr;
+ subscribers.first(ptr);
+ g_eventLogger.info("List subscriptions:");
+ while(ptr.i != RNIL)
+ {
+ g_eventLogger.info("Subscription: %u, nodeId: %u, ref: 0x%x",
+ ptr.i, refToNode(ptr.p->blockRef), ptr.p->blockRef);
+ for(Uint32 i = 0; i < LogLevel::LOGLEVEL_CATEGORIES; i++)
+ {
+ Uint32 level = ptr.p->logLevel.getLogLevel((LogLevel::EventCategory)i);
+ g_eventLogger.info("Category %u Level %u", i, level);
+ }
+ subscribers.next(ptr);
+ }
+ }
+
if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){
infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d",
g_sectionSegmentPool.getSize(),
diff --git a/ndb/src/mgmsrv/MgmtSrvr.cpp b/ndb/src/mgmsrv/MgmtSrvr.cpp
index 0ee59f70885..5818e7fe3ae 100644
--- a/ndb/src/mgmsrv/MgmtSrvr.cpp
+++ b/ndb/src/mgmsrv/MgmtSrvr.cpp
@@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, bool unCond)
return WRONG_PROCESS_TYPE;
// Check if we have contact with it
if(unCond){
- if(theFacade->theClusterMgr->getNodeInfo(nodeId).connected)
+ if(theFacade->theClusterMgr->getNodeInfo(nodeId).m_api_reg_conf)
return 0;
}
else if (theFacade->get_node_alive(nodeId) == true)
@@ -1562,32 +1562,85 @@ MgmtSrvr::status(int nodeId,
}
int
-MgmtSrvr::setEventReportingLevelImpl(int nodeId,
+MgmtSrvr::setEventReportingLevelImpl(int nodeId_arg,
const EventSubscribeReq& ll)
{
SignalSender ss(theFacade);
- ss.lock();
-
- SimpleSignal ssig;
- EventSubscribeReq * dst =
- CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
- ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ,
- EventSubscribeReq::SignalLength);
- *dst = ll;
-
- NodeBitmask nodes;
+ NdbNodeBitmask nodes;
+ int retries = 30;
nodes.clear();
- Uint32 max = (nodeId == 0) ? (nodeId = 1, MAX_NDB_NODES) : nodeId;
- for(; (Uint32) nodeId <= max; nodeId++)
+ while (1)
{
- if (nodeTypes[nodeId] != NODE_TYPE_DB)
- continue;
- if (okToSendTo(nodeId, true))
- continue;
- if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
+ Uint32 nodeId, max;
+ ss.lock();
+ SimpleSignal ssig;
+ EventSubscribeReq * dst =
+ CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend());
+ ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ,
+ EventSubscribeReq::SignalLength);
+ *dst = ll;
+
+ if (nodeId_arg == 0)
{
- nodes.set(nodeId);
+ // all nodes
+ nodeId = 1;
+ max = MAX_NDB_NODES;
+ }
+ else
+ {
+ // only one node
+ max = nodeId = nodeId_arg;
+ }
+ // first make sure nodes are sendable
+ for(; nodeId <= max; nodeId++)
+ {
+ if (nodeTypes[nodeId] != NODE_TYPE_DB)
+ continue;
+ if (okToSendTo(nodeId, true))
+ {
+ if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
+ {
+ // node not connected we can safely skip this one
+ continue;
+ }
+ // api_reg_conf not received yet, need to retry
+ break;
+ }
+ }
+ if (nodeId <= max)
+ {
+ if (--retries)
+ {
+ ss.unlock();
+ NdbSleep_MilliSleep(100);
+ continue;
+ }
+ return SEND_OR_RECEIVE_FAILED;
+ }
+
+ if (nodeId_arg == 0)
+ {
+ // all nodes
+ nodeId = 1;
+ max = MAX_NDB_NODES;
+ }
+ else
+ {
+ // only one node
+ max = nodeId = nodeId_arg;
}
+ // now send to all sendable nodes
+ // note, lock is held, so states have not changed
+ for(; (Uint32) nodeId <= max; nodeId++)
+ {
+ if (nodeTypes[nodeId] != NODE_TYPE_DB)
+ continue;
+ if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false)
+ continue; // node is not connected, skip
+ if (ss.sendSignal(nodeId, &ssig) == SEND_OK)
+ nodes.set(nodeId);
+ }
+ break;
}
if (nodes.isclear())
@@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId,
int error = 0;
while (!nodes.isclear())
{
+ Uint32 nodeId;
SimpleSignal *signal = ss.waitFor();
int gsn = signal->readSignalNumber();
nodeId = refToNode(signal->header.theSendersBlockRef);
diff --git a/ndb/src/ndbapi/ClusterMgr.cpp b/ndb/src/ndbapi/ClusterMgr.cpp
index 2ff27ca893e..060e5f71b6c 100644
--- a/ndb/src/ndbapi/ClusterMgr.cpp
+++ b/ndb/src/ndbapi/ClusterMgr.cpp
@@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){
ClusterMgr::Node::Node()
: m_state(NodeState::SL_NOTHING) {
compatible = nfCompleteRep = true;
- connected = defined = m_alive = false;
+ connected = defined = m_alive = m_api_reg_conf = false;
m_state.m_connected_nodes.clear();
}
@@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){
node.m_info.m_version);
}
+ node.m_api_reg_conf = true;
+
node.m_state = apiRegConf->nodeState;
if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED ||
node.m_state.startLevel == NodeState::SL_SINGLEUSER)){
@@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId nodeId){
noOfConnectedNodes--;
theNodes[nodeId].connected = false;
+ theNodes[nodeId].m_api_reg_conf = false;
theNodes[nodeId].m_state.m_connected_nodes.clear();
reportNodeFailed(nodeId, true);
diff --git a/ndb/src/ndbapi/ClusterMgr.hpp b/ndb/src/ndbapi/ClusterMgr.hpp
index 32234a0b2f4..b05b73c8324 100644
--- a/ndb/src/ndbapi/ClusterMgr.hpp
+++ b/ndb/src/ndbapi/ClusterMgr.hpp
@@ -65,6 +65,7 @@ public:
bool compatible; // Version is compatible
bool nfCompleteRep; // NF Complete Rep has arrived
bool m_alive; // Node is alive
+ bool m_api_reg_conf;// API_REGCONF has arrived
NodeInfo m_info;
NodeState m_state;
diff --git a/ndb/src/ndbapi/SignalSender.cpp b/ndb/src/ndbapi/SignalSender.cpp
index 804ea92877d..199c6d6e804 100644
--- a/ndb/src/ndbapi/SignalSender.cpp
+++ b/ndb/src/ndbapi/SignalSender.cpp
@@ -140,6 +140,8 @@ SignalSender::getNoOfConnectedNodes() const {
SendStatus
SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){
+ assert(getNodeInfo(nodeId).m_api_reg_conf == true ||
+ s->readSignalNumber() == GSN_API_REGREQ);
return theFacade->theTransporterRegistry->prepareSend(&s->header,
1, // JBB
&s->theData[0],
diff --git a/ndb/src/ndbapi/SignalSender.hpp b/ndb/src/ndbapi/SignalSender.hpp
index ec874e63c52..4cad759a334 100644
--- a/ndb/src/ndbapi/SignalSender.hpp
+++ b/ndb/src/ndbapi/SignalSender.hpp
@@ -32,7 +32,7 @@ public:
Uint32 theData[25];
LinearSectionPtr ptr[3];
- int readSignalNumber() {return header.theVerId_signalNumber; }
+ int readSignalNumber() const {return header.theVerId_signalNumber; }
Uint32 *getDataPtrSend() { return theData; }
const Uint32 *getDataPtr() const { return theData; }