diff options
author | unknown <tomas@poseidon.mysql.com> | 2007-02-14 11:05:38 +0700 |
---|---|---|
committer | unknown <tomas@poseidon.mysql.com> | 2007-02-14 11:05:38 +0700 |
commit | 9ed1b8434603d69eb0646013f721a402a624833c (patch) | |
tree | 65e7394b7a4c7108107b12deca6d998f6bb8712c /ndb | |
parent | 2ed7eaf56424485e475ef7f1c430b1881ac02fc2 (diff) | |
download | mariadb-git-9ed1b8434603d69eb0646013f721a402a624833c.tar.gz |
Bug#26293 cluster mgmt node sometimes doesn't receive events from all nodes on restart
- signals where sometimes sent too early when setting up subscriptions
ndb/include/kernel/signaldata/DumpStateOrd.hpp:
added dump for active subscriptions in cmvmi
ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp:
added dump for active subscriptions in cmvmi
ndb/src/mgmsrv/MgmtSrvr.cpp:
bug in that signals where sent prior to api reg conf arrived, causing thrown away signals and subsequent hangs in mgmtserver
also add retry if node connected but not yet received api reg conf
ndb/src/ndbapi/ClusterMgr.cpp:
added status variable m_api_reg_conf in cluster manager to correctly be able to determine if a node is sendable
ndb/src/ndbapi/ClusterMgr.hpp:
added status variable m_api_reg_conf in cluster manager to correctly be able to determine if a node is sendable
ndb/src/ndbapi/SignalSender.cpp:
assert to see that node is sendable when signal is sent
ndb/src/ndbapi/SignalSender.hpp:
manke metchd const
Diffstat (limited to 'ndb')
-rw-r--r-- | ndb/include/kernel/signaldata/DumpStateOrd.hpp | 4 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp | 20 | ||||
-rw-r--r-- | ndb/src/mgmsrv/MgmtSrvr.cpp | 94 | ||||
-rw-r--r-- | ndb/src/ndbapi/ClusterMgr.cpp | 5 | ||||
-rw-r--r-- | ndb/src/ndbapi/ClusterMgr.hpp | 1 | ||||
-rw-r--r-- | ndb/src/ndbapi/SignalSender.cpp | 2 | ||||
-rw-r--r-- | ndb/src/ndbapi/SignalSender.hpp | 2 |
7 files changed, 105 insertions, 23 deletions
diff --git a/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/ndb/include/kernel/signaldata/DumpStateOrd.hpp index 8d0961d1c27..5a1d9ece9cf 100644 --- a/ndb/include/kernel/signaldata/DumpStateOrd.hpp +++ b/ndb/include/kernel/signaldata/DumpStateOrd.hpp @@ -107,6 +107,10 @@ public: CmvmiDumpLongSignalMemory = 2601, CmvmiSetRestartOnErrorInsert = 2602, CmvmiTestLongSigWithDelay = 2603, + CmvmiDumpSubscriptions = 2604, /* note: done to respective outfile + to be able to debug if events + for some reason does not end up + in clusterlog */ // 7000 DIH // 7001 DIH // 7002 DIH diff --git a/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp b/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp index 5dd1e527dd2..a9d9c991ca3 100644 --- a/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp +++ b/ndb/src/kernel/blocks/cmvmi/Cmvmi.cpp @@ -897,7 +897,7 @@ void Cmvmi::execSET_VAR_REQ(Signal* signal) case TimeToWaitAlive: // QMGR - case HeartbeatIntervalDbDb: // TODO ev till Ndbcnt också + case HeartbeatIntervalDbDb: // TODO possibly Ndbcnt too case HeartbeatIntervalDbApi: case ArbitTimeout: sendSignal(QMGR_REF, GSN_SET_VAR_REQ, signal, 3, JBB); @@ -1105,6 +1105,24 @@ Cmvmi::execDUMP_STATE_ORD(Signal* signal) } } + if (arg == DumpStateOrd::CmvmiDumpSubscriptions) + { + SubscriberPtr ptr; + subscribers.first(ptr); + g_eventLogger.info("List subscriptions:"); + while(ptr.i != RNIL) + { + g_eventLogger.info("Subscription: %u, nodeId: %u, ref: 0x%x", + ptr.i, refToNode(ptr.p->blockRef), ptr.p->blockRef); + for(Uint32 i = 0; i < LogLevel::LOGLEVEL_CATEGORIES; i++) + { + Uint32 level = ptr.p->logLevel.getLogLevel((LogLevel::EventCategory)i); + g_eventLogger.info("Category %u Level %u", i, level); + } + subscribers.next(ptr); + } + } + if (arg == DumpStateOrd::CmvmiDumpLongSignalMemory){ infoEvent("Cmvmi: g_sectionSegmentPool size: %d free: %d", g_sectionSegmentPool.getSize(), diff --git a/ndb/src/mgmsrv/MgmtSrvr.cpp b/ndb/src/mgmsrv/MgmtSrvr.cpp index 0ee59f70885..5818e7fe3ae 100644 --- a/ndb/src/mgmsrv/MgmtSrvr.cpp +++ b/ndb/src/mgmsrv/MgmtSrvr.cpp @@ -704,7 +704,7 @@ int MgmtSrvr::okToSendTo(NodeId nodeId, bool unCond) return WRONG_PROCESS_TYPE; // Check if we have contact with it if(unCond){ - if(theFacade->theClusterMgr->getNodeInfo(nodeId).connected) + if(theFacade->theClusterMgr->getNodeInfo(nodeId).m_api_reg_conf) return 0; } else if (theFacade->get_node_alive(nodeId) == true) @@ -1562,32 +1562,85 @@ MgmtSrvr::status(int nodeId, } int -MgmtSrvr::setEventReportingLevelImpl(int nodeId, +MgmtSrvr::setEventReportingLevelImpl(int nodeId_arg, const EventSubscribeReq& ll) { SignalSender ss(theFacade); - ss.lock(); - - SimpleSignal ssig; - EventSubscribeReq * dst = - CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend()); - ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ, - EventSubscribeReq::SignalLength); - *dst = ll; - - NodeBitmask nodes; + NdbNodeBitmask nodes; + int retries = 30; nodes.clear(); - Uint32 max = (nodeId == 0) ? (nodeId = 1, MAX_NDB_NODES) : nodeId; - for(; (Uint32) nodeId <= max; nodeId++) + while (1) { - if (nodeTypes[nodeId] != NODE_TYPE_DB) - continue; - if (okToSendTo(nodeId, true)) - continue; - if (ss.sendSignal(nodeId, &ssig) == SEND_OK) + Uint32 nodeId, max; + ss.lock(); + SimpleSignal ssig; + EventSubscribeReq * dst = + CAST_PTR(EventSubscribeReq, ssig.getDataPtrSend()); + ssig.set(ss,TestOrd::TraceAPI, CMVMI, GSN_EVENT_SUBSCRIBE_REQ, + EventSubscribeReq::SignalLength); + *dst = ll; + + if (nodeId_arg == 0) { - nodes.set(nodeId); + // all nodes + nodeId = 1; + max = MAX_NDB_NODES; + } + else + { + // only one node + max = nodeId = nodeId_arg; + } + // first make sure nodes are sendable + for(; nodeId <= max; nodeId++) + { + if (nodeTypes[nodeId] != NODE_TYPE_DB) + continue; + if (okToSendTo(nodeId, true)) + { + if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false) + { + // node not connected we can safely skip this one + continue; + } + // api_reg_conf not recevied yet, need to retry + break; + } + } + if (nodeId <= max) + { + if (--retries) + { + ss.unlock(); + NdbSleep_MilliSleep(100); + continue; + } + return SEND_OR_RECEIVE_FAILED; + } + + if (nodeId_arg == 0) + { + // all nodes + nodeId = 1; + max = MAX_NDB_NODES; + } + else + { + // only one node + max = nodeId = nodeId_arg; } + // now send to all sendable nodes nodes + // note, lock is held, so states have not changed + for(; (Uint32) nodeId <= max; nodeId++) + { + if (nodeTypes[nodeId] != NODE_TYPE_DB) + continue; + if (theFacade->theClusterMgr->getNodeInfo(nodeId).connected == false) + continue; // node is not connected, skip + if (ss.sendSignal(nodeId, &ssig) == SEND_OK) + nodes.set(nodeId); + } + break; } if (nodes.isclear()) @@ -1598,6 +1651,7 @@ MgmtSrvr::setEventReportingLevelImpl(int nodeId, int error = 0; while (!nodes.isclear()) { + Uint32 nodeId; SimpleSignal *signal = ss.waitFor(); int gsn = signal->readSignalNumber(); nodeId = refToNode(signal->header.theSendersBlockRef); diff --git a/ndb/src/ndbapi/ClusterMgr.cpp b/ndb/src/ndbapi/ClusterMgr.cpp index 2ff27ca893e..060e5f71b6c 100644 --- a/ndb/src/ndbapi/ClusterMgr.cpp +++ b/ndb/src/ndbapi/ClusterMgr.cpp @@ -327,7 +327,7 @@ ClusterMgr::showState(NodeId nodeId){ ClusterMgr::Node::Node() : m_state(NodeState::SL_NOTHING) { compatible = nfCompleteRep = true; - connected = defined = m_alive = false; + connected = defined = m_alive = m_api_reg_conf = false; m_state.m_connected_nodes.clear(); } @@ -401,6 +401,8 @@ ClusterMgr::execAPI_REGCONF(const Uint32 * theData){ node.m_info.m_version); } + node.m_api_reg_conf = true; + node.m_state = apiRegConf->nodeState; if (node.compatible && (node.m_state.startLevel == NodeState::SL_STARTED || node.m_state.startLevel == NodeState::SL_SINGLEUSER)){ @@ -519,6 +521,7 @@ ClusterMgr::reportDisconnected(NodeId nodeId){ noOfConnectedNodes--; theNodes[nodeId].connected = false; + theNodes[nodeId].m_api_reg_conf = false; theNodes[nodeId].m_state.m_connected_nodes.clear(); reportNodeFailed(nodeId, true); diff --git a/ndb/src/ndbapi/ClusterMgr.hpp b/ndb/src/ndbapi/ClusterMgr.hpp index 32234a0b2f4..b05b73c8324 100644 --- a/ndb/src/ndbapi/ClusterMgr.hpp +++ b/ndb/src/ndbapi/ClusterMgr.hpp @@ -65,6 +65,7 @@ public: bool compatible; // Version is compatible bool nfCompleteRep; // NF Complete Rep has arrived bool m_alive; // Node is alive + bool m_api_reg_conf;// API_REGCONF has arrived NodeInfo m_info; NodeState m_state; diff --git a/ndb/src/ndbapi/SignalSender.cpp b/ndb/src/ndbapi/SignalSender.cpp index 804ea92877d..199c6d6e804 100644 --- a/ndb/src/ndbapi/SignalSender.cpp +++ b/ndb/src/ndbapi/SignalSender.cpp @@ -140,6 +140,8 @@ SignalSender::getNoOfConnectedNodes() const { SendStatus SignalSender::sendSignal(Uint16 nodeId, const SimpleSignal * s){ + assert(getNodeInfo(nodeId).m_api_reg_conf == true || + s->readSignalNumber() == GSN_API_REGREQ); return theFacade->theTransporterRegistry->prepareSend(&s->header, 1, // JBB &s->theData[0], diff --git a/ndb/src/ndbapi/SignalSender.hpp b/ndb/src/ndbapi/SignalSender.hpp index ec874e63c52..4cad759a334 100644 --- a/ndb/src/ndbapi/SignalSender.hpp +++ b/ndb/src/ndbapi/SignalSender.hpp @@ -32,7 +32,7 @@ public: Uint32 theData[25]; LinearSectionPtr ptr[3]; - int readSignalNumber() {return header.theVerId_signalNumber; } + int readSignalNumber() const {return header.theVerId_signalNumber; } Uint32 *getDataPtrSend() { return theData; } const Uint32 *getDataPtr() const { return theData; } |