summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorjonas@perch.ndb.mysql.com <>2006-09-26 13:19:00 +0200
committerjonas@perch.ndb.mysql.com <>2006-09-26 13:19:00 +0200
commit72765abfb8b266a83f42713f5c5e88dedb58ff7c (patch)
treeb72065fd03276dfc0afa7789a3d1b5a6a2ecb8e1
parent0d360045503a7633f61e31afea9b9e2032153173 (diff)
downloadmariadb-git-72765abfb8b266a83f42713f5c5e88dedb58ff7c.tar.gz
ndb - bug#20895
Fix occational LCP hang!!! Make sure only to consider alive nodes in startNextChkpt
-rw-r--r--ndb/src/kernel/blocks/dbdih/DbdihMain.cpp131
1 files changed, 71 insertions, 60 deletions
diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
index 02ec5782c3e..4b37bb03783 100644
--- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
+++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
@@ -9561,73 +9561,84 @@ void Dbdih::startNextChkpt(Signal* signal)
nodePtr.i = replicaPtr.p->procNode;
ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
- if (replicaPtr.p->lcpOngoingFlag &&
- replicaPtr.p->lcpIdStarted < lcpId) {
- jam();
- //-------------------------------------------------------------------
- // We have found a replica on a node that performs local checkpoint
- // that is alive and that have not yet been started.
- //-------------------------------------------------------------------
-
- if (nodePtr.p->noOfStartedChkpt < 2) {
- jam();
- /**
- * Send LCP_FRAG_ORD to LQH
- */
-
- /**
- * Mark the replica so with lcpIdStarted == true
- */
- replicaPtr.p->lcpIdStarted = lcpId;
-
- Uint32 i = nodePtr.p->noOfStartedChkpt;
- nodePtr.p->startedChkpt[i].tableId = tabPtr.i;
- nodePtr.p->startedChkpt[i].fragId = curr.fragmentId;
- nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i;
- nodePtr.p->noOfStartedChkpt = i + 1;
-
- sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);
- } else if (nodePtr.p->noOfQueuedChkpt < 2) {
- jam();
- /**
- * Put LCP_FRAG_ORD "in queue"
- */
-
- /**
- * Mark the replica so with lcpIdStarted == true
- */
- replicaPtr.p->lcpIdStarted = lcpId;
+ if (c_lcpState.m_participatingLQH.get(nodePtr.i))
+ {
+ if (replicaPtr.p->lcpOngoingFlag &&
+ replicaPtr.p->lcpIdStarted < lcpId)
+ {
+ jam();
+ //-------------------------------------------------------------------
+ // We have found a replica on a node that performs local checkpoint
+ // that is alive and that have not yet been started.
+ //-------------------------------------------------------------------
- Uint32 i = nodePtr.p->noOfQueuedChkpt;
- nodePtr.p->queuedChkpt[i].tableId = tabPtr.i;
- nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId;
- nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i;
- nodePtr.p->noOfQueuedChkpt = i + 1;
- } else {
- jam();
+ if (nodePtr.p->noOfStartedChkpt < 2)
+ {
+ jam();
+ /**
+ * Send LCP_FRAG_ORD to LQH
+ */
+
+ /**
+ * Mark the replica so with lcpIdStarted == true
+ */
+ replicaPtr.p->lcpIdStarted = lcpId;
- if(save){
+ Uint32 i = nodePtr.p->noOfStartedChkpt;
+ nodePtr.p->startedChkpt[i].tableId = tabPtr.i;
+ nodePtr.p->startedChkpt[i].fragId = curr.fragmentId;
+ nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i;
+ nodePtr.p->noOfStartedChkpt = i + 1;
+
+ sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);
+ }
+ else if (nodePtr.p->noOfQueuedChkpt < 2)
+ {
+ jam();
/**
- * Stop increasing value on first that was "full"
+ * Put LCP_FRAG_ORD "in queue"
*/
- c_lcpState.currentFragment = curr;
- save = false;
- }
-
- busyNodes.set(nodePtr.i);
- if(busyNodes.count() == lcpNodes){
+
/**
- * There were no possibility to start the local checkpoint
- * and it was not possible to queue it up. In this case we
- * stop the start of local checkpoints until the nodes with a
- * backlog have performed more checkpoints. We will return and
- * will not continue the process of starting any more checkpoints.
+ * Mark the replica so with lcpIdStarted == true
*/
- return;
+ replicaPtr.p->lcpIdStarted = lcpId;
+
+ Uint32 i = nodePtr.p->noOfQueuedChkpt;
+ nodePtr.p->queuedChkpt[i].tableId = tabPtr.i;
+ nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId;
+ nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i;
+ nodePtr.p->noOfQueuedChkpt = i + 1;
+ }
+ else
+ {
+ jam();
+
+ if(save)
+ {
+ /**
+ * Stop increasing value on first that was "full"
+ */
+ c_lcpState.currentFragment = curr;
+ save = false;
+ }
+
+ busyNodes.set(nodePtr.i);
+ if(busyNodes.count() == lcpNodes)
+ {
+ /**
+ * There were no possibility to start the local checkpoint
+ * and it was not possible to queue it up. In this case we
+ * stop the start of local checkpoints until the nodes with a
+ * backlog have performed more checkpoints. We will return and
+ * will not continue the process of starting any more checkpoints.
+ */
+ return;
+ }//if
}//if
- }//if
- }
- }//while
+ }
+ }//while
+ }
curr.fragmentId++;
if (curr.fragmentId >= tabPtr.p->totalfragments) {
jam();