ndb - bug#20895

Fix occasional LCP hang!!! Make sure to consider only alive nodes in startNextChkpt. ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: Make sure to consider only alive nodes in startNextChkpt
author: unknown <jonas@perch.ndb.mysql.com> 2006-09-26 13:19:00 +0200
committer: unknown <jonas@perch.ndb.mysql.com> 2006-09-26 13:19:00 +0200
commit: 4b6e6da6819af7c77f7bfd62c45cc17407e7c343 (patch)
tree: b72065fd03276dfc0afa7789a3d1b5a6a2ecb8e1 /ndb
parent: 931af3194bb618a9435b7b9ca0f6996ad4e629f8 (diff)
download: mariadb-git-4b6e6da6819af7c77f7bfd62c45cc17407e7c343.tar.gz
1 files changed, 71 insertions, 60 deletions
diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
index 02ec5782c3e..4b37bb03783 100644
--- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
+++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
@@ -9561,73 +9561,84 @@ void Dbdih::startNextChkpt(Signal* signal)
       nodePtr.i = replicaPtr.p->procNode;
       ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord);
       
-      if (replicaPtr.p->lcpOngoingFlag &&
-          replicaPtr.p->lcpIdStarted < lcpId) {
-        jam();
-	//-------------------------------------------------------------------
-	// We have found a replica on a node that performs local checkpoint
-	// that is alive and that have not yet been started.
-	//-------------------------------------------------------------------
-
-        if (nodePtr.p->noOfStartedChkpt < 2) {
-          jam();
-	  /**
-	   * Send LCP_FRAG_ORD to LQH
-	   */
-	  
-	  /**
-	   * Mark the replica so with lcpIdStarted == true
-	   */
-          replicaPtr.p->lcpIdStarted = lcpId;
-
-          Uint32 i = nodePtr.p->noOfStartedChkpt;
-          nodePtr.p->startedChkpt[i].tableId = tabPtr.i;
-          nodePtr.p->startedChkpt[i].fragId = curr.fragmentId;
-          nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i;
-          nodePtr.p->noOfStartedChkpt = i + 1;
-
-	  sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);
-        } else if (nodePtr.p->noOfQueuedChkpt < 2) {
-          jam();
-	  /**
-	   * Put LCP_FRAG_ORD "in queue"
-	   */
-	  
-	  /**
-	   * Mark the replica so with lcpIdStarted == true
-	   */
-          replicaPtr.p->lcpIdStarted = lcpId;
+      if (c_lcpState.m_participatingLQH.get(nodePtr.i))
+      {
+	if (replicaPtr.p->lcpOngoingFlag &&
+	    replicaPtr.p->lcpIdStarted < lcpId) 
+	{
+	  jam();
+	  //-------------------------------------------------------------------
+	  // We have found a replica on a node that performs local checkpoint
+	  // that is alive and that have not yet been started.
+	  //-------------------------------------------------------------------
 	  
-          Uint32 i = nodePtr.p->noOfQueuedChkpt;
-          nodePtr.p->queuedChkpt[i].tableId = tabPtr.i;
-          nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId;
-          nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i;
-          nodePtr.p->noOfQueuedChkpt = i + 1;
-        } else {
-          jam();
+	  if (nodePtr.p->noOfStartedChkpt < 2) 
+	  {
+	    jam();
+	    /**
+	     * Send LCP_FRAG_ORD to LQH
+	     */
+	    
+	    /**
+	     * Mark the replica so with lcpIdStarted == true
+	     */
+	    replicaPtr.p->lcpIdStarted = lcpId;
 
-	  if(save){
+	    Uint32 i = nodePtr.p->noOfStartedChkpt;
+	    nodePtr.p->startedChkpt[i].tableId = tabPtr.i;
+	    nodePtr.p->startedChkpt[i].fragId = curr.fragmentId;
+	    nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i;
+	    nodePtr.p->noOfStartedChkpt = i + 1;
+	    
+	    sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]);
+	  } 
+	  else if (nodePtr.p->noOfQueuedChkpt < 2) 
+	  {
+	    jam();
 	    /**
-	     * Stop increasing value on first that was "full"
+	     * Put LCP_FRAG_ORD "in queue"
 	     */
-	    c_lcpState.currentFragment = curr;
-	    save = false;
-	  }
-	  
-	  busyNodes.set(nodePtr.i);
-	  if(busyNodes.count() == lcpNodes){
+	    
 	    /**
-	     * There were no possibility to start the local checkpoint 
-	     * and it was not possible to queue it up. In this case we 
-	     * stop the start of local checkpoints until the nodes with a 
-	     * backlog have performed more checkpoints. We will return and 
-	     * will not continue the process of starting any more checkpoints.
+	     * Mark the replica so with lcpIdStarted == true
 	     */
-	    return;
+	    replicaPtr.p->lcpIdStarted = lcpId;
+	    
+	    Uint32 i = nodePtr.p->noOfQueuedChkpt;
+	    nodePtr.p->queuedChkpt[i].tableId = tabPtr.i;
+	    nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId;
+	    nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i;
+	    nodePtr.p->noOfQueuedChkpt = i + 1;
+	  } 
+	  else 
+	  {
+	    jam();
+	    
+	    if(save)
+	    {
+	      /**
+	       * Stop increasing value on first that was "full"
+	       */
+	      c_lcpState.currentFragment = curr;
+	      save = false;
+	    }
+	    
+	    busyNodes.set(nodePtr.i);
+	    if(busyNodes.count() == lcpNodes)
+	    {
+	      /**
+	       * There were no possibility to start the local checkpoint 
+	       * and it was not possible to queue it up. In this case we 
+	       * stop the start of local checkpoints until the nodes with a 
+	       * backlog have performed more checkpoints. We will return and 
+	       * will not continue the process of starting any more checkpoints.
+	       */
+	      return;
+	    }//if
 	  }//if
-	}//if
-      }
-    }//while
+	}
+      }//while
+    }
     curr.fragmentId++;
     if (curr.fragmentId >= tabPtr.p->totalfragments) {
       jam();
author	unknown <jonas@perch.ndb.mysql.com>	2006-09-26 13:19:00 +0200
committer	unknown <jonas@perch.ndb.mysql.com>	2006-09-26 13:19:00 +0200
commit	4b6e6da6819af7c77f7bfd62c45cc17407e7c343 (patch)
tree	b72065fd03276dfc0afa7789a3d1b5a6a2ecb8e1 /ndb
parent	931af3194bb618a9435b7b9ca0f6996ad4e629f8 (diff)
download	mariadb-git-4b6e6da6819af7c77f7bfd62c45cc17407e7c343.tar.gz