diff options
author | unknown <jonas@perch.ndb.mysql.com> | 2006-09-26 13:19:00 +0200 |
---|---|---|
committer | unknown <jonas@perch.ndb.mysql.com> | 2006-09-26 13:19:00 +0200 |
commit | 4b6e6da6819af7c77f7bfd62c45cc17407e7c343 (patch) | |
tree | b72065fd03276dfc0afa7789a3d1b5a6a2ecb8e1 /ndb | |
parent | 931af3194bb618a9435b7b9ca0f6996ad4e629f8 (diff) | |
download | mariadb-git-4b6e6da6819af7c77f7bfd62c45cc17407e7c343.tar.gz |
ndb - bug#20895
Fix occational LCP hang!!!
Make sure only to consider alive nodes in startNextChkpt
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
Make sure only to consider alive nodes in startNextChkpt
Diffstat (limited to 'ndb')
-rw-r--r-- | ndb/src/kernel/blocks/dbdih/DbdihMain.cpp | 131 |
1 files changed, 71 insertions, 60 deletions
diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index 02ec5782c3e..4b37bb03783 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -9561,73 +9561,84 @@ void Dbdih::startNextChkpt(Signal* signal) nodePtr.i = replicaPtr.p->procNode; ptrCheckGuard(nodePtr, MAX_NDB_NODES, nodeRecord); - if (replicaPtr.p->lcpOngoingFlag && - replicaPtr.p->lcpIdStarted < lcpId) { - jam(); - //------------------------------------------------------------------- - // We have found a replica on a node that performs local checkpoint - // that is alive and that have not yet been started. - //------------------------------------------------------------------- - - if (nodePtr.p->noOfStartedChkpt < 2) { - jam(); - /** - * Send LCP_FRAG_ORD to LQH - */ - - /** - * Mark the replica so with lcpIdStarted == true - */ - replicaPtr.p->lcpIdStarted = lcpId; - - Uint32 i = nodePtr.p->noOfStartedChkpt; - nodePtr.p->startedChkpt[i].tableId = tabPtr.i; - nodePtr.p->startedChkpt[i].fragId = curr.fragmentId; - nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i; - nodePtr.p->noOfStartedChkpt = i + 1; - - sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]); - } else if (nodePtr.p->noOfQueuedChkpt < 2) { - jam(); - /** - * Put LCP_FRAG_ORD "in queue" - */ - - /** - * Mark the replica so with lcpIdStarted == true - */ - replicaPtr.p->lcpIdStarted = lcpId; + if (c_lcpState.m_participatingLQH.get(nodePtr.i)) + { + if (replicaPtr.p->lcpOngoingFlag && + replicaPtr.p->lcpIdStarted < lcpId) + { + jam(); + //------------------------------------------------------------------- + // We have found a replica on a node that performs local checkpoint + // that is alive and that have not yet been started. + //------------------------------------------------------------------- - Uint32 i = nodePtr.p->noOfQueuedChkpt; - nodePtr.p->queuedChkpt[i].tableId = tabPtr.i; - nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId; - nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i; - nodePtr.p->noOfQueuedChkpt = i + 1; - } else { - jam(); + if (nodePtr.p->noOfStartedChkpt < 2) + { + jam(); + /** + * Send LCP_FRAG_ORD to LQH + */ + + /** + * Mark the replica so with lcpIdStarted == true + */ + replicaPtr.p->lcpIdStarted = lcpId; - if(save){ + Uint32 i = nodePtr.p->noOfStartedChkpt; + nodePtr.p->startedChkpt[i].tableId = tabPtr.i; + nodePtr.p->startedChkpt[i].fragId = curr.fragmentId; + nodePtr.p->startedChkpt[i].replicaPtr = replicaPtr.i; + nodePtr.p->noOfStartedChkpt = i + 1; + + sendLCP_FRAG_ORD(signal, nodePtr.p->startedChkpt[i]); + } + else if (nodePtr.p->noOfQueuedChkpt < 2) + { + jam(); /** - * Stop increasing value on first that was "full" + * Put LCP_FRAG_ORD "in queue" */ - c_lcpState.currentFragment = curr; - save = false; - } - - busyNodes.set(nodePtr.i); - if(busyNodes.count() == lcpNodes){ + /** - * There were no possibility to start the local checkpoint - * and it was not possible to queue it up. In this case we - * stop the start of local checkpoints until the nodes with a - * backlog have performed more checkpoints. We will return and - * will not continue the process of starting any more checkpoints. + * Mark the replica so with lcpIdStarted == true */ - return; + replicaPtr.p->lcpIdStarted = lcpId; + + Uint32 i = nodePtr.p->noOfQueuedChkpt; + nodePtr.p->queuedChkpt[i].tableId = tabPtr.i; + nodePtr.p->queuedChkpt[i].fragId = curr.fragmentId; + nodePtr.p->queuedChkpt[i].replicaPtr = replicaPtr.i; + nodePtr.p->noOfQueuedChkpt = i + 1; + } + else + { + jam(); + + if(save) + { + /** + * Stop increasing value on first that was "full" + */ + c_lcpState.currentFragment = curr; + save = false; + } + + busyNodes.set(nodePtr.i); + if(busyNodes.count() == lcpNodes) + { + /** + * There were no possibility to start the local checkpoint + * and it was not possible to queue it up. In this case we + * stop the start of local checkpoints until the nodes with a + * backlog have performed more checkpoints. We will return and + * will not continue the process of starting any more checkpoints. + */ + return; + }//if }//if - }//if - } - }//while + } + }//while + } curr.fragmentId++; if (curr.fragmentId >= tabPtr.p->totalfragments) { jam(); |