diff options
author | unknown <jonas@perch.ndb.mysql.com> | 2006-03-20 18:30:29 +0100 |
---|---|---|
committer | unknown <jonas@perch.ndb.mysql.com> | 2006-03-20 18:30:29 +0100 |
commit | 591aedaa2b594fdccca79a09c846cf4b4490f884 (patch) | |
tree | 8b6aa7a12a834356ce3c8db74ddafbda1713c74f | |
parent | 367442f754a97d87077c99bc5805b41da5ac7119 (diff) | |
parent | ad6dcfb1277b3b0a8692c3bfd802ba48cc3fe537 (diff) | |
download | mariadb-git-591aedaa2b594fdccca79a09c846cf4b4490f884.tar.gz |
Merge joreland@bk-internal.mysql.com:/home/bk/mysql-4.1-wl2610
into perch.ndb.mysql.com:/home/jonas/src/41-work
-rw-r--r-- | ndb/include/kernel/signaldata/TcContinueB.hpp | 3 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/dbdih/Dbdih.hpp | 3 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/dbdih/DbdihMain.cpp | 16 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/dblqh/DblqhMain.cpp | 166 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/dbtc/Dbtc.hpp | 27 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/dbtc/DbtcMain.cpp | 368 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/qmgr/QmgrMain.cpp | 101 | ||||
-rw-r--r-- | ndb/src/ndbapi/NdbConnection.cpp | 4 | ||||
-rw-r--r-- | ndb/src/ndbapi/Ndbif.cpp | 12 | ||||
-rw-r--r-- | ndb/src/ndbapi/TransporterFacade.cpp | 13 | ||||
-rw-r--r-- | ndb/src/ndbapi/TransporterFacade.hpp | 1 | ||||
-rw-r--r-- | ndb/test/ndbapi/testNodeRestart.cpp | 50 | ||||
-rw-r--r-- | ndb/test/ndbapi/testTimeout.cpp | 101 | ||||
-rw-r--r-- | ndb/test/run-test/daily-basic-tests.txt | 8 |
14 files changed, 651 insertions, 222 deletions
diff --git a/ndb/include/kernel/signaldata/TcContinueB.hpp b/ndb/include/kernel/signaldata/TcContinueB.hpp index 85213791b2a..b87b982e49b 100644 --- a/ndb/include/kernel/signaldata/TcContinueB.hpp +++ b/ndb/include/kernel/signaldata/TcContinueB.hpp @@ -44,7 +44,8 @@ private: CHECK_WAIT_DROP_TAB_FAILED_LQH = 16, TRIGGER_PENDING = 17, - DelayTCKEYCONF = 18 + DelayTCKEYCONF = 18, + ZNF_CHECK_TRANSACTIONS = 19 }; }; diff --git a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp index 0c107e35603..f74c0f36c4d 100644 --- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp +++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp @@ -1038,7 +1038,8 @@ private: void prepareReplicas(FragmentstorePtr regFragptr); void removeNodeFromStored(Uint32 nodeId, FragmentstorePtr regFragptr, - ReplicaRecordPtr replicaPtr); + ReplicaRecordPtr replicaPtr, + bool temporary); void removeOldStoredReplica(FragmentstorePtr regFragptr, ReplicaRecordPtr replicaPtr); void removeStoredReplica(FragmentstorePtr regFragptr, diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index 776e59ea495..fab428aadef 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -5212,6 +5212,7 @@ void Dbdih::removeNodeFromTable(Signal* signal, //const Uint32 lcpId = SYSFILE->latestLCP_ID; const bool lcpOngoingFlag = (tabPtr.p->tabLcpStatus== TabRecord::TLS_ACTIVE); + const bool temporary = !tabPtr.p->storedTable; FragmentstorePtr fragPtr; for(Uint32 fragNo = 0; fragNo < tabPtr.p->totalfragments; fragNo++){ @@ -5232,7 +5233,7 @@ void Dbdih::removeNodeFromTable(Signal* signal, jam(); found = true; noOfRemovedReplicas++; - removeNodeFromStored(nodeId, fragPtr, replicaPtr); + removeNodeFromStored(nodeId, fragPtr, replicaPtr, temporary); if(replicaPtr.p->lcpOngoingFlag){ jam(); /** @@ -12051,9 +12052,18 @@ void Dbdih::removeDeadNode(NodeRecordPtr removeNodePtr) /*---------------------------------------------------------------*/ void Dbdih::removeNodeFromStored(Uint32 nodeId, FragmentstorePtr fragPtr, - ReplicaRecordPtr replicatePtr) + ReplicaRecordPtr replicatePtr, + bool temporary) { - newCrashedReplica(nodeId, replicatePtr); + if (!temporary) + { + jam(); + newCrashedReplica(nodeId, replicatePtr); + } + else + { + jam(); + } removeStoredReplica(fragPtr, replicatePtr); linkOldStoredReplica(fragPtr, replicatePtr); ndbrequire(fragPtr.p->storedReplicas != RNIL); diff --git a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp index ff7e3c32924..0aeeaccd55e 100644 --- a/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp +++ b/ndb/src/kernel/blocks/dblqh/DblqhMain.cpp @@ -18448,6 +18448,172 @@ Dblqh::execDUMP_STATE_ORD(Signal* signal) c_error_insert_table_id = dumpState->args[1]; SET_ERROR_INSERT_VALUE(5042); } + + TcConnectionrec *regTcConnectionrec = tcConnectionrec; + Uint32 ttcConnectrecFileSize = ctcConnectrecFileSize; + Uint32 arg = dumpState->args[0]; + if(arg == 2306) + { + for(Uint32 i = 0; i<1024; i++) + { + TcConnectionrecPtr tcRec; + tcRec.i = ctransidHash[i]; + while(tcRec.i != RNIL) + { + ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec); + ndbout << "TcConnectionrec " << tcRec.i; + signal->theData[0] = 2307; + signal->theData[1] = tcRec.i; + execDUMP_STATE_ORD(signal); + tcRec.i = tcRec.p->nextHashRec; + } + } + } + + if(arg == 2307 || arg == 2308) + { + TcConnectionrecPtr tcRec; + tcRec.i = signal->theData[1]; + ptrCheckGuard(tcRec, ttcConnectrecFileSize, regTcConnectionrec); + + ndbout << " transactionState = " << tcRec.p->transactionState<<endl; + ndbout << " operation = " << tcRec.p->operation<<endl; + ndbout << " tcNodeFailrec = " << tcRec.p->tcNodeFailrec + << " seqNoReplica = " << tcRec.p->seqNoReplica + << " simpleRead = " << tcRec.p->simpleRead + << endl; + ndbout << " replicaType = " << tcRec.p->replicaType + << " reclenAiLqhkey = " << tcRec.p->reclenAiLqhkey + << " opExec = " << tcRec.p->opExec + << endl; + ndbout << " opSimple = " << tcRec.p->opSimple + << " nextSeqNoReplica = " << tcRec.p->nextSeqNoReplica + << " lockType = " << tcRec.p->lockType + << endl; + ndbout << " lastReplicaNo = " << tcRec.p->lastReplicaNo + << " indTakeOver = " << tcRec.p->indTakeOver + << " dirtyOp = " << tcRec.p->dirtyOp + << endl; + ndbout << " activeCreat = " << tcRec.p->activeCreat + << " tcBlockref = " << hex << tcRec.p->tcBlockref + << " reqBlockref = " << hex << tcRec.p->reqBlockref + << " primKeyLen = " << tcRec.p->primKeyLen + << endl; + ndbout << " nextReplica = " << tcRec.p->nextReplica + << " tcBlockref = " << hex << tcRec.p->tcBlockref + << " reqBlockref = " << hex << tcRec.p->reqBlockref + << " primKeyLen = " << tcRec.p->primKeyLen + << endl; + ndbout << " logStopPageNo = " << tcRec.p->logStopPageNo + << " logStartPageNo = " << tcRec.p->logStartPageNo + << " logStartPageIndex = " << tcRec.p->logStartPageIndex + << endl; + ndbout << " errorCode = " << tcRec.p->errorCode + << " clientBlockref = " << hex << tcRec.p->clientBlockref + << " applRef = " << hex << tcRec.p->applRef + << " totSendlenAi = " << tcRec.p->totSendlenAi + << endl; + ndbout << " totReclenAi = " << tcRec.p->totReclenAi + << " tcScanRec = " << tcRec.p->tcScanRec + << " tcScanInfo = " << tcRec.p->tcScanInfo + << " tcOprec = " << hex << tcRec.p->tcOprec + << endl; + ndbout << " tableref = " << tcRec.p->tableref + << " simpleTcConnect = " << tcRec.p->simpleTcConnect + << " storedProcId = " << tcRec.p->storedProcId + << " schemaVersion = " << tcRec.p->schemaVersion + << endl; + ndbout << " reqinfo = " << tcRec.p->reqinfo + << " reqRef = " << tcRec.p->reqRef + << " readlenAi = " << tcRec.p->readlenAi + << " prevTc = " << tcRec.p->prevTc + << endl; + ndbout << " prevLogTcrec = " << tcRec.p->prevLogTcrec + << " prevHashRec = " << tcRec.p->prevHashRec + << " nodeAfterNext0 = " << tcRec.p->nodeAfterNext[0] + << " nodeAfterNext1 = " << tcRec.p->nodeAfterNext[1] + << endl; + ndbout << " nextTcConnectrec = " << tcRec.p->nextTcConnectrec + << " nextTc = " << tcRec.p->nextTc + << " nextTcLogQueue = " << tcRec.p->nextTcLogQueue + << " nextLogTcrec = " << tcRec.p->nextLogTcrec + << endl; + ndbout << " nextHashRec = " << tcRec.p->nextHashRec + << " logWriteState = " << tcRec.p->logWriteState + << " logStartFileNo = " << tcRec.p->logStartFileNo + << " listState = " << tcRec.p->listState + << endl; + ndbout << " lastAttrinbuf = " << tcRec.p->lastAttrinbuf + << " lastTupkeybuf = " << tcRec.p->lastTupkeybuf + << " hashValue = " << tcRec.p->hashValue + << endl; + ndbout << " gci = " << tcRec.p->gci + << " fragmentptr = " << tcRec.p->fragmentptr + << " fragmentid = " << tcRec.p->fragmentid + << " firstTupkeybuf = " << tcRec.p->firstTupkeybuf + << endl; + ndbout << " firstAttrinbuf = " << tcRec.p->firstAttrinbuf + << " currTupAiLen = " << tcRec.p->currTupAiLen + << " currReclenAi = " << tcRec.p->currReclenAi + << endl; + ndbout << " tcTimer = " << tcRec.p->tcTimer + << " clientConnectrec = " << tcRec.p->clientConnectrec + << " applOprec = " << hex << tcRec.p->applOprec + << " abortState = " << tcRec.p->abortState + << endl; + ndbout << " transid0 = " << hex << tcRec.p->transid[0] + << " transid1 = " << hex << tcRec.p->transid[1] + << " tupkeyData0 = " << tcRec.p->tupkeyData[0] + << " tupkeyData1 = " << tcRec.p->tupkeyData[1] + << endl; + ndbout << " tupkeyData2 = " << tcRec.p->tupkeyData[2] + << " tupkeyData3 = " << tcRec.p->tupkeyData[3] + << endl; + switch (tcRec.p->transactionState) { + + case TcConnectionrec::SCAN_STATE_USED: + if (tcRec.p->tcScanRec < cscanrecFileSize){ + ScanRecordPtr TscanPtr; + c_scanRecordPool.getPtr(TscanPtr, tcRec.p->tcScanRec); + ndbout << " scanState = " << TscanPtr.p->scanState << endl; + //TscanPtr.p->scanLocalref[2]; + ndbout << " copyPtr="<<TscanPtr.p->copyPtr + << " scanAccPtr="<<TscanPtr.p->scanAccPtr + << " scanAiLength="<<TscanPtr.p->scanAiLength + << endl; + ndbout << " m_curr_batch_size_rows="<< + TscanPtr.p->m_curr_batch_size_rows + << " m_max_batch_size_rows="<< + TscanPtr.p->m_max_batch_size_rows + << " scanErrorCounter="<<TscanPtr.p->scanErrorCounter + << endl; + ndbout << " scanSchemaVersion="<<TscanPtr.p->scanSchemaVersion + << " scanStoredProcId="<<TscanPtr.p->scanStoredProcId + << " scanTcrec="<<TscanPtr.p->scanTcrec + << endl; + ndbout << " scanType="<<TscanPtr.p->scanType + << " scanApiBlockref="<<TscanPtr.p->scanApiBlockref + << " scanNodeId="<<TscanPtr.p->scanNodeId + << " scanCompletedStatus="<<TscanPtr.p->scanCompletedStatus + << endl; + ndbout << " scanFlag="<<TscanPtr.p->scanFlag + << " scanLockHold="<<TscanPtr.p->scanLockHold + << " scanLockMode="<<TscanPtr.p->scanLockMode + << " scanNumber="<<TscanPtr.p->scanNumber + << endl; + ndbout << " scanReleaseCounter="<<TscanPtr.p->scanReleaseCounter + << " scanTcWaiting="<<TscanPtr.p->scanTcWaiting + << " scanKeyinfoFlag="<<TscanPtr.p->scanKeyinfoFlag + << endl; + } else{ + ndbout << "No connected scan record found" << endl; + } + break; + default: + break; + } + ndbrequire(arg != 2308); + } }//Dblqh::execDUMP_STATE_ORD() diff --git a/ndb/src/kernel/blocks/dbtc/Dbtc.hpp b/ndb/src/kernel/blocks/dbtc/Dbtc.hpp index 61afef30b43..b1332a4fd0b 100644 --- a/ndb/src/kernel/blocks/dbtc/Dbtc.hpp +++ b/ndb/src/kernel/blocks/dbtc/Dbtc.hpp @@ -211,14 +211,6 @@ public: LTS_ACTIVE = 1 }; - enum TakeOverState { - TOS_NOT_DEFINED = 0, - TOS_IDLE = 1, - TOS_ACTIVE = 2, - TOS_COMPLETED = 3, - TOS_NODE_FAILED = 4 - }; - enum FailState { FS_IDLE = 0, FS_LISTENING = 1, @@ -636,6 +628,7 @@ public: ConnectionState apiConnectstate; UintR transid[2]; UintR firstTcConnect; + NdbNodeBitmask m_transaction_nodes; //--------------------------------------------------- // Second 16 byte cache line. Hot variables. @@ -932,7 +925,6 @@ public: struct HostRecord { HostState hostStatus; LqhTransState lqhTransStatus; - TakeOverState takeOverStatus; bool inPackedList; UintR noOfPackedWordsLqh; UintR packedWordsLqh[26]; @@ -941,6 +933,17 @@ public: UintR noOfWordsTCINDXCONF; UintR packedWordsTCINDXCONF[30]; BlockReference hostLqhBlockRef; + + enum NodeFailBits + { + NF_TAKEOVER = 0x1, + NF_CHECK_SCAN = 0x2, + NF_CHECK_TRANSACTION = 0x4, + NF_CHECK_DROP_TAB = 0x8, + NF_NODE_FAIL_BITS = 0xF // All bits... + }; + Uint32 m_nf_bits; + NdbNodeBitmask m_lqh_trans_conf; }; /* p2c: size = 128 bytes */ typedef Ptr<HostRecord> HostRecordPtr; @@ -1578,7 +1581,7 @@ private: void wrongSchemaVersionErrorLab(Signal* signal); void noFreeConnectionErrorLab(Signal* signal); void tckeyreq050Lab(Signal* signal); - void timeOutFoundLab(Signal* signal, UintR anAdd); + void timeOutFoundLab(Signal* signal, UintR anAdd, Uint32 errCode); void completeTransAtTakeOverLab(Signal* signal, UintR TtakeOverInd); void completeTransAtTakeOverDoLast(Signal* signal, UintR TtakeOverInd); void completeTransAtTakeOverDoOne(Signal* signal, UintR TtakeOverInd); @@ -1600,6 +1603,9 @@ private: void checkScanFragList(Signal*, Uint32 failedNodeId, ScanRecord * scanP, LocalDLList<ScanFragRec>::Head&); + void nodeFailCheckTransactions(Signal*,Uint32 transPtrI,Uint32 failedNodeId); + void checkNodeFailComplete(Signal* signal, Uint32 failedNodeId, Uint32 bit); + // Initialisation void initData(); void initRecords(); @@ -1626,6 +1632,7 @@ private: HostRecord *hostRecord; HostRecordPtr hostptr; UintR chostFilesize; + NdbNodeBitmask c_alive_nodes; GcpRecord *gcpRecord; GcpRecordPtr gcpPtr; diff --git a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp index d9d1f01b213..ff9b279592c 100644 --- a/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp +++ b/ndb/src/kernel/blocks/dbtc/DbtcMain.cpp @@ -262,6 +262,10 @@ void Dbtc::execCONTINUEB(Signal* signal) jam(); checkScanActiveInFailedLqh(signal, Tdata0, Tdata1); return; + case TcContinueB::ZNF_CHECK_TRANSACTIONS: + jam(); + nodeFailCheckTransactions(signal, Tdata0, Tdata1); + return; case TcContinueB::CHECK_WAIT_DROP_TAB_FAILED_LQH: jam(); checkWaitDropTabFailedLqh(signal, Tdata0, Tdata1); @@ -299,8 +303,8 @@ void Dbtc::execINCL_NODEREQ(Signal* signal) hostptr.i = signal->theData[1]; ptrCheckGuard(hostptr, chostFilesize, hostRecord); hostptr.p->hostStatus = HS_ALIVE; - hostptr.p->takeOverStatus = TOS_IDLE; signal->theData[0] = cownref; + c_alive_nodes.set(hostptr.i); sendSignal(tblockref, GSN_INCL_NODECONF, signal, 1, JBB); } @@ -487,6 +491,7 @@ Dbtc::checkWaitDropTabFailedLqh(Signal* signal, Uint32 nodeId, Uint32 tableId) * Finished */ jam(); + checkNodeFailComplete(signal, nodeId, HostRecord::NF_CHECK_DROP_TAB); return; } @@ -850,8 +855,6 @@ void Dbtc::execREAD_NODESCONF(Signal* signal) hostptr.i = i; ptrCheckGuard(hostptr, chostFilesize, hostRecord); - hostptr.p->takeOverStatus = TOS_IDLE; - if (NodeBitmask::get(readNodes->inactiveNodes, i)) { jam(); hostptr.p->hostStatus = HS_DEAD; @@ -859,6 +862,7 @@ void Dbtc::execREAD_NODESCONF(Signal* signal) jam(); con_lineNodes++; hostptr.p->hostStatus = HS_ALIVE; + c_alive_nodes.set(i); }//if }//if }//for @@ -2314,6 +2318,7 @@ void Dbtc::initApiConnectRec(Signal* signal, regApiPtr->commitAckMarker = RNIL; regApiPtr->buddyPtr = RNIL; regApiPtr->currSavePointId = 0; + regApiPtr->m_transaction_nodes.clear(); // Trigger data releaseFiredTriggerData(®ApiPtr->theFiredTriggers), // Index data @@ -2921,6 +2926,10 @@ void Dbtc::tckeyreq050Lab(Signal* signal) signal->theData[0] = TdihConnectptr; signal->theData[1] = Ttableref; signal->theData[2] = TdistrHashValue; + signal->theData[3] = 0; + signal->theData[4] = 0; + signal->theData[5] = 0; + signal->theData[6] = 0; /*-------------------------------------------------------------*/ /* FOR EFFICIENCY REASONS WE AVOID THE SIGNAL SENDING HERE AND */ @@ -3098,6 +3107,7 @@ void Dbtc::sendlqhkeyreq(Signal* signal, TcConnectRecord * const regTcPtr = tcConnectptr.p; ApiConnectRecord * const regApiPtr = apiConnectptr.p; CacheRecord * const regCachePtr = cachePtr.p; + UintR sig0, sig1, sig2, sig3, sig4, sig5, sig6; #ifdef ERROR_INSERT if (ERROR_INSERTED(8002)) { systemErrorLab(signal); @@ -3135,6 +3145,9 @@ void Dbtc::sendlqhkeyreq(Signal* signal, LqhKeyReq::setScanTakeOverFlag(tslrAttrLen, regCachePtr->scanTakeOverInd); Tdata10 = 0; + sig0 = regCachePtr->opSimple; + sig1 = regTcPtr->operation; + bool simpleRead = (sig1 == ZREAD && sig0 == ZTRUE); LqhKeyReq::setKeyLen(Tdata10, regCachePtr->keylen); LqhKeyReq::setLastReplicaNo(Tdata10, regTcPtr->lastReplicaNo); LqhKeyReq::setLockType(Tdata10, regCachePtr->opLock); @@ -3144,8 +3157,8 @@ void Dbtc::sendlqhkeyreq(Signal* signal, LqhKeyReq::setApplicationAddressFlag(Tdata10, 1); LqhKeyReq::setDirtyFlag(Tdata10, regTcPtr->dirtyOp); LqhKeyReq::setInterpretedFlag(Tdata10, regCachePtr->opExec); - LqhKeyReq::setSimpleFlag(Tdata10, regCachePtr->opSimple); - LqhKeyReq::setOperation(Tdata10, regTcPtr->operation); + LqhKeyReq::setSimpleFlag(Tdata10, sig0); + LqhKeyReq::setOperation(Tdata10, sig1); /* ----------------------------------------------------------------------- * Sequential Number of first LQH = 0, bit 22-23 * IF ATTRIBUTE INFORMATION IS SENT IN TCKEYREQ, @@ -3158,18 +3171,16 @@ void Dbtc::sendlqhkeyreq(Signal* signal, * ----------------------------------------------------------------------- */ //LqhKeyReq::setAPIVersion(Tdata10, regCachePtr->apiVersionNo); Uint32 commitAckMarker = regTcPtr->commitAckMarker; + const Uint32 noOfLqhs = regTcPtr->noOfNodes; if(commitAckMarker != RNIL){ jam(); - LqhKeyReq::setMarkerFlag(Tdata10, 1); - CommitAckMarker * tmp; - tmp = m_commitAckMarkerHash.getPtr(commitAckMarker); + CommitAckMarker * tmp = m_commitAckMarkerHash.getPtr(commitAckMarker); /** * Populate LQH array */ - const Uint32 noOfLqhs = regTcPtr->noOfNodes; tmp->noOfLqhs = noOfLqhs; for(Uint32 i = 0; i<noOfLqhs; i++){ tmp->lqhNodeId[i] = regTcPtr->tcNodedata[i]; @@ -3180,7 +3191,6 @@ void Dbtc::sendlqhkeyreq(Signal* signal, /* NO READ LENGTH SENT FROM TC. SEQUENTIAL NUMBER IS 1 AND IT */ /* IS SENT TO A PRIMARY NODE. */ /* ************************************************************> */ - UintR sig0, sig1, sig2, sig3, sig4, sig5, sig6; LqhKeyReq * const lqhKeyReq = (LqhKeyReq *)signal->getDataPtrSend(); @@ -3204,6 +3214,14 @@ void Dbtc::sendlqhkeyreq(Signal* signal, sig5 = regTcPtr->clientData; sig6 = regCachePtr->scanInfo; + if (! simpleRead) + { + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[0]); + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[1]); + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[2]); + regApiPtr->m_transaction_nodes.set(regTcPtr->tcNodedata[3]); + } + lqhKeyReq->tableSchemaVersion = sig0; lqhKeyReq->fragmentData = sig1; lqhKeyReq->transId1 = sig2; @@ -4587,6 +4605,7 @@ void Dbtc::copyApi(Signal* signal) UintR TgcpPointer = regTmpApiPtr->gcpPointer; UintR TgcpFilesize = cgcpFilesize; UintR TcommitAckMarker = regTmpApiPtr->commitAckMarker; + NdbNodeBitmask Tnodes = regTmpApiPtr->m_transaction_nodes; GcpRecord *localGcpRecord = gcpRecord; regApiPtr->ndbapiBlockref = regTmpApiPtr->ndbapiBlockref; @@ -4597,6 +4616,7 @@ void Dbtc::copyApi(Signal* signal) regApiPtr->transid[1] = Ttransid2; regApiPtr->lqhkeyconfrec = Tlqhkeyconfrec; regApiPtr->commitAckMarker = TcommitAckMarker; + regApiPtr->m_transaction_nodes = Tnodes; gcpPtr.i = TgcpPointer; ptrCheckGuard(gcpPtr, TgcpFilesize, localGcpRecord); @@ -4607,6 +4627,7 @@ void Dbtc::copyApi(Signal* signal) regTmpApiPtr->commitAckMarker = RNIL; regTmpApiPtr->firstTcConnect = RNIL; regTmpApiPtr->lastTcConnect = RNIL; + regTmpApiPtr->m_transaction_nodes.clear(); releaseAllSeizedIndexOperations(regTmpApiPtr); }//Dbtc::copyApi() @@ -4865,7 +4886,7 @@ void Dbtc::releaseTransResources(Signal* signal) TcConnectRecordPtr localTcConnectptr; UintR TtcConnectFilesize = ctcConnectFilesize; TcConnectRecord *localTcConnectRecord = tcConnectRecord; - + apiConnectptr.p->m_transaction_nodes.clear(); localTcConnectptr.i = apiConnectptr.p->firstTcConnect; do { jam(); @@ -5269,7 +5290,8 @@ void Dbtc::execTC_COMMITREQ(Signal* signal) break; case CS_ABORTING: jam(); - errorCode = ZABORTINPROGRESS; + errorCode = regApiPtr->returncode ? + regApiPtr->returncode : ZABORTINPROGRESS; break; case CS_START_SCAN: jam(); @@ -5808,9 +5830,9 @@ void Dbtc::abort010Lab(Signal* signal) if (transP->firstTcConnect == RNIL) { jam(); - /*-----------------------------------------------------------------------*/ - /* WE HAVE NO PARTICIPANTS IN THE TRANSACTION. */ - /*-----------------------------------------------------------------------*/ + /*--------------------------------------------------------------------*/ + /* WE HAVE NO PARTICIPANTS IN THE TRANSACTION. */ + /*--------------------------------------------------------------------*/ releaseAbortResources(signal); return; }//if @@ -6087,10 +6109,12 @@ void Dbtc::timeOutLoopStartLab(Signal* signal, Uint32 api_con_ptr) if (api_timer != 0) { time_out_value= time_out_param + (api_con_ptr & mask_value); time_passed= tc_timer - api_timer; - if (time_passed > time_out_value) { + if (time_passed > time_out_value) + { jam(); - timeOutFoundLab(signal, api_con_ptr); - return; + timeOutFoundLab(signal, api_con_ptr, ZTIME_OUT_ERROR); + api_con_ptr++; + break; } } } @@ -6110,10 +6134,8 @@ void Dbtc::timeOutLoopStartLab(Signal* signal, Uint32 api_con_ptr) return; }//Dbtc::timeOutLoopStartLab() -void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) +void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr, Uint32 errCode) { - sendContinueTimeOutControl(signal, TapiConPtr + 1); - apiConnectptr.i = TapiConPtr; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); /*------------------------------------------------------------------*/ @@ -6126,7 +6148,8 @@ void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) << "Time-out in state = " << apiConnectptr.p->apiConnectstate << " apiConnectptr.i = " << apiConnectptr.i << " - exec: " << apiConnectptr.p->m_exec_flag - << " - place: " << c_apiConTimer_line[apiConnectptr.i]); + << " - place: " << c_apiConTimer_line[apiConnectptr.i] + << " code: " << errCode); switch (apiConnectptr.p->apiConnectstate) { case CS_STARTED: if(apiConnectptr.p->lqhkeyreqrec == apiConnectptr.p->lqhkeyconfrec){ @@ -6143,7 +6166,7 @@ void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) }//if } apiConnectptr.p->returnsignal = RS_TCROLLBACKREP; - apiConnectptr.p->returncode = ZTIME_OUT_ERROR; + apiConnectptr.p->returncode = errCode; abort010Lab(signal); return; case CS_RECEIVING: @@ -6156,7 +6179,7 @@ void Dbtc::timeOutFoundLab(Signal* signal, Uint32 TapiConPtr) /* START ABORTING THE TRANSACTION. ALSO START CHECKING THE */ /* REMAINING TRANSACTIONS. */ /*------------------------------------------------------------------*/ - terrorCode = ZTIME_OUT_ERROR; + terrorCode = errCode; abortErrorLab(signal); return; case CS_COMMITTING: @@ -6800,58 +6823,44 @@ void Dbtc::execNODE_FAILREP(Signal* signal) const Uint32 tnewMasterId = nodeFail->masterNodeId; arrGuard(tnoOfNodes, MAX_NDB_NODES); + Uint32 i; int index = 0; - for (unsigned i = 1; i< MAX_NDB_NODES; i++) { - if(NodeBitmask::get(nodeFail->theNodes, i)){ + for (i = 1; i< MAX_NDB_NODES; i++) + { + if(NodeBitmask::get(nodeFail->theNodes, i)) + { cdata[index] = i; index++; }//if }//for + cmasterNodeId = tnewMasterId; + tcNodeFailptr.i = 0; ptrAss(tcNodeFailptr, tcFailRecord); - Uint32 tindex; - for (tindex = 0; tindex < tnoOfNodes; tindex++) { + for (i = 0; i < tnoOfNodes; i++) + { jam(); - hostptr.i = cdata[tindex]; + hostptr.i = cdata[i]; ptrCheckGuard(hostptr, chostFilesize, hostRecord); + /*------------------------------------------------------------*/ /* SET STATUS OF THE FAILED NODE TO DEAD SINCE IT HAS */ /* FAILED. */ /*------------------------------------------------------------*/ hostptr.p->hostStatus = HS_DEAD; + hostptr.p->m_nf_bits = HostRecord::NF_NODE_FAIL_BITS; + c_alive_nodes.clear(hostptr.i); - if (hostptr.p->takeOverStatus == TOS_COMPLETED) { - jam(); - /*------------------------------------------------------------*/ - /* A VERY UNUSUAL SITUATION. THE TAKE OVER WAS COMPLETED*/ - /* EVEN BEFORE WE HEARD ABOUT THE NODE FAILURE REPORT. */ - /* HOWEVER UNUSUAL THIS SITUATION IS POSSIBLE. */ - /*------------------------------------------------------------*/ - /* RELEASE THE CURRENTLY UNUSED LQH CONNECTIONS. THE */ - /* REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */ - /* USED THEM IS COMPLETED. */ - /*------------------------------------------------------------*/ - { - NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0]; - nfRep->blockNo = DBTC; - nfRep->nodeId = cownNodeid; - nfRep->failedNodeId = hostptr.i; - } - sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, - NFCompleteRep::SignalLength, JBB); - } else { - ndbrequire(hostptr.p->takeOverStatus == TOS_IDLE); - hostptr.p->takeOverStatus = TOS_NODE_FAILED; - }//if - - if (tcNodeFailptr.p->failStatus == FS_LISTENING) { + if (tcNodeFailptr.p->failStatus == FS_LISTENING) + { jam(); /*------------------------------------------------------------*/ /* THE CURRENT TAKE OVER CAN BE AFFECTED BY THIS NODE */ /* FAILURE. */ /*------------------------------------------------------------*/ - if (hostptr.p->lqhTransStatus == LTS_ACTIVE) { + if (hostptr.p->lqhTransStatus == LTS_ACTIVE) + { jam(); /*------------------------------------------------------------*/ /* WE WERE WAITING FOR THE FAILED NODE IN THE TAKE OVER */ @@ -6863,86 +6872,46 @@ void Dbtc::execNODE_FAILREP(Signal* signal) }//if }//if - }//for - - const bool masterFailed = (cmasterNodeId != tnewMasterId); - cmasterNodeId = tnewMasterId; - - if(getOwnNodeId() == cmasterNodeId && masterFailed){ - /** - * Master has failed and I'm the new master - */ - jam(); - - for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) { + if (getOwnNodeId() != tnewMasterId) + { jam(); - ptrAss(hostptr, hostRecord); - if (hostptr.p->hostStatus != HS_ALIVE) { - jam(); - if (hostptr.p->takeOverStatus == TOS_COMPLETED) { - jam(); - /*------------------------------------------------------------*/ - /* SEND TAKE OVER CONFIRMATION TO ALL ALIVE NODES IF */ - /* TAKE OVER IS COMPLETED. THIS IS PERFORMED TO ENSURE */ - /* THAT ALL NODES AGREE ON THE IDLE STATE OF THE TAKE */ - /* OVER. THIS MIGHT BE MISSED IN AN ERROR SITUATION IF */ - /* MASTER FAILS AFTER SENDING CONFIRMATION TO NEW */ - /* MASTER BUT FAILING BEFORE SENDING TO ANOTHER NODE */ - /* WHICH WAS NOT MASTER. IF THIS NODE LATER BECOMES */ - /* MASTER IT MIGHT START A NEW TAKE OVER EVEN AFTER THE */ - /* CRASHED NODE HAVE ALREADY RECOVERED. */ - /*------------------------------------------------------------*/ - for(tmpHostptr.i = 1; tmpHostptr.i < MAX_NDB_NODES;tmpHostptr.i++) { - jam(); - ptrAss(tmpHostptr, hostRecord); - if (tmpHostptr.p->hostStatus == HS_ALIVE) { - jam(); - tblockref = calcTcBlockRef(tmpHostptr.i); - signal->theData[0] = hostptr.i; - sendSignal(tblockref, GSN_TAKE_OVERTCCONF, signal, 1, JBB); - }//if - }//for - }//if - }//if - }//for - } - - if(getOwnNodeId() == cmasterNodeId){ - jam(); - for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) { + /** + * Only master does takeover currently + */ + hostptr.p->m_nf_bits &= ~HostRecord::NF_TAKEOVER; + } + else + { jam(); - ptrAss(hostptr, hostRecord); - if (hostptr.p->hostStatus != HS_ALIVE) { - jam(); - if (hostptr.p->takeOverStatus == TOS_NODE_FAILED) { - jam(); - /*------------------------------------------------------------*/ - /* CONCLUDE ALL ACTIVITIES THE FAILED TC DID CONTROL */ - /* SINCE WE ARE THE MASTER. THIS COULD HAVE BEEN STARTED*/ - /* BY A PREVIOUS MASTER BUT HAVE NOT BEEN CONCLUDED YET.*/ - /*------------------------------------------------------------*/ - hostptr.p->takeOverStatus = TOS_ACTIVE; - signal->theData[0] = hostptr.i; - sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB); - }//if - }//if - }//for - }//if - for (tindex = 0; tindex < tnoOfNodes; tindex++) { - jam(); - hostptr.i = cdata[tindex]; - ptrCheckGuard(hostptr, chostFilesize, hostRecord); - /*------------------------------------------------------------*/ - /* LOOP THROUGH AND ABORT ALL SCANS THAT WHERE */ - /* CONTROLLED BY THIS TC AND ACTIVE IN THE FAILED */ - /* NODE'S LQH */ - /*------------------------------------------------------------*/ + signal->theData[0] = hostptr.i; + sendSignal(cownref, GSN_TAKE_OVERTCREQ, signal, 1, JBB); + } + checkScanActiveInFailedLqh(signal, 0, hostptr.i); checkWaitDropTabFailedLqh(signal, hostptr.i, 0); // nodeid, tableid - }//for - + nodeFailCheckTransactions(signal, 0, hostptr.i); + } }//Dbtc::execNODE_FAILREP() +void +Dbtc::checkNodeFailComplete(Signal* signal, + Uint32 failedNodeId, + Uint32 bit) +{ + hostptr.i = failedNodeId; + ptrCheckGuard(hostptr, chostFilesize, hostRecord); + hostptr.p->m_nf_bits &= ~bit; + if (hostptr.p->m_nf_bits == 0) + { + NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0]; + nfRep->blockNo = DBTC; + nfRep->nodeId = cownNodeid; + nfRep->failedNodeId = hostptr.i; + sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, + NFCompleteRep::SignalLength, JBB); + } +} + void Dbtc::checkScanActiveInFailedLqh(Signal* signal, Uint32 scanPtrI, Uint32 failedNodeId){ @@ -6984,8 +6953,44 @@ void Dbtc::checkScanActiveInFailedLqh(Signal* signal, sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); return; }//for + + checkNodeFailComplete(signal, failedNodeId, HostRecord::NF_CHECK_SCAN); +} + +void +Dbtc::nodeFailCheckTransactions(Signal* signal, + Uint32 transPtrI, + Uint32 failedNodeId) +{ + jam(); + Ptr<ApiConnectRecord> transPtr; + for (transPtr.i = transPtrI; transPtr.i < capiConnectFilesize; transPtr.i++) + { + ptrCheckGuard(transPtr, capiConnectFilesize, apiConnectRecord); + if (transPtr.p->m_transaction_nodes.get(failedNodeId)) + { + jam(); + // Force timeout regardless of state + Uint32 save = c_appl_timeout_value; + c_appl_timeout_value = 1; + setApiConTimer(transPtr.i, 0, __LINE__); + timeOutFoundLab(signal, transPtr.i, ZNODEFAIL_BEFORE_COMMIT); + c_appl_timeout_value = save; + } + + // Send CONTINUEB to continue later + signal->theData[0] = TcContinueB::ZNF_CHECK_TRANSACTIONS; + signal->theData[1] = transPtr.i + 1; // Check next + signal->theData[2] = failedNodeId; + sendSignal(cownref, GSN_CONTINUEB, signal, 3, JBB); + return; + } + + checkNodeFailComplete(signal, failedNodeId, + HostRecord::NF_CHECK_TRANSACTION); } + void Dbtc::checkScanFragList(Signal* signal, Uint32 failedNodeId, @@ -7001,54 +7006,17 @@ void Dbtc::execTAKE_OVERTCCONF(Signal* signal) tfailedNodeId = signal->theData[0]; hostptr.i = tfailedNodeId; ptrCheckGuard(hostptr, chostFilesize, hostRecord); - switch (hostptr.p->takeOverStatus) { - case TOS_IDLE: - jam(); - /*------------------------------------------------------------*/ - /* THIS MESSAGE ARRIVED EVEN BEFORE THE NODE_FAILREP */ - /* MESSAGE. THIS IS POSSIBLE IN EXTREME SITUATIONS. */ - /* WE SET THE STATE TO TAKE_OVER_COMPLETED AND WAIT */ - /* FOR THE NODE_FAILREP MESSAGE. */ - /*------------------------------------------------------------*/ - hostptr.p->takeOverStatus = TOS_COMPLETED; - break; - case TOS_NODE_FAILED: - case TOS_ACTIVE: - jam(); - /*------------------------------------------------------------*/ - /* WE ARE NOT MASTER AND THE TAKE OVER IS ACTIVE OR WE */ - /* ARE MASTER AND THE TAKE OVER IS ACTIVE. IN BOTH */ - /* WE SET THE STATE TO TAKE_OVER_COMPLETED. */ - /*------------------------------------------------------------*/ - /* RELEASE THE CURRENTLY UNUSED LQH CONNECTIONS. THE */ - /* REMAINING WILL BE RELEASED WHEN THE TRANSACTION THAT */ - /* USED THEM IS COMPLETED. */ - /*------------------------------------------------------------*/ - hostptr.p->takeOverStatus = TOS_COMPLETED; - { - NFCompleteRep * const nfRep = (NFCompleteRep *)&signal->theData[0]; - nfRep->blockNo = DBTC; - nfRep->nodeId = cownNodeid; - nfRep->failedNodeId = hostptr.i; - } - sendSignal(cdihblockref, GSN_NF_COMPLETEREP, signal, - NFCompleteRep::SignalLength, JBB); - break; - case TOS_COMPLETED: - jam(); - /*------------------------------------------------------------*/ - /* WE HAVE ALREADY RECEIVED THE CONF SIGNAL. IT IS MOST */ - /* LIKELY SENT FROM A NEW MASTER WHICH WASN'T SURE IF */ - /* THIS NODE HEARD THE CONF SIGNAL FROM THE OLD MASTER. */ - /* WE SIMPLY IGNORE THE MESSAGE. */ - /*------------------------------------------------------------*/ - /*empty*/; - break; - default: + + ndbout_c("received execTAKE_OVERTCCONF(%d) from %x (%x)", + tfailedNodeId, signal->getSendersBlockRef(), reference()); + if (signal->getSendersBlockRef() != reference()) + { jam(); - systemErrorLab(signal); return; - }//switch + } + + + checkNodeFailComplete(signal, hostptr.i, HostRecord::NF_TAKEOVER); }//Dbtc::execTAKE_OVERTCCONF() void Dbtc::execTAKE_OVERTCREQ(Signal* signal) @@ -7288,16 +7256,10 @@ void Dbtc::completeTransAtTakeOverDoLast(Signal* signal, UintR TtakeOverInd) /* TO REPORT THE COMPLETION OF THE TAKE OVER TO ALL */ /* NODES THAT ARE ALIVE. */ /*------------------------------------------------------------*/ - for (hostptr.i = 1; hostptr.i < MAX_NDB_NODES; hostptr.i++) { - jam(); - ptrAss(hostptr, hostRecord); - if (hostptr.p->hostStatus == HS_ALIVE) { - jam(); - tblockref = calcTcBlockRef(hostptr.i); - signal->theData[0] = tcNodeFailptr.p->takeOverNode; - sendSignal(tblockref, GSN_TAKE_OVERTCCONF, signal, 1, JBB); - }//if - }//for + NodeReceiverGroup rg(DBTC, c_alive_nodes); + signal->theData[0] = tcNodeFailptr.p->takeOverNode; + sendSignal(rg, GSN_TAKE_OVERTCCONF, signal, 1, JBB); + if (tcNodeFailptr.p->queueIndex > 0) { jam(); /*------------------------------------------------------------*/ @@ -7979,6 +7941,7 @@ void Dbtc::initApiConnectFail(Signal* signal) apiConnectptr.p->ndbapiBlockref = 0; apiConnectptr.p->ndbapiConnect = 0; apiConnectptr.p->buddyPtr = RNIL; + apiConnectptr.p->m_transaction_nodes.clear(); setApiConTimer(apiConnectptr.i, 0, __LINE__); switch(ttransStatus){ case LqhTransConf::Committed: @@ -9756,6 +9719,7 @@ void Dbtc::initApiConnect(Signal* signal) apiConnectptr.p->executingIndexOp = RNIL; apiConnectptr.p->buddyPtr = RNIL; apiConnectptr.p->currSavePointId = 0; + apiConnectptr.p->m_transaction_nodes.clear(); }//for apiConnectptr.i = tiacTmp - 1; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); @@ -9783,6 +9747,7 @@ void Dbtc::initApiConnect(Signal* signal) apiConnectptr.p->executingIndexOp = RNIL; apiConnectptr.p->buddyPtr = RNIL; apiConnectptr.p->currSavePointId = 0; + apiConnectptr.p->m_transaction_nodes.clear(); }//for apiConnectptr.i = (2 * tiacTmp) - 1; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); @@ -9810,6 +9775,7 @@ void Dbtc::initApiConnect(Signal* signal) apiConnectptr.p->executingIndexOp = RNIL; apiConnectptr.p->buddyPtr = RNIL; apiConnectptr.p->currSavePointId = 0; + apiConnectptr.p->m_transaction_nodes.clear(); }//for apiConnectptr.i = (3 * tiacTmp) - 1; ptrCheckGuard(apiConnectptr, capiConnectFilesize, apiConnectRecord); @@ -9870,13 +9836,13 @@ void Dbtc::inithost(Signal* signal) ptrAss(hostptr, hostRecord); hostptr.p->hostStatus = HS_DEAD; hostptr.p->inPackedList = false; - hostptr.p->takeOverStatus = TOS_NOT_DEFINED; hostptr.p->lqhTransStatus = LTS_IDLE; hostptr.p->noOfWordsTCKEYCONF = 0; hostptr.p->noOfWordsTCINDXCONF = 0; hostptr.p->noOfPackedWordsLqh = 0; hostptr.p->hostLqhBlockRef = calcLqhBlockRef(hostptr.i); }//for + c_alive_nodes.clear(); }//Dbtc::inithost() void Dbtc::initialiseRecordsLab(Signal* signal, UintR Tdata0, @@ -10126,6 +10092,7 @@ void Dbtc::releaseAbortResources(Signal* signal) }//while apiConnectptr.p->firstTcConnect = RNIL; apiConnectptr.p->lastTcConnect = RNIL; + apiConnectptr.p->m_transaction_nodes.clear(); // MASV let state be CS_ABORTING until all // signals in the "air" have been received. Reset to CS_CONNECTED @@ -10199,6 +10166,7 @@ void Dbtc::releaseApiCon(Signal* signal, UintR TapiConnectPtr) cfirstfreeApiConnect = TlocalApiConnectptr.i; setApiConTimer(TlocalApiConnectptr.i, 0, __LINE__); TlocalApiConnectptr.p->apiConnectstate = CS_DISCONNECTED; + ndbassert(TlocalApiConnectptr.p->m_transaction_nodes.isclear()); ndbassert(TlocalApiConnectptr.p->apiScanRec == RNIL); TlocalApiConnectptr.p->ndbapiBlockref = 0; }//Dbtc::releaseApiCon() @@ -10734,6 +10702,34 @@ Dbtc::execDUMP_STATE_ORD(Signal* signal) c_theIndexOperationPool.getSize(), c_theIndexOperationPool.getNoOfFree()); } + + if (dumpState->args[0] == 2514) + { + if (signal->getLength() == 2) + { + dumpState->args[0] = DumpStateOrd::TcDumpOneApiConnectRec; + execDUMP_STATE_ORD(signal); + } + + NodeReceiverGroup rg(CMVMI, c_alive_nodes); + dumpState->args[0] = 15; + sendSignal(rg, GSN_DUMP_STATE_ORD, signal, 1, JBB); + + signal->theData[0] = 2515; + sendSignalWithDelay(cownref, GSN_DUMP_STATE_ORD, signal, 1000, 1); + return; + } + + if (dumpState->args[0] == 2515) + { + NdbNodeBitmask mask = c_alive_nodes; + mask.clear(getOwnNodeId()); + NodeReceiverGroup rg(NDBCNTR, mask); + + sendSignal(rg, GSN_SYSTEM_ERROR, signal, 1, JBB); + sendSignalWithDelay(cownref, GSN_SYSTEM_ERROR, signal, 300, 1); + return; + } }//Dbtc::execDUMP_STATE_ORD() void Dbtc::execSET_VAR_REQ(Signal* signal) diff --git a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp index 6095895e7c2..70084e6b171 100644 --- a/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp +++ b/ndb/src/kernel/blocks/qmgr/QmgrMain.cpp @@ -257,6 +257,7 @@ void Qmgr::setArbitTimeout(UintR aArbitTimeout) void Qmgr::execCONNECT_REP(Signal* signal) { + jamEntry(); const Uint32 nodeId = signal->theData[0]; c_connectedNodes.set(nodeId); NodeRecPtr nodePtr; @@ -264,9 +265,13 @@ void Qmgr::execCONNECT_REP(Signal* signal) ptrCheckGuard(nodePtr, MAX_NODES, nodeRec); switch(nodePtr.p->phase){ case ZSTARTING: + case ZRUNNING: jam(); + if(!c_start.m_nodes.isWaitingFor(nodeId)){ + jam(); + return; + } break; - case ZRUNNING: case ZPREPARE_FAIL: case ZFAIL_CLOSING: jam(); @@ -277,21 +282,28 @@ void Qmgr::execCONNECT_REP(Signal* signal) case ZAPI_INACTIVE: return; } - - if(!c_start.m_nodes.isWaitingFor(nodeId)){ - jam(); - return; - } - + switch(c_start.m_gsn){ case GSN_CM_REGREQ: jam(); sendCmRegReq(signal, nodeId); return; - case GSN_CM_NODEINFOREQ:{ + case GSN_CM_NODEINFOREQ: jam(); sendCmNodeInfoReq(signal, nodeId, nodePtr.p); return; + case GSN_CM_ADD:{ + jam(); + + ndbrequire(getOwnNodeId() != cpresident); + c_start.m_nodes.clearWaitingFor(nodeId); + c_start.m_gsn = RNIL; + + NodeRecPtr addNodePtr; + addNodePtr.i = nodeId; + ptrCheckGuard(addNodePtr, MAX_NDB_NODES, nodeRec); + cmAddPrepare(signal, addNodePtr, nodePtr.p); + return; } default: return; @@ -924,15 +936,27 @@ Qmgr::cmAddPrepare(Signal* signal, NodeRecPtr nodePtr, const NodeRec * self){ return; case ZFAIL_CLOSING: jam(); -#ifdef VM_TRACE - ndbout_c("Enabling communication to CM_ADD node state=%d", - nodePtr.p->phase); -#endif + +#if 1 + warningEvent("Recieved request to incorperate node %u, " + "while error handling has not yet completed", + nodePtr.i); + + ndbrequire(getOwnNodeId() != cpresident); + ndbrequire(signal->header.theVerId_signalNumber == GSN_CM_ADD); + c_start.m_nodes.clearWaitingFor(); + c_start.m_nodes.setWaitingFor(nodePtr.i); + c_start.m_gsn = GSN_CM_ADD; +#else + warningEvent("Enabling communication to CM_ADD node %u state=%d", + nodePtr.i, + nodePtr.p->phase); nodePtr.p->phase = ZSTARTING; nodePtr.p->failState = NORMAL; signal->theData[0] = 0; signal->theData[1] = nodePtr.i; sendSignal(CMVMI_REF, GSN_OPEN_COMREQ, signal, 2, JBA); +#endif return; case ZSTARTING: break; @@ -1766,11 +1790,27 @@ void Qmgr::execNDB_FAILCONF(Signal* signal) jamEntry(); failedNodePtr.i = signal->theData[0]; + + if (ERROR_INSERTED(930)) + { + CLEAR_ERROR_INSERT_VALUE; + infoEvent("Discarding NDB_FAILCONF for %u", failedNodePtr.i); + return; + } + ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); if (failedNodePtr.p->failState == WAITING_FOR_NDB_FAILCONF){ failedNodePtr.p->failState = NORMAL; } else { jam(); + + char buf[100]; + BaseString::snprintf(buf, 100, + "Received NDB_FAILCONF for node %u with state: %d %d", + failedNodePtr.i, + failedNodePtr.p->phase, + failedNodePtr.p->failState); + progError(__LINE__, 0, buf); systemErrorLab(signal, __LINE__); }//if if (cpresident == getOwnNodeId()) { @@ -2077,10 +2117,42 @@ void Qmgr::failReportLab(Signal* signal, Uint16 aFailedNode, ptrCheckGuard(failedNodePtr, MAX_NODES, nodeRec); if (failedNodePtr.i == getOwnNodeId()) { jam(); - systemErrorLab(signal, __LINE__); + + const char * msg = 0; + switch(aFailCause){ + case FailRep::ZOWN_FAILURE: + msg = "Own failure"; + break; + case FailRep::ZOTHER_NODE_WHEN_WE_START: + case FailRep::ZOTHERNODE_FAILED_DURING_START: + msg = "Other node died during start"; + break; + case FailRep::ZIN_PREP_FAIL_REQ: + msg = "Prep fail"; + break; + case FailRep::ZSTART_IN_REGREQ: + msg = "Start timeout"; + break; + case FailRep::ZHEARTBEAT_FAILURE: + msg = "Hearbeat failure"; + break; + case FailRep::ZLINK_FAILURE: + msg = "Connection failure"; + break; + } + + char buf[100]; + BaseString::snprintf(buf, 100, + "We(%u) have been declared dead by %u reason: %s(%u)", + getOwnNodeId(), + refToNode(signal->getSendersBlockRef()), + aFailCause, + msg ? msg : "<Unknown>"); + + progError(__LINE__, 0, buf); return; }//if - + myNodePtr.i = getOwnNodeId(); ptrCheckGuard(myNodePtr, MAX_NDB_NODES, nodeRec); if (myNodePtr.p->phase != ZRUNNING) { @@ -2791,6 +2863,7 @@ void Qmgr::failReport(Signal* signal, cfailureNr = cprepareFailureNr; ctoFailureNr = 0; ctoStatus = Q_ACTIVE; + c_start.reset(); // Don't take over nodes being started if (cnoCommitFailedNodes > 0) { jam(); /**----------------------------------------------------------------- diff --git a/ndb/src/ndbapi/NdbConnection.cpp b/ndb/src/ndbapi/NdbConnection.cpp index c9e26f8ccaf..9cd7d6ed42e 100644 --- a/ndb/src/ndbapi/NdbConnection.cpp +++ b/ndb/src/ndbapi/NdbConnection.cpp @@ -450,12 +450,12 @@ NdbConnection::executeNoBlobs(ExecType aTypeOfExec, //------------------------------------------------------------------------ Ndb* tNdb = theNdb; + Uint32 timeout = TransporterFacade::instance()->m_waitfor_timeout; m_waitForReply = false; executeAsynchPrepare(aTypeOfExec, NULL, NULL, abortOption); if (m_waitForReply){ while (1) { - int noOfComp = tNdb->sendPollNdb((3 * WAITFOR_RESPONSE_TIMEOUT), - 1, forceSend); + int noOfComp = tNdb->sendPollNdb(3 * timeout, 1, forceSend); if (noOfComp == 0) { /** * This timeout situation can occur if NDB crashes. diff --git a/ndb/src/ndbapi/Ndbif.cpp b/ndb/src/ndbapi/Ndbif.cpp index 3ebba7e1c4a..d753117aa9a 100644 --- a/ndb/src/ndbapi/Ndbif.cpp +++ b/ndb/src/ndbapi/Ndbif.cpp @@ -954,23 +954,25 @@ Ndb::pollCompleted(NdbConnection** aCopyArray) void Ndb::check_send_timeout() { + Uint32 timeout = TransporterFacade::instance()->m_waitfor_timeout; NDB_TICKS current_time = NdbTick_CurrentMillisecond(); if (current_time - the_last_check_time > 1000) { the_last_check_time = current_time; Uint32 no_of_sent = theNoOfSentTransactions; for (Uint32 i = 0; i < no_of_sent; i++) { NdbConnection* a_con = theSentTransactionsArray[i]; - if ((current_time - a_con->theStartTransTime) > - WAITFOR_RESPONSE_TIMEOUT) { + if ((current_time - a_con->theStartTransTime) > timeout) + { #ifdef VM_TRACE a_con->printState(); Uint32 t1 = a_con->theTransactionId; Uint32 t2 = a_con->theTransactionId >> 32; - ndbout_c("[%.8x %.8x]", t1, t2); - abort(); + ndbout_c("4012 [%.8x %.8x]", t1, t2); + //abort(); #endif + a_con->theReleaseOnClose = true; a_con->setOperationErrorCodeAbort(4012); - a_con->theCommitStatus = NdbConnection::Aborted; + a_con->theCommitStatus = NdbConnection::NeedAbort; a_con->theCompletionStatus = NdbConnection::CompletedFailure; a_con->handleExecuteCompletion(); remove_sent_list(i); diff --git a/ndb/src/ndbapi/TransporterFacade.cpp b/ndb/src/ndbapi/TransporterFacade.cpp index b6fb2d6cded..30d0eec1e4a 100644 --- a/ndb/src/ndbapi/TransporterFacade.cpp +++ b/ndb/src/ndbapi/TransporterFacade.cpp @@ -567,6 +567,19 @@ TransporterFacade::init(Uint32 nodeId, const ndb_mgm_configuration* props) } #endif + Uint32 timeout = 120000; + iter.first(); + for (iter.first(); iter.valid(); iter.next()) + { + Uint32 tmp1 = 0, tmp2 = 0; + iter.get(CFG_DB_TRANSACTION_CHECK_INTERVAL, &tmp1); + iter.get(CFG_DB_TRANSACTION_DEADLOCK_TIMEOUT, &tmp2); + tmp1 += tmp2; + if (tmp1 > timeout) + timeout = tmp1; + } + m_waitfor_timeout = timeout; + if (!theTransporterRegistry->start_service(m_socket_server)){ ndbout_c("Unable to start theTransporterRegistry->start_service"); DBUG_RETURN(false); diff --git a/ndb/src/ndbapi/TransporterFacade.hpp b/ndb/src/ndbapi/TransporterFacade.hpp index 99edea846c1..1e7377a3b4d 100644 --- a/ndb/src/ndbapi/TransporterFacade.hpp +++ b/ndb/src/ndbapi/TransporterFacade.hpp @@ -172,6 +172,7 @@ private: */ public: STATIC_CONST( MAX_NO_THREADS = 4711 ); + Uint32 m_waitfor_timeout; // in milli seconds... private: struct ThreadData { diff --git a/ndb/test/ndbapi/testNodeRestart.cpp b/ndb/test/ndbapi/testNodeRestart.cpp index a741e6233d9..eebd631af94 100644 --- a/ndb/test/ndbapi/testNodeRestart.cpp +++ b/ndb/test/ndbapi/testNodeRestart.cpp @@ -535,6 +535,52 @@ err: return NDBT_FAILED; } +int +runBug16772(NDBT_Context* ctx, NDBT_Step* step){ + + NdbRestarter restarter; + if (restarter.getNumDbNodes() < 2) + { + ctx->stopTest(); + return NDBT_OK; + } + + int aliveNodeId = restarter.getRandomNotMasterNodeId(rand()); + int deadNodeId = aliveNodeId; + while (deadNodeId == aliveNodeId) + deadNodeId = restarter.getDbNodeId(rand() % restarter.getNumDbNodes()); + + if (restarter.insertErrorInNode(aliveNodeId, 930)) + return NDBT_FAILED; + + if (restarter.restartOneDbNode(deadNodeId, + /** initial */ false, + /** nostart */ true, + /** abort */ true)) + return NDBT_FAILED; + + if (restarter.waitNodesNoStart(&deadNodeId, 1)) + return NDBT_FAILED; + + if (restarter.startNodes(&deadNodeId, 1)) + return NDBT_FAILED; + + // It should now be hanging since we throw away NDB_FAILCONF + int ret = restarter.waitNodesStartPhase(&deadNodeId, 1, 3, 10); + // So this should fail...i.e it should not reach startphase 3 + + // Now send a NDB_FAILCONF for deadNo + int dump[] = { 7020, 323, 252, 0 }; + dump[3] = deadNodeId; + if (restarter.dumpStateOneNode(aliveNodeId, dump, 4)) + return NDBT_FAILED; + + if (restarter.waitNodesStarted(&deadNodeId, 1)) + return NDBT_FAILED; + + return ret ? NDBT_OK : NDBT_FAILED; +} + NDBT_TESTSUITE(testNodeRestart); TESTCASE("NoLoad", @@ -820,6 +866,10 @@ TESTCASE("Bug15685", STEP(runBug15685); FINALIZER(runClearTable); } +TESTCASE("Bug16772", + "Test bug with restarting before NF handling is complete"){ + STEP(runBug16772); +} NDBT_TESTSUITE_END(testNodeRestart); int main(int argc, const char** argv){ diff --git a/ndb/test/ndbapi/testTimeout.cpp b/ndb/test/ndbapi/testTimeout.cpp index 71c11b25859..25392698642 100644 --- a/ndb/test/ndbapi/testTimeout.cpp +++ b/ndb/test/ndbapi/testTimeout.cpp @@ -24,6 +24,7 @@ #define TIMEOUT (Uint32)3000 Uint32 g_org_timeout = 3000; +Uint32 g_org_deadlock = 3000; int setTransactionTimeout(NDBT_Context* ctx, NDBT_Step* step){ @@ -59,6 +60,60 @@ resetTransactionTimeout(NDBT_Context* ctx, NDBT_Step* step){ return NDBT_OK; } +int +setDeadlockTimeout(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + int timeout = ctx->getProperty("TransactionDeadlockTimeout", TIMEOUT); + + NdbConfig conf(GETNDB(step)->getNodeId()+1); + unsigned int nodeId = conf.getMasterNodeId(); + if (!conf.getProperty(nodeId, + NODE_TYPE_DB, + CFG_DB_TRANSACTION_DEADLOCK_TIMEOUT, + &g_org_deadlock)) + return NDBT_FAILED; + + g_err << "Setting timeout: " << timeout << endl; + int val[] = { DumpStateOrd::TcSetTransactionTimeout, timeout }; + if(restarter.dumpStateAllNodes(val, 2) != 0){ + return NDBT_FAILED; + } + + return NDBT_OK; +} + +int +getDeadlockTimeout(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + + Uint32 val = 0; + NdbConfig conf(GETNDB(step)->getNodeId()+1); + unsigned int nodeId = conf.getMasterNodeId(); + if (!conf.getProperty(nodeId, + NODE_TYPE_DB, + CFG_DB_TRANSACTION_DEADLOCK_TIMEOUT, + &val)) + return NDBT_FAILED; + + if (val < 120000) + val = 120000; + ctx->setProperty("TransactionDeadlockTimeout", 4*val); + + return NDBT_OK; +} + +int +resetDeadlockTimeout(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + + int val[] = { DumpStateOrd::TcSetTransactionTimeout, g_org_deadlock }; + if(restarter.dumpStateAllNodes(val, 2) != 0){ + return NDBT_FAILED; + } + + return NDBT_OK; +} + int runLoadTable(NDBT_Context* ctx, NDBT_Step* step){ @@ -374,6 +429,43 @@ int runBuddyTransNoTimeout(NDBT_Context* ctx, NDBT_Step* step){ return result; } +int +runError4012(NDBT_Context* ctx, NDBT_Step* step){ + int result = NDBT_OK; + int loops = ctx->getNumLoops(); + int stepNo = step->getStepNo(); + + int timeout = ctx->getProperty("TransactionDeadlockTimeout", TIMEOUT); + + HugoOperations hugoOps(*ctx->getTab()); + Ndb* pNdb = GETNDB(step); + + do{ + // Commit transaction + CHECK(hugoOps.startTransaction(pNdb) == 0); + CHECK(hugoOps.pkUpdateRecord(pNdb, 0) == 0); + int ret = hugoOps.execute_NoCommit(pNdb); + if (ret == 0) + { + int sleep = timeout; + ndbout << "Sleeping for " << sleep << " milliseconds" << endl; + NdbSleep_MilliSleep(sleep); + + // Expect that transaction has NOT timed-out + CHECK(hugoOps.execute_Commit(pNdb) == 0); + } + else + { + CHECK(ret == 4012); + } + } while(false); + + hugoOps.closeTransaction(pNdb); + + return result; +} + + NDBT_TESTSUITE(testTimeout); TESTCASE("DontTimeoutTransaction", "Test that the transaction does not timeout "\ @@ -465,6 +557,15 @@ TESTCASE("BuddyTransNoTimeout5", FINALIZER(resetTransactionTimeout); FINALIZER(runClearTable); } +TESTCASE("Error4012", ""){ + TC_PROPERTY("TransactionDeadlockTimeout", 120000); + INITIALIZER(runLoadTable); + INITIALIZER(getDeadlockTimeout); + INITIALIZER(setDeadlockTimeout); + STEPS(runError4012, 2); + FINALIZER(runClearTable); +} + NDBT_TESTSUITE_END(testTimeout); int main(int argc, const char** argv){ diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt index 6378b4a06d3..70518f7881d 100644 --- a/ndb/test/run-test/daily-basic-tests.txt +++ b/ndb/test/run-test/daily-basic-tests.txt @@ -236,6 +236,10 @@ max-time: 500 cmd: testTimeout args: -n TimeoutRandTransaction T1 +max-time: 600 +cmd: testTimeout +args: -n Error4012 T1 + # SCAN TESTS # max-time: 500 @@ -446,6 +450,10 @@ max-time: 500 cmd: testNodeRestart args: -n Bug15685 T1 +max-time: 500 +cmd: testNodeRestart +args: -n Bug16772 T1 + # OLD FLEX max-time: 500 cmd: flexBench |