diff options
author | unknown <jonas@perch.ndb.mysql.com> | 2006-03-21 14:47:10 +0100 |
---|---|---|
committer | unknown <jonas@perch.ndb.mysql.com> | 2006-03-21 14:47:10 +0100 |
commit | 8ed36cb667b675244f55072cefa15fb65ec89ee7 (patch) | |
tree | ea6f3eb4ed5449560230a164a6976d3cf855dbd0 /ndb | |
parent | 591aedaa2b594fdccca79a09c846cf4b4490f884 (diff) | |
download | mariadb-git-8ed36cb667b675244f55072cefa15fb65ec89ee7.tar.gz |
ndb - bug#18385
Partial system restart: a node must not try to start with a higher GCI than its own,
even if it knows about a higher number
ndb/include/kernel/signaldata/DumpStateOrd.hpp:
Add new dump for setting time between gcp
ndb/include/kernel/signaldata/StartPerm.hpp:
Move error codes into StartPerm + Add new error code
ndb/src/kernel/blocks/ERROR_codes.txt:
Add new error insert
ndb/src/kernel/blocks/dbdih/Dbdih.hpp:
Move error codes into StartPerm + Add new error code
ndb/src/kernel/blocks/dbdih/DbdihMain.cpp:
Fix so that we don't try to restart to a GCI newer than our own when doing a partial start
Add new error code returned when a node whose GCI was invalidated later tries to join
ndb/test/include/NdbRestarter.hpp:
Add new method for selecting a random node in the same node group
ndb/test/ndbapi/testSystemRestart.cpp:
Add new testcase for bug#18385
ndb/test/run-test/daily-basic-tests.txt:
Run test in daily-basic
ndb/test/src/NdbRestarter.cpp:
Add new method for selecting a random node in the same node group
Diffstat (limited to 'ndb')
-rw-r--r-- | ndb/include/kernel/signaldata/DumpStateOrd.hpp | 1 | ||||
-rw-r--r-- | ndb/include/kernel/signaldata/StartPerm.hpp | 6 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/ERROR_codes.txt | 2 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/dbdih/Dbdih.hpp | 1 | ||||
-rw-r--r-- | ndb/src/kernel/blocks/dbdih/DbdihMain.cpp | 99 | ||||
-rw-r--r-- | ndb/test/include/NdbRestarter.hpp | 1 | ||||
-rw-r--r-- | ndb/test/ndbapi/testSystemRestart.cpp | 53 | ||||
-rw-r--r-- | ndb/test/run-test/daily-basic-tests.txt | 4 | ||||
-rw-r--r-- | ndb/test/src/NdbRestarter.cpp | 33 |
9 files changed, 177 insertions, 23 deletions
diff --git a/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/ndb/include/kernel/signaldata/DumpStateOrd.hpp index 4dd22cf5092..2c824670cef 100644 --- a/ndb/include/kernel/signaldata/DumpStateOrd.hpp +++ b/ndb/include/kernel/signaldata/DumpStateOrd.hpp @@ -127,6 +127,7 @@ public: DihMinTimeBetweenLCP = 7017, DihMaxTimeBetweenLCP = 7018, EnableUndoDelayDataWrite = 7080, // DIH+ACC+TUP + DihSetTimeBetweenGcp = 7090, DihStartLcpImmediately = 7099, // 8000 Suma // 12000 Tux diff --git a/ndb/include/kernel/signaldata/StartPerm.hpp b/ndb/include/kernel/signaldata/StartPerm.hpp index 38be72835a3..63e01ed3868 100644 --- a/ndb/include/kernel/signaldata/StartPerm.hpp +++ b/ndb/include/kernel/signaldata/StartPerm.hpp @@ -64,5 +64,11 @@ private: Uint32 startingNodeId; Uint32 errorCode; + + enum ErrorCode + { + ZNODE_ALREADY_STARTING_ERROR = 305, + InitialStartRequired = 320 + }; }; #endif diff --git a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt index 62481837c14..e5576450846 100644 --- a/ndb/src/kernel/blocks/ERROR_codes.txt +++ b/ndb/src/kernel/blocks/ERROR_codes.txt @@ -303,6 +303,8 @@ Test Crashes in handling node restarts 7131: Crash when receiving START_COPYREQ in master node 7132: Crash when receiving START_COPYCONF in starting node +7170: Crash when receiving START_PERMREF (InitialStartRequired) + DICT: 6000 Crash during NR when receiving DICTSTARTREQ 6001 Crash during NR when receiving SCHEMA_INFO diff --git a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp index f74c0f36c4d..78acf1ffd19 100644 --- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp +++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp @@ -81,7 +81,6 @@ #define ZWRONG_FAILURE_NUMBER_ERROR 302 #define ZWRONG_START_NODE_ERROR 303 #define ZNO_REPLICA_FOUND_ERROR 304 -#define ZNODE_ALREADY_STARTING_ERROR 305 #define ZNODE_START_DISALLOWED_ERROR 309 // -------------------------------------- diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp 
b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp index fab428aadef..eb4ae61a3e4 100644 --- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp +++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp @@ -1420,6 +1420,33 @@ void Dbdih::ndbStartReqLab(Signal* signal, BlockReference ref) return; } + NodeRecordPtr nodePtr; + Uint32 gci = SYSFILE->lastCompletedGCI[getOwnNodeId()]; + for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) + { + jam(); + ptrAss(nodePtr, nodeRecord); + if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) + { + jam(); + /** + * Since we're starting(is master) and there + * there are other nodes with higher GCI... + * there gci's must be invalidated... + * and they _must_ do an initial start + * indicate this by setting lastCompletedGCI = 0 + */ + SYSFILE->lastCompletedGCI[nodePtr.i] = 0; + ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE); + warningEvent("Making filesystem for node %d unusable", + nodePtr.i); + } + } + /** + * This set which GCI we will try to restart to + */ + SYSFILE->newestRestorableGCI = gci; + ndbrequire(isMaster()); copyGciLab(signal, CopyGCIReq::RESTART); // We have already read the file! }//Dbdih::ndbStartReqLab() @@ -1557,7 +1584,7 @@ void Dbdih::execSTART_PERMREF(Signal* signal) { jamEntry(); Uint32 errorCode = signal->theData[1]; - if (errorCode == ZNODE_ALREADY_STARTING_ERROR) { + if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) { jam(); /*-----------------------------------------------------------------------*/ // The master was busy adding another node. 
We will wait for a second and @@ -1567,6 +1594,20 @@ void Dbdih::execSTART_PERMREF(Signal* signal) sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1); return; }//if + + if (errorCode == StartPermRef::InitialStartRequired) + { + CRASH_INSERTION(7170); + char buf[255]; + BaseString::snprintf(buf, sizeof(buf), + "Cluster requires this node to be started " + " with --initial as partial start has been performed" + " and this filesystem is unusable"); + progError(__LINE__, + ERR_SR_RESTARTCONFLICT, + buf); + ndbrequire(false); + } /*------------------------------------------------------------------------*/ // Some node process in another node involving our node was still active. We // will recover from this by crashing here. @@ -1657,7 +1698,7 @@ void Dbdih::execSTART_PERMREQ(Signal* signal) (c_nodeStartMaster.wait != ZFALSE)) { jam(); signal->theData[0] = nodeId; - signal->theData[1] = ZNODE_ALREADY_STARTING_ERROR; + signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR; sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB); return; }//if @@ -1667,6 +1708,16 @@ void Dbdih::execSTART_PERMREQ(Signal* signal) ndbrequire(false); }//if + if (SYSFILE->lastCompletedGCI[nodeId] == 0 && + typeStart != NodeState::ST_INITIAL_NODE_RESTART) + { + jam(); + signal->theData[0] = nodeId; + signal->theData[1] = StartPermRef::InitialStartRequired; + sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB); + return; + } + /*---------------------------------------------------------------------- * WE START THE INCLUSION PROCEDURE * ---------------------------------------------------------------------*/ @@ -3515,24 +3566,12 @@ void Dbdih::closingGcpLab(Signal* signal, FileRecordPtr filePtr) /* ------------------------------------------------------------------------- */ void Dbdih::selectMasterCandidateAndSend(Signal* signal) { - Uint32 gci = 0; - Uint32 masterCandidateId = 0; - NodeRecordPtr nodePtr; - for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { - 
jam(); - ptrAss(nodePtr, nodeRecord); - if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) { - jam(); - masterCandidateId = nodePtr.i; - gci = SYSFILE->lastCompletedGCI[nodePtr.i]; - }//if - }//for - ndbrequire(masterCandidateId != 0); setNodeGroups(); - signal->theData[0] = masterCandidateId; - signal->theData[1] = gci; + signal->theData[0] = getOwnNodeId(); + signal->theData[1] = SYSFILE->lastCompletedGCI[getOwnNodeId()]; sendSignal(cntrlblockref, GSN_DIH_RESTARTCONF, signal, 2, JBB); - + + NodeRecordPtr nodePtr; Uint32 node_groups[MAX_NDB_NODES]; memset(node_groups, 0, sizeof(node_groups)); for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) { @@ -3550,10 +3589,10 @@ void Dbdih::selectMasterCandidateAndSend(Signal* signal) if(count != 0 && count != cnoReplicas){ char buf[255]; BaseString::snprintf(buf, sizeof(buf), - "Illegal configuration change." - " Initial start needs to be performed " - " when changing no of replicas (%d != %d)", - node_groups[nodePtr.i], cnoReplicas); + "Illegal configuration change." 
+ " Initial start needs to be performed " + " when changing no of replicas (%d != %d)", + node_groups[nodePtr.i], cnoReplicas); progError(__LINE__, ERR_INVALID_CONFIG, buf); @@ -13359,6 +13398,22 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal) c_lcpState.ctimer += (1 << c_lcpState.clcpDelay); return; } + + if (dumpState->args[0] == DumpStateOrd::DihSetTimeBetweenGcp) + { + if (signal->getLength() == 1) + { + const ndb_mgm_configuration_iterator * p = + theConfiguration.getOwnConfigIterator(); + ndbrequire(p != 0); + ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &cgcpDelay); + } + else + { + cgcpDelay = signal->theData[1]; + } + ndbout_c("Setting time between gcp : %d", cgcpDelay); + } }//Dbdih::execDUMP_STATE_ORD() void diff --git a/ndb/test/include/NdbRestarter.hpp b/ndb/test/include/NdbRestarter.hpp index 19a88b4f8ad..3ec92ae786e 100644 --- a/ndb/test/include/NdbRestarter.hpp +++ b/ndb/test/include/NdbRestarter.hpp @@ -62,6 +62,7 @@ public: int dumpStateAllNodes(int * _args, int _num_args); int getMasterNodeId(); + int getRandomNodeSameNodeGroup(int nodeId, int randomNumber); int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber); int getRandomNotMasterNodeId(int randomNumber); diff --git a/ndb/test/ndbapi/testSystemRestart.cpp b/ndb/test/ndbapi/testSystemRestart.cpp index 35016896495..30f7aca9b06 100644 --- a/ndb/test/ndbapi/testSystemRestart.cpp +++ b/ndb/test/ndbapi/testSystemRestart.cpp @@ -1051,6 +1051,52 @@ int runSystemRestart9(NDBT_Context* ctx, NDBT_Step* step){ return result; } +int runBug18385(NDBT_Context* ctx, NDBT_Step* step){ + NdbRestarter restarter; + const Uint32 nodeCount = restarter.getNumDbNodes(); + if(nodeCount < 2){ + g_info << "Bug18385 - Needs atleast 2 nodes to test" << endl; + return NDBT_OK; + } + + int node1 = restarter.getDbNodeId(rand() % nodeCount); + int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand()); + + if (node1 == -1 || node2 == -1) + return NDBT_OK; + + int dump[] = { DumpStateOrd::DihSetTimeBetweenGcp, 
300 }; + + int result = NDBT_OK; + do { + CHECK(restarter.dumpStateAllNodes(dump, 2) == 0); + CHECK(restarter.restartOneDbNode(node1, false, true, false) == 0); + NdbSleep_SecSleep(3); + CHECK(restarter.restartAll(false, true, false) == 0); + + Uint32 cnt = 0; + int nodes[128]; + for(Uint32 i = 0; i<nodeCount; i++) + if ((nodes[cnt] = restarter.getDbNodeId(i)) != node2) + cnt++; + + assert(cnt == nodeCount - 1); + + CHECK(restarter.startNodes(nodes, cnt) == 0); + CHECK(restarter.waitNodesStarted(nodes, cnt, 300) == 0); + + CHECK(restarter.insertErrorInNode(node2, 7170) == 0); + CHECK(restarter.waitNodesNoStart(&node2, 1) == 0); + CHECK(restarter.restartOneDbNode(node2, true, false, true) == 0); + CHECK(restarter.waitNodesStarted(&node2, 1) == 0); + + } while(0); + + g_info << "Bug18385 finished" << endl; + + return result; +} + int runWaitStarted(NDBT_Context* ctx, NDBT_Step* step){ NdbRestarter restarter; @@ -1234,6 +1280,13 @@ TESTCASE("SR9", STEP(runSystemRestart9); FINALIZER(runClearTable); } +TESTCASE("Bug18385", + "Perform partition system restart with other nodes with higher GCI"){ + INITIALIZER(runWaitStarted); + INITIALIZER(runClearTable); + STEP(runBug18385); + FINALIZER(runClearTable); +} NDBT_TESTSUITE_END(testSystemRestart); int main(int argc, const char** argv){ diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt index 70518f7881d..0533d585a41 100644 --- a/ndb/test/run-test/daily-basic-tests.txt +++ b/ndb/test/run-test/daily-basic-tests.txt @@ -454,6 +454,10 @@ max-time: 500 cmd: testNodeRestart args: -n Bug16772 T1 +max-time: 500 +cmd: testSystemRestart +args: -n Bug18385 T1 + # OLD FLEX max-time: 500 cmd: flexBench diff --git a/ndb/test/src/NdbRestarter.cpp b/ndb/test/src/NdbRestarter.cpp index 91c0963feae..2c16a05240d 100644 --- a/ndb/test/src/NdbRestarter.cpp +++ b/ndb/test/src/NdbRestarter.cpp @@ -174,6 +174,39 @@ NdbRestarter::getRandomNodeOtherNodeGroup(int nodeId, int rand){ return -1; } +int 
+NdbRestarter::getRandomNodeSameNodeGroup(int nodeId, int rand){ + if (!isConnected()) + return -1; + + if (getStatus() != 0) + return -1; + + int node_group = -1; + for(size_t i = 0; i < ndbNodes.size(); i++){ + if(ndbNodes[i].node_id == nodeId){ + node_group = ndbNodes[i].node_group; + break; + } + } + if(node_group == -1){ + return -1; + } + + Uint32 counter = 0; + rand = rand % ndbNodes.size(); + while(counter++ < ndbNodes.size() && + (ndbNodes[rand].node_id == nodeId || + ndbNodes[rand].node_group != node_group)) + rand = (rand + 1) % ndbNodes.size(); + + if(ndbNodes[rand].node_group == node_group && + ndbNodes[rand].node_id != nodeId) + return ndbNodes[rand].node_id; + + return -1; +} + int NdbRestarter::waitClusterStarted(unsigned int _timeout){ return waitClusterState(NDB_MGM_NODE_STATUS_STARTED, _timeout); |