/* Copyright (C) 2003 MySQL AB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include int restartRandomNodeGraceful(NdbRestarter&, const NdbRestarts::NdbRestart*); int restartRandomNodeAbort(NdbRestarter&, const NdbRestarts::NdbRestart*); int restartRandomNodeError(NdbRestarter&, const NdbRestarts::NdbRestart*); int restartRandomNodeInitial(NdbRestarter&, const NdbRestarts::NdbRestart*); int restartNFDuringNR(NdbRestarter&, const NdbRestarts::NdbRestart*); int restartMasterNodeError(NdbRestarter&, const NdbRestarts::NdbRestart*); int twoNodeFailure(NdbRestarter&, const NdbRestarts::NdbRestart*); int fiftyPercentFail(NdbRestarter&, const NdbRestarts::NdbRestart*); int twoMasterNodeFailure(NdbRestarter&, const NdbRestarts::NdbRestart*); int restartAllNodesGracfeul(NdbRestarter&, const NdbRestarts::NdbRestart*); int restartAllNodesAbort(NdbRestarter&, const NdbRestarts::NdbRestart*); int restartAllNodesError9999(NdbRestarter&, const NdbRestarts::NdbRestart*); int fiftyPercentStopAndWait(NdbRestarter&, const NdbRestarts::NdbRestart*); int restartNodeDuringLCP(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart); int stopOnError(NdbRestarter&, const NdbRestarts::NdbRestart*); int getRandomNodeId(NdbRestarter& _restarter); /** * Define list of restarts * - name of restart * - function perfoming the restart * - required number of nodes * - ... * - arg1, used depending of restart * - arg2, used depending of restart */ const NdbRestarts::NdbRestart NdbRestarts::m_restarts[] = { /********************************************************* * * NODE RESTARTS with 1 node restarted * *********************************************************/ /** * Restart a randomly selected node * with graceful shutdown */ NdbRestart("RestartRandomNode", NODE_RESTART, restartRandomNodeGraceful, 2), /** * Restart a randomly selected node * with immediate(abort) shutdown */ NdbRestart("RestartRandomNodeAbort", NODE_RESTART, restartRandomNodeAbort, 2), /** * Restart a randomly selected node * with error insert * */ NdbRestart("RestartRandomNodeError", NODE_RESTART, restartRandomNodeError, 2), /** * Restart the master node * with error insert */ NdbRestart("RestartMasterNodeError", NODE_RESTART, restartMasterNodeError, 2), /** * Restart a randomly selected node without fileystem * */ NdbRestart("RestartRandomNodeInitial", NODE_RESTART, restartRandomNodeInitial, 2), /** * Restart a randomly selected node and then * crash it while restarting * */ NdbRestart("RestartNFDuringNR", NODE_RESTART, restartNFDuringNR, 2), /** * Set StopOnError and crash the node by sending * SYSTEM_ERROR to it * */ NdbRestart("StopOnError", NODE_RESTART, stopOnError, 1), /********************************************************* * * MULTIPLE NODE RESTARTS with more than 1 node * *********************************************************/ /** * 2 nodes restart, select nodes to restart randomly and restart * with a small random delay between restarts */ NdbRestart("TwoNodeFailure", MULTIPLE_NODE_RESTART, twoNodeFailure, 4), /** * 2 nodes restart, select master nodes and restart with * a small random delay between restarts */ NdbRestart("TwoMasterNodeFailure", MULTIPLE_NODE_RESTART, twoMasterNodeFailure, 4), NdbRestart("FiftyPercentFail", MULTIPLE_NODE_RESTART, fiftyPercentFail, 2), /********************************************************* * * SYSTEM RESTARTS * *********************************************************/ /** * Restart all nodes with graceful shutdown * */ NdbRestart("RestartAllNodes", SYSTEM_RESTART, restartAllNodesGracfeul, 1), /** * Restart all nodes immediately without * graful shutdown */ NdbRestart("RestartAllNodesAbort", SYSTEM_RESTART, restartAllNodesAbort, 1), /** * Restart all nodes with error insert 9999 * TODO! We can later add more errors like 9998, 9997 etc. */ NdbRestart("RestartAllNodesError9999", SYSTEM_RESTART, restartAllNodesError9999, 1), /** * Stop 50% of all nodes with error insert 9999 * Wait for a random number of minutes * Stop the rest of the nodes and then start all again */ NdbRestart("FiftyPercentStopAndWait", SYSTEM_RESTART, fiftyPercentStopAndWait, 2), /** * Restart a master node during LCP with error inserts. */ NdbRestart("RestartNodeDuringLCP", NODE_RESTART, restartNodeDuringLCP, 2), }; const int NdbRestarts::m_NoOfRestarts = sizeof(m_restarts) / sizeof(NdbRestart); const NdbRestarts::NdbErrorInsert NdbRestarts::m_errors[] = { NdbErrorInsert("Error9999", 9999) }; const int NdbRestarts::m_NoOfErrors = sizeof(m_errors) / sizeof(NdbErrorInsert); NdbRestarts::NdbRestart::NdbRestart(const char* _name, NdbRestartType _type, restartFunc* _func, int _requiredNodes, int _arg1){ m_name = _name; m_type = _type; m_restartFunc = _func; m_numRequiredNodes = _requiredNodes; // m_arg1 = arg1; }; int NdbRestarts::getNumRestarts(){ return m_NoOfRestarts; } const NdbRestarts::NdbRestart* NdbRestarts::getRestart(int _num){ if (_num >= m_NoOfRestarts) return NULL; return &m_restarts[_num]; } const NdbRestarts::NdbRestart* NdbRestarts::getRestart(const char* _name){ for(int i = 0; i < m_NoOfRestarts; i++){ if (strcmp(m_restarts[i].m_name, _name) == 0){ return &m_restarts[i]; } } g_err << "The restart \""<< _name << "\" not found in NdbRestarts" << endl; return NULL; } int NdbRestarts::executeRestart(const NdbRestarts::NdbRestart* _restart, unsigned int _timeout){ // Check that there are enough nodes in the cluster // for this test NdbRestarter restarter; if (_restart->m_numRequiredNodes > restarter.getNumDbNodes()){ g_err << "This test requires " << _restart->m_numRequiredNodes << " nodes " << "there are only "<< restarter.getNumDbNodes() <<" nodes in cluster" << endl; return NDBT_OK; } if (restarter.waitClusterStarted(120) != 0){ // If cluster is not started when we shall peform restart // the restart can not be executed and the test fails return NDBT_FAILED; } int res = _restart->m_restartFunc(restarter, _restart); // Sleep a little waiting for nodes to react to command NdbSleep_SecSleep(2); if (_timeout == 0){ // If timeout == 0 wait for ever while(restarter.waitClusterStarted(60) != 0) g_err << "Cluster is not started after restart. Waiting 60s more..." << endl; } else { if (restarter.waitClusterStarted(_timeout) != 0){ g_err<<"Cluster failed to start" << endl; res = NDBT_FAILED; } } return res; } int NdbRestarts::executeRestart(int _num, unsigned int _timeout){ const NdbRestarts::NdbRestart* r = getRestart(_num); if (r == NULL) return NDBT_FAILED; int res = executeRestart(r, _timeout); return res; } int NdbRestarts::executeRestart(const char* _name, unsigned int _timeout){ const NdbRestarts::NdbRestart* r = getRestart(_name); if (r == NULL) return NDBT_FAILED; int res = executeRestart(r, _timeout); return res; } void NdbRestarts::listRestarts(NdbRestartType _type){ for(int i = 0; i < m_NoOfRestarts; i++){ if (m_restarts[i].m_type == _type) ndbout << " " << m_restarts[i].m_name << ", min " << m_restarts[i].m_numRequiredNodes << " nodes"<< endl; } } void NdbRestarts::listRestarts(){ ndbout << "NODE RESTARTS" << endl; listRestarts(NODE_RESTART); ndbout << "MULTIPLE NODE RESTARTS" << endl; listRestarts(MULTIPLE_NODE_RESTART); ndbout << "SYSTEM RESTARTS" << endl; listRestarts(SYSTEM_RESTART); } NdbRestarts::NdbErrorInsert::NdbErrorInsert(const char* _name, int _errorNo){ m_name = _name; m_errorNo = _errorNo; } int NdbRestarts::getNumErrorInserts(){ return m_NoOfErrors; } const NdbRestarts::NdbErrorInsert* NdbRestarts::getError(int _num){ if (_num >= m_NoOfErrors) return NULL; return &m_errors[_num]; } const NdbRestarts::NdbErrorInsert* NdbRestarts::getRandomError(){ int randomId = myRandom48(m_NoOfErrors); return &m_errors[randomId]; } /** * * IMPLEMENTATION OF THE DIFFERENT RESTARTS * Each function should perform it's action * and the returned NDBT_OK or NDBT_FAILED * */ #define CHECK(b, m) { int _xx = b; if (!(_xx)) { \ ndbout << "ERR: "<< m \ << " " << "File: " << __FILE__ \ << " (Line: " << __LINE__ << ")" << "- " << _xx << endl; \ return NDBT_FAILED; } } int restartRandomNodeGraceful(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart){ myRandom48Init(NdbTick_CurrentMillisecond()); int randomId = myRandom48(_restarter.getNumDbNodes()); int nodeId = _restarter.getDbNodeId(randomId); g_info << _restart->m_name << ": node = "<m_name << ": node = "<m_name << ": node = "<m_name << ": node = "<m_name << ": node = "<m_name << ": node = "<< nodeId << endl; CHECK(_restarter.insertErrorInNode(nodeId, 9999) == 0, "Could not restart node "<< nodeId); // Create random value, max 10 secs int max = 10; int seconds = (myRandom48(max)) + 1; g_info << "Waiting for " << seconds << "(" << max << ") secs " << endl; NdbSleep_SecSleep(seconds); randomId = (rand() % _restarter.getNumDbNodes()); nodeId = _restarter.getDbNodeId(randomId); g_info << _restart->m_name << ": node = "<< nodeId << endl; CHECK(_restarter.insertErrorInNode(nodeId, 9999) == 0, "Could not restart node "<< nodeId); return NDBT_OK; }; int twoMasterNodeFailure(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart){ int nodeId = _restarter.getDbNodeId(0); g_info << _restart->m_name << ": node = "<< nodeId << endl; CHECK(_restarter.insertErrorInNode(nodeId, 39999) == 0, "Could not restart node "<< nodeId); // Create random value, max 10 secs int max = 10; int seconds = (myRandom48(max)) + 1; g_info << "Waiting for " << seconds << "(" << max << ") secs " << endl; NdbSleep_SecSleep(seconds); nodeId = _restarter.getDbNodeId(0); g_info << _restart->m_name << ": node = "<< nodeId << endl; CHECK(_restarter.insertErrorInNode(nodeId, 39999) == 0, "Could not restart node "<< nodeId); return NDBT_OK; }; int get50PercentOfNodes(NdbRestarter& restarter, int * _nodes){ // For now simply return all nodes with even node id // TODO Check nodegroup and return one node from each int num50Percent = restarter.getNumDbNodes() / 2; assert(num50Percent <= MAX_NDB_NODES); // Calculate which nodes to stop, select all even nodes for (int i = 0; i < num50Percent; i++){ _nodes[i] = restarter.getDbNodeId(i*2); } return num50Percent; } int fiftyPercentFail(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart){ int nodes[MAX_NDB_NODES]; int numNodes = get50PercentOfNodes(_restarter, nodes); // Stop the nodes, with nostart and abort for (int i = 0; i < numNodes; i++){ g_info << "Stopping node "<< nodes[i] << endl; int res = _restarter.restartOneDbNode(nodes[i], false, true, true); CHECK(res == 0, "Could not stop node: "<< nodes[i]); } CHECK(_restarter.waitNodesNoStart(nodes, numNodes) == 0, "waitNodesNoStart"); // Order all nodes to start ndbout << "Starting all nodes" << endl; CHECK(_restarter.startAll() == 0, "Could not start all nodes"); return NDBT_OK; }; int restartAllNodesGracfeul(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart){ g_info << _restart->m_name << endl; // Restart graceful CHECK(_restarter.restartAll() == 0, "Could not restart all nodes"); return NDBT_OK; }; int restartAllNodesAbort(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart){ g_info << _restart->m_name << endl; // Restart abort CHECK(_restarter.restartAll(false, false, true) == 0, "Could not restart all nodes"); return NDBT_OK; }; int restartAllNodesError9999(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart){ g_info << _restart->m_name << endl; // Restart with error insert CHECK(_restarter.insertErrorInAllNodes(9999) == 0, "Could not restart all nodes "); return NDBT_OK; }; int fiftyPercentStopAndWait(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart){ int nodes[MAX_NDB_NODES]; int numNodes = get50PercentOfNodes(_restarter, nodes); // Stop the nodes, with nostart and abort for (int i = 0; i < numNodes; i++){ g_info << "Stopping node "<m_name << endl; return NDBT_OK; }; int NFDuringNR_codes[] = { 7121, 5027, 7172, 6000, 6001, 6002, 7171, 7130, 7133, 7138, 7154, 7144, 5026, 7139, 7132, //LCP 8000, 8001, 5010, 7022, 7024, 7016, 7017, 5002 }; int restartNFDuringNR(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart){ myRandom48Init(NdbTick_CurrentMillisecond()); int i; const int sz = sizeof(NFDuringNR_codes)/sizeof(NFDuringNR_codes[0]); for(i = 0; im_name << ": node = " << nodeId << " error code = " << error << endl; CHECK(_restarter.restartOneDbNode(nodeId, false, true, true) == 0, "Could not restart node "<< nodeId); CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0, "waitNodesNoStart failed"); int val = DumpStateOrd::CmvmiSetRestartOnErrorInsert; CHECK(_restarter.dumpStateOneNode(nodeId, &val, 1) == 0, "failed to set RestartOnErrorInsert"); CHECK(_restarter.insertErrorInNode(nodeId, error) == 0, "failed to set error insert"); CHECK(_restarter.startNodes(&nodeId, 1) == 0, "failed to start node"); NdbSleep_SecSleep(3); //CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0, // "waitNodesNoStart failed"); _restarter.waitNodesNoStart(&nodeId, 1); CHECK(_restarter.startNodes(&nodeId, 1) == 0, "failed to start node"); CHECK(_restarter.waitNodesStarted(&nodeId, 1) == 0, "waitNodesStarted failed"); } return NDBT_OK; if(_restarter.getNumDbNodes() < 4) return NDBT_OK; char buf[256]; if(NdbEnv_GetEnv("USER", buf, 256) == 0 || strcmp(buf, "ejonore") != 0) return NDBT_OK; for(i = 0; i 0, "getMasterNodeId failed"); int crashNodeId = 0; do { int rand = myRandom48(1000); crashNodeId = _restarter.getRandomNodeOtherNodeGroup(nodeId, rand); } while(crashNodeId == masterNodeId); CHECK(crashNodeId > 0, "getMasterNodeId failed"); g_info << _restart->m_name << " restarting node = " << nodeId << " error code = " << error << " crash node = " << crashNodeId << endl; CHECK(_restarter.restartOneDbNode(nodeId, false, true, true) == 0, "Could not restart node "<< nodeId); CHECK(_restarter.waitNodesNoStart(&nodeId, 1) == 0, "waitNodesNoStart failed"); int val = DumpStateOrd::CmvmiSetRestartOnErrorInsert; CHECK(_restarter.dumpStateOneNode(crashNodeId, &val, 2) == 0, "failed to set RestartOnErrorInsert"); CHECK(_restarter.insertErrorInNode(crashNodeId, error) == 0, "failed to set error insert"); CHECK(_restarter.startNodes(&nodeId, 1) == 0, "failed to start node"); CHECK(_restarter.waitClusterStarted() == 0, "waitClusterStarted failed"); } return NDBT_OK; }; int NRDuringLCP_Master_codes[] = { 7009, // Insert system error in master when local checkpoint is idle. 7010, // Insert system error in master when local checkpoint is in the // state clcpStatus = CALCULATE_KEEP_GCI. 7013, // Insert system error in master when local checkpoint is in the // state clcpStatus = COPY_GCI before sending COPY_GCIREQ. 7014, // Insert system error in master when local checkpoint is in the // state clcpStatus = TC_CLOPSIZE before sending TC_CLOPSIZEREQ. 7015, // Insert system error in master when local checkpoint is in the // state clcpStatus = START_LCP_ROUND before sending START_LCP_ROUND. 7019, // Insert system error in master when local checkpoint is in the // state clcpStatus = IDLE before sending CONTINUEB(ZCHECK_TC_COUNTER). 7075, // Master. Don't send any LCP_FRAG_ORD(last=true) // And crash when all have "not" been sent 7021, // Crash in master when receiving START_LCP_REQ 7023, // Crash in master when sending START_LCP_CONF 7025, // Crash in master when receiving LCP_FRAG_REP 7026, // Crash in master when changing state to LCP_TAB_COMPLETED 7027 // Crash in master when changing state to LCP_TAB_SAVED }; int NRDuringLCP_NonMaster_codes[] = { 7020, // Insert system error in local checkpoint participant at reception // of COPY_GCIREQ. 8000, // Crash particpant when receiving TCGETOPSIZEREQ 8001, // Crash particpant when receiving TC_CLOPSIZEREQ 5010, // Crash any when receiving LCP_FRAGORD 7022, // Crash in !master when receiving START_LCP_REQ 7024, // Crash in !master when sending START_LCP_CONF 7016, // Crash in !master when receiving LCP_FRAG_REP 7017, // Crash in !master when changing state to LCP_TAB_COMPLETED 7018 // Crash in !master when changing state to LCP_TAB_SAVED }; int restartNodeDuringLCP(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart) { int i; // Master int val = DumpStateOrd::DihMinTimeBetweenLCP; CHECK(_restarter.dumpStateAllNodes(&val, 1) == 0, "Failed to set LCP to min value"); // Set LCP to min val int sz = sizeof(NRDuringLCP_Master_codes)/ sizeof(NRDuringLCP_Master_codes[0]); for(i = 0; i 0, "getMasterNodeId failed"); ndbout << _restart->m_name << " restarting master node = " << masterNodeId << " error code = " << error << endl; { int val = DumpStateOrd::CmvmiSetRestartOnErrorInsert; CHECK(_restarter.dumpStateAllNodes(&val, 1) == 0, "failed to set RestartOnErrorInsert"); } CHECK(_restarter.insertErrorInNode(masterNodeId, error) == 0, "failed to set error insert"); CHECK(_restarter.waitNodesNoStart(&masterNodeId, 1, 300) == 0, "failed to wait no start"); CHECK(_restarter.startNodes(&masterNodeId, 1) == 0, "failed to start node"); CHECK(_restarter.waitClusterStarted(300) == 0, "waitClusterStarted failed"); { int val = DumpStateOrd::DihMinTimeBetweenLCP; CHECK(_restarter.dumpStateOneNode(masterNodeId, &val, 1) == 0, "failed to set error insert"); } } // NON-Master sz = sizeof(NRDuringLCP_NonMaster_codes)/ sizeof(NRDuringLCP_NonMaster_codes[0]); for(i = 0; i 0, "getMasterNodeId failed"); while (nodeId == masterNodeId) { nodeId = getRandomNodeId(_restarter); } ndbout << _restart->m_name << " restarting non-master node = " << nodeId << " error code = " << error << endl; int val = DumpStateOrd::CmvmiSetRestartOnErrorInsert; CHECK(_restarter.dumpStateAllNodes(&val, 1) == 0, "failed to set RestartOnErrorInsert"); CHECK(_restarter.insertErrorInNode(nodeId, error) == 0, "failed to set error insert"); CHECK(_restarter.waitNodesNoStart(&nodeId, 1, 300) == 0, "failed to wait no start"); CHECK(_restarter.startNodes(&nodeId, 1) == 0, "failed to start node"); CHECK(_restarter.waitClusterStarted(300) == 0, "waitClusterStarted failed"); { int val = DumpStateOrd::DihMinTimeBetweenLCP; CHECK(_restarter.dumpStateOneNode(nodeId, &val, 1) == 0, "failed to set error insert"); } } return NDBT_OK; } int stopOnError(NdbRestarter& _restarter, const NdbRestarts::NdbRestart* _restart){ myRandom48Init(NdbTick_CurrentMillisecond()); int randomId = myRandom48(_restarter.getNumDbNodes()); int nodeId = _restarter.getDbNodeId(randomId); do { g_info << _restart->m_name << ": node = " << nodeId << endl; CHECK(_restarter.waitClusterStarted(300) == 0, "waitClusterStarted failed"); int val = DumpStateOrd::NdbcntrTestStopOnError; CHECK(_restarter.dumpStateOneNode(nodeId, &val, 1) == 0, "failed to set NdbcntrTestStopOnError"); NdbSleep_SecSleep(3); CHECK(_restarter.waitClusterStarted(300) == 0, "waitClusterStarted failed"); } while (false); return NDBT_OK; }; int getRandomNodeId(NdbRestarter& _restarter) { myRandom48Init(NdbTick_CurrentMillisecond()); int randomId = myRandom48(_restarter.getNumDbNodes()); int nodeId = _restarter.getDbNodeId(randomId); return nodeId; }