summaryrefslogtreecommitdiff
path: root/ndb
diff options
context:
space:
mode:
authorunknown <jonas@perch.ndb.mysql.com>2006-03-21 14:47:10 +0100
committerunknown <jonas@perch.ndb.mysql.com>2006-03-21 14:47:10 +0100
commit8ed36cb667b675244f55072cefa15fb65ec89ee7 (patch)
treeea6f3eb4ed5449560230a164a6976d3cf855dbd0 /ndb
parent591aedaa2b594fdccca79a09c846cf4b4490f884 (diff)
downloadmariadb-git-8ed36cb667b675244f55072cefa15fb65ec89ee7.tar.gz
ndb - bug#18385
Partial system restart, cannot try to start with a higher GCI than own even if knowing about a higher number ndb/include/kernel/signaldata/DumpStateOrd.hpp: Add new dump for setting time between gcp ndb/include/kernel/signaldata/StartPerm.hpp: Move error codes into StartPerm + Add new error code ndb/src/kernel/blocks/ERROR_codes.txt: Add new error insert ndb/src/kernel/blocks/dbdih/Dbdih.hpp: Move error codes into StartPerm + Add new error code ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: Fix so that we don't try to restart to a too new GCI when doing a partial start Add new error code when this node later tries to join ndb/test/include/NdbRestarter.hpp: Add new method for selecting random node ndb/test/ndbapi/testSystemRestart.cpp: Add new testcase for bug#18385 ndb/test/run-test/daily-basic-tests.txt: Run test in daily-basic ndb/test/src/NdbRestarter.cpp: Add new method for selecting random node
Diffstat (limited to 'ndb')
-rw-r--r--ndb/include/kernel/signaldata/DumpStateOrd.hpp1
-rw-r--r--ndb/include/kernel/signaldata/StartPerm.hpp6
-rw-r--r--ndb/src/kernel/blocks/ERROR_codes.txt2
-rw-r--r--ndb/src/kernel/blocks/dbdih/Dbdih.hpp1
-rw-r--r--ndb/src/kernel/blocks/dbdih/DbdihMain.cpp99
-rw-r--r--ndb/test/include/NdbRestarter.hpp1
-rw-r--r--ndb/test/ndbapi/testSystemRestart.cpp53
-rw-r--r--ndb/test/run-test/daily-basic-tests.txt4
-rw-r--r--ndb/test/src/NdbRestarter.cpp33
9 files changed, 177 insertions, 23 deletions
diff --git a/ndb/include/kernel/signaldata/DumpStateOrd.hpp b/ndb/include/kernel/signaldata/DumpStateOrd.hpp
index 4dd22cf5092..2c824670cef 100644
--- a/ndb/include/kernel/signaldata/DumpStateOrd.hpp
+++ b/ndb/include/kernel/signaldata/DumpStateOrd.hpp
@@ -127,6 +127,7 @@ public:
DihMinTimeBetweenLCP = 7017,
DihMaxTimeBetweenLCP = 7018,
EnableUndoDelayDataWrite = 7080, // DIH+ACC+TUP
+ DihSetTimeBetweenGcp = 7090,
DihStartLcpImmediately = 7099,
// 8000 Suma
// 12000 Tux
diff --git a/ndb/include/kernel/signaldata/StartPerm.hpp b/ndb/include/kernel/signaldata/StartPerm.hpp
index 38be72835a3..63e01ed3868 100644
--- a/ndb/include/kernel/signaldata/StartPerm.hpp
+++ b/ndb/include/kernel/signaldata/StartPerm.hpp
@@ -64,5 +64,11 @@ private:
Uint32 startingNodeId;
Uint32 errorCode;
+
+ enum ErrorCode // values for the errorCode field above
+ {
+ ZNODE_ALREADY_STARTING_ERROR = 305, // master is busy with another node start; the requester retries after a delay
+ InitialStartRequired = 320 // master has lastCompletedGCI == 0 for the node and the start is not an initial node restart
+ };
};
#endif
diff --git a/ndb/src/kernel/blocks/ERROR_codes.txt b/ndb/src/kernel/blocks/ERROR_codes.txt
index 62481837c14..e5576450846 100644
--- a/ndb/src/kernel/blocks/ERROR_codes.txt
+++ b/ndb/src/kernel/blocks/ERROR_codes.txt
@@ -303,6 +303,8 @@ Test Crashes in handling node restarts
7131: Crash when receiving START_COPYREQ in master node
7132: Crash when receiving START_COPYCONF in starting node
+7170: Crash when receiving START_PERMREF (InitialStartRequired)
+
DICT:
6000 Crash during NR when receiving DICTSTARTREQ
6001 Crash during NR when receiving SCHEMA_INFO
diff --git a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
index f74c0f36c4d..78acf1ffd19 100644
--- a/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
+++ b/ndb/src/kernel/blocks/dbdih/Dbdih.hpp
@@ -81,7 +81,6 @@
#define ZWRONG_FAILURE_NUMBER_ERROR 302
#define ZWRONG_START_NODE_ERROR 303
#define ZNO_REPLICA_FOUND_ERROR 304
-#define ZNODE_ALREADY_STARTING_ERROR 305
#define ZNODE_START_DISALLOWED_ERROR 309
// --------------------------------------
diff --git a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
index fab428aadef..eb4ae61a3e4 100644
--- a/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
+++ b/ndb/src/kernel/blocks/dbdih/DbdihMain.cpp
@@ -1420,6 +1420,33 @@ void Dbdih::ndbStartReqLab(Signal* signal, BlockReference ref)
return;
}
+ NodeRecordPtr nodePtr;
+ Uint32 gci = SYSFILE->lastCompletedGCI[getOwnNodeId()];
+ for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++)
+ {
+ jam();
+ ptrAss(nodePtr, nodeRecord);
+ if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci)
+ {
+ jam();
+ /**
+ * Since we're starting (as master) and there
+ * are other nodes with a higher GCI,
+ * their GCIs must be invalidated
+ * and they _must_ do an initial start;
+ * indicate this by setting lastCompletedGCI = 0
+ */
+ SYSFILE->lastCompletedGCI[nodePtr.i] = 0;
+ ndbrequire(nodePtr.p->nodeStatus != NodeRecord::ALIVE);
+ warningEvent("Making filesystem for node %d unusable",
+ nodePtr.i);
+ }
+ }
+ /**
+ * This sets the GCI we will try to restart to
+ */
+ SYSFILE->newestRestorableGCI = gci;
+
ndbrequire(isMaster());
copyGciLab(signal, CopyGCIReq::RESTART); // We have already read the file!
}//Dbdih::ndbStartReqLab()
@@ -1557,7 +1584,7 @@ void Dbdih::execSTART_PERMREF(Signal* signal)
{
jamEntry();
Uint32 errorCode = signal->theData[1];
- if (errorCode == ZNODE_ALREADY_STARTING_ERROR) {
+ if (errorCode == StartPermRef::ZNODE_ALREADY_STARTING_ERROR) {
jam();
/*-----------------------------------------------------------------------*/
// The master was busy adding another node. We will wait for a second and
@@ -1567,6 +1594,20 @@ void Dbdih::execSTART_PERMREF(Signal* signal)
sendSignalWithDelay(reference(), GSN_CONTINUEB, signal, 3000, 1);
return;
}//if
+
+ if (errorCode == StartPermRef::InitialStartRequired)
+ {
+ CRASH_INSERTION(7170);
+ char buf[255];
+ BaseString::snprintf(buf, sizeof(buf),
+ "Cluster requires this node to be started "
+ " with --initial as partial start has been performed"
+ " and this filesystem is unusable");
+ progError(__LINE__,
+ ERR_SR_RESTARTCONFLICT,
+ buf);
+ ndbrequire(false);
+ }
/*------------------------------------------------------------------------*/
// Some node process in another node involving our node was still active. We
// will recover from this by crashing here.
@@ -1657,7 +1698,7 @@ void Dbdih::execSTART_PERMREQ(Signal* signal)
(c_nodeStartMaster.wait != ZFALSE)) {
jam();
signal->theData[0] = nodeId;
- signal->theData[1] = ZNODE_ALREADY_STARTING_ERROR;
+ signal->theData[1] = StartPermRef::ZNODE_ALREADY_STARTING_ERROR;
sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
return;
}//if
@@ -1667,6 +1708,16 @@ void Dbdih::execSTART_PERMREQ(Signal* signal)
ndbrequire(false);
}//if
+ if (SYSFILE->lastCompletedGCI[nodeId] == 0 &&
+ typeStart != NodeState::ST_INITIAL_NODE_RESTART)
+ {
+ jam();
+ signal->theData[0] = nodeId;
+ signal->theData[1] = StartPermRef::InitialStartRequired;
+ sendSignal(retRef, GSN_START_PERMREF, signal, 2, JBB);
+ return;
+ }
+
/*----------------------------------------------------------------------
* WE START THE INCLUSION PROCEDURE
* ---------------------------------------------------------------------*/
@@ -3515,24 +3566,12 @@ void Dbdih::closingGcpLab(Signal* signal, FileRecordPtr filePtr)
/* ------------------------------------------------------------------------- */
void Dbdih::selectMasterCandidateAndSend(Signal* signal)
{
- Uint32 gci = 0;
- Uint32 masterCandidateId = 0;
- NodeRecordPtr nodePtr;
- for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
- jam();
- ptrAss(nodePtr, nodeRecord);
- if (SYSFILE->lastCompletedGCI[nodePtr.i] > gci) {
- jam();
- masterCandidateId = nodePtr.i;
- gci = SYSFILE->lastCompletedGCI[nodePtr.i];
- }//if
- }//for
- ndbrequire(masterCandidateId != 0);
setNodeGroups();
- signal->theData[0] = masterCandidateId;
- signal->theData[1] = gci;
+ signal->theData[0] = getOwnNodeId();
+ signal->theData[1] = SYSFILE->lastCompletedGCI[getOwnNodeId()];
sendSignal(cntrlblockref, GSN_DIH_RESTARTCONF, signal, 2, JBB);
-
+
+ NodeRecordPtr nodePtr;
Uint32 node_groups[MAX_NDB_NODES];
memset(node_groups, 0, sizeof(node_groups));
for (nodePtr.i = 1; nodePtr.i < MAX_NDB_NODES; nodePtr.i++) {
@@ -3550,10 +3589,10 @@ void Dbdih::selectMasterCandidateAndSend(Signal* signal)
if(count != 0 && count != cnoReplicas){
char buf[255];
BaseString::snprintf(buf, sizeof(buf),
- "Illegal configuration change."
- " Initial start needs to be performed "
- " when changing no of replicas (%d != %d)",
- node_groups[nodePtr.i], cnoReplicas);
+ "Illegal configuration change."
+ " Initial start needs to be performed "
+ " when changing no of replicas (%d != %d)",
+ node_groups[nodePtr.i], cnoReplicas);
progError(__LINE__,
ERR_INVALID_CONFIG,
buf);
@@ -13359,6 +13398,22 @@ Dbdih::execDUMP_STATE_ORD(Signal* signal)
c_lcpState.ctimer += (1 << c_lcpState.clcpDelay);
return;
}
+
+ if (dumpState->args[0] == DumpStateOrd::DihSetTimeBetweenGcp)
+ {
+ if (signal->getLength() == 1)
+ {
+ const ndb_mgm_configuration_iterator * p =
+ theConfiguration.getOwnConfigIterator();
+ ndbrequire(p != 0);
+ ndb_mgm_get_int_parameter(p, CFG_DB_GCP_INTERVAL, &cgcpDelay);
+ }
+ else
+ {
+ cgcpDelay = signal->theData[1];
+ }
+ ndbout_c("Setting time between gcp : %d", cgcpDelay);
+ }
}//Dbdih::execDUMP_STATE_ORD()
void
diff --git a/ndb/test/include/NdbRestarter.hpp b/ndb/test/include/NdbRestarter.hpp
index 19a88b4f8ad..3ec92ae786e 100644
--- a/ndb/test/include/NdbRestarter.hpp
+++ b/ndb/test/include/NdbRestarter.hpp
@@ -62,6 +62,7 @@ public:
int dumpStateAllNodes(int * _args, int _num_args);
int getMasterNodeId();
+ int getRandomNodeSameNodeGroup(int nodeId, int randomNumber);
int getRandomNodeOtherNodeGroup(int nodeId, int randomNumber);
int getRandomNotMasterNodeId(int randomNumber);
diff --git a/ndb/test/ndbapi/testSystemRestart.cpp b/ndb/test/ndbapi/testSystemRestart.cpp
index 35016896495..30f7aca9b06 100644
--- a/ndb/test/ndbapi/testSystemRestart.cpp
+++ b/ndb/test/ndbapi/testSystemRestart.cpp
@@ -1051,6 +1051,52 @@ int runSystemRestart9(NDBT_Context* ctx, NDBT_Step* step){
return result;
}
+/**
+ * Test step for bug#18385: start the cluster without one node after a
+ * node restart followed by a restart of all nodes, then verify that the
+ * node that was left out can only rejoin through restartOneDbNode() with
+ * its first flag set.
+ *
+ * Returns NDBT_OK when the cluster has fewer than two nodes or no second
+ * node in the same node group can be found (nothing to test), otherwise
+ * the value of `result` after the CHECK sequence.
+ *
+ * NOTE(review): the boolean arguments of restartOneDbNode()/restartAll()
+ * are presumably (initial, nostart, abort) -- confirm in NdbRestarter.hpp.
+ */
+int runBug18385(NDBT_Context* ctx, NDBT_Step* step){
+  NdbRestarter restarter;
+  const Uint32 nodeCount = restarter.getNumDbNodes();
+  if (nodeCount < 2){
+    g_info << "Bug18385 - Needs atleast 2 nodes to test" << endl;
+    return NDBT_OK;
+  }
+
+  // node1 is restarted on its own first; node2 shares its node group and
+  // is the node excluded from the start further down.
+  int node1 = restarter.getDbNodeId(rand() % nodeCount);
+  int node2 = restarter.getRandomNodeSameNodeGroup(node1, rand());
+
+  if (node1 == -1 || node2 == -1)
+    return NDBT_OK;
+
+  int dump[] = { DumpStateOrd::DihSetTimeBetweenGcp, 300 };
+
+  // CHECK is defined elsewhere in this file; presumably it records the
+  // failure in `result` and leaves the do/while(0) -- verify at its
+  // definition before restructuring this loop.
+  int result = NDBT_OK;
+  do {
+    CHECK(restarter.dumpStateAllNodes(dump, 2) == 0);
+    CHECK(restarter.restartOneDbNode(node1, false, true, false) == 0);
+    NdbSleep_SecSleep(3);
+    CHECK(restarter.restartAll(false, true, false) == 0);
+
+    // Gather every node id except node2. A slot is only kept (cnt is
+    // advanced) when the id written into it is not node2.
+    int nodes[128];
+    Uint32 cnt = 0;
+    for (Uint32 idx = 0; idx < nodeCount; idx++){
+      const int candidate = restarter.getDbNodeId(idx);
+      nodes[cnt] = candidate;
+      if (candidate != node2)
+        cnt++;
+    }
+
+    assert(cnt == nodeCount - 1);
+
+    CHECK(restarter.startNodes(nodes, cnt) == 0);
+    CHECK(restarter.waitNodesStarted(nodes, cnt, 300) == 0);
+
+    // Error insert 7170 is listed in ERROR_codes.txt as "Crash when
+    // receiving START_PERMREF (InitialStartRequired)".
+    CHECK(restarter.insertErrorInNode(node2, 7170) == 0);
+    CHECK(restarter.waitNodesNoStart(&node2, 1) == 0);
+    CHECK(restarter.restartOneDbNode(node2, true, false, true) == 0);
+    CHECK(restarter.waitNodesStarted(&node2, 1) == 0);
+
+  } while(0);
+
+  g_info << "Bug18385 finished" << endl;
+
+  return result;
+}
+
+
int runWaitStarted(NDBT_Context* ctx, NDBT_Step* step){
NdbRestarter restarter;
@@ -1234,6 +1280,13 @@ TESTCASE("SR9",
STEP(runSystemRestart9);
FINALIZER(runClearTable);
}
+TESTCASE("Bug18385",
+ "Perform partition system restart with other nodes with higher GCI"){
+ INITIALIZER(runWaitStarted); // registered first, before the table is cleared
+ INITIALIZER(runClearTable);
+ STEP(runBug18385); // the Bug18385 restart scenario itself
+ FINALIZER(runClearTable); // same clear step as the initializer
+}
NDBT_TESTSUITE_END(testSystemRestart);
int main(int argc, const char** argv){
diff --git a/ndb/test/run-test/daily-basic-tests.txt b/ndb/test/run-test/daily-basic-tests.txt
index 70518f7881d..0533d585a41 100644
--- a/ndb/test/run-test/daily-basic-tests.txt
+++ b/ndb/test/run-test/daily-basic-tests.txt
@@ -454,6 +454,10 @@ max-time: 500
cmd: testNodeRestart
args: -n Bug16772 T1
+max-time: 500
+cmd: testSystemRestart
+args: -n Bug18385 T1
+
# OLD FLEX
max-time: 500
cmd: flexBench
diff --git a/ndb/test/src/NdbRestarter.cpp b/ndb/test/src/NdbRestarter.cpp
index 91c0963feae..2c16a05240d 100644
--- a/ndb/test/src/NdbRestarter.cpp
+++ b/ndb/test/src/NdbRestarter.cpp
@@ -174,6 +174,39 @@ NdbRestarter::getRandomNodeOtherNodeGroup(int nodeId, int rand){
return -1;
}
+/**
+ * Pick a node that is in the same node group as nodeId, never nodeId
+ * itself.
+ *
+ * rand, reduced modulo the number of entries in ndbNodes, selects the slot
+ * where a circular scan over ndbNodes begins; the first matching entry
+ * found from there is returned.
+ *
+ * Returns the matching node id, or -1 when not connected, when getStatus()
+ * returns non-zero, when nodeId is not in ndbNodes (or its node group is
+ * -1), or when no other node shares the group.
+ */
+int
+NdbRestarter::getRandomNodeSameNodeGroup(int nodeId, int rand){
+  if (!isConnected() || getStatus() != 0)
+    return -1;
+
+  // Look up the node group of the reference node (first entry wins).
+  int wantedGroup = -1;
+  for (size_t idx = 0; idx < ndbNodes.size(); idx++){
+    if (ndbNodes[idx].node_id != nodeId)
+      continue;
+    wantedGroup = ndbNodes[idx].node_group;
+    break;
+  }
+  if (wantedGroup == -1)
+    return -1;
+
+  // Circular probe: inspect each slot at most once, starting at rand.
+  rand = rand % ndbNodes.size();
+  for (Uint32 visited = 0; visited < ndbNodes.size(); visited++){
+    if (ndbNodes[rand].node_id != nodeId &&
+        ndbNodes[rand].node_group == wantedGroup)
+      return ndbNodes[rand].node_id;
+    rand = (rand + 1) % ndbNodes.size();
+  }
+
+  return -1;
+}
+
+
int
NdbRestarter::waitClusterStarted(unsigned int _timeout){
return waitClusterState(NDB_MGM_NODE_STATUS_STARTED, _timeout);