From 40f44b48b0f44f9eb7f13dc7c2e28e56ad805634 Mon Sep 17 00:00:00 2001 From: unknown Date: Thu, 8 Jun 2006 16:16:07 +0200 Subject: ndb - bug#18781 lock DICT during node restart ndb/src/kernel/main.cpp: signal log from start (#if 0-ed) ndb/test/ndbapi/testDict.cpp: test NF/NR + dict ops ndb/src/kernel/vm/DLFifoList.hpp: add hasPrev ndb/src/kernel/vm/pc.hpp: ERROR_INSERTED_CLEAR(x) test and clear if set ndb/src/common/debugger/SignalLoggerManager.cpp: block no fix ndb/src/kernel/blocks/qmgr/QmgrMain.cpp: spelling ndb/include/kernel/GlobalSignalNumbers.h: locking of master DICT against schema ops, used by slave DIH under NR ndb/include/kernel/signaldata/AlterTable.hpp: locking of master DICT against schema ops, used by slave DIH under NR ndb/include/kernel/signaldata/CreateTable.hpp: locking of master DICT against schema ops, used by slave DIH under NR ndb/include/kernel/signaldata/DictLock.hpp: locking of master DICT against schema ops, used by slave DIH under NR ndb/include/kernel/signaldata/DropTable.hpp: locking of master DICT against schema ops, used by slave DIH under NR ndb/src/common/debugger/signaldata/SignalNames.cpp: locking of master DICT against schema ops, used by slave DIH under NR ndb/src/kernel/blocks/ERROR_codes.txt: locking of master DICT against schema ops, used by slave DIH under NR ndb/src/kernel/blocks/dbdict/Dbdict.cpp: locking of master DICT against schema ops, used by slave DIH under NR ndb/src/kernel/blocks/dbdict/Dbdict.hpp: locking of master DICT against schema ops, used by slave DIH under NR ndb/src/kernel/blocks/dbdih/Dbdih.hpp: locking of master DICT against schema ops, used by slave DIH under NR ndb/src/kernel/blocks/dbdih/DbdihInit.cpp: locking of master DICT against schema ops, used by slave DIH under NR ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: locking of master DICT against schema ops, used by slave DIH under NR ndb/src/ndbapi/ndberror.c: locking of master DICT against schema ops, used by slave DIH under NR --- ndb/test/ndbapi/testDict.cpp | 304 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 304 insertions(+) (limited to 'ndb/test') diff --git a/ndb/test/ndbapi/testDict.cpp b/ndb/test/ndbapi/testDict.cpp index 710a47bf3dc..2cfb78f143e 100644 --- a/ndb/test/ndbapi/testDict.cpp +++ b/ndb/test/ndbapi/testDict.cpp @@ -1551,6 +1551,282 @@ end: return result; } +// NFNR + +// Restarter controls dict ops : 1-run 2-pause 3-stop +// synced by polling... + +static bool +send_dict_ops_cmd(NDBT_Context* ctx, Uint32 cmd) +{ + ctx->setProperty("DictOps_CMD", cmd); + while (1) { + if (ctx->isTestStopped()) + return false; + if (ctx->getProperty("DictOps_ACK") == cmd) + break; + NdbSleep_MilliSleep(100); + } + return true; +} + +static bool +recv_dict_ops_run(NDBT_Context* ctx) +{ + while (1) { + if (ctx->isTestStopped()) + return false; + Uint32 cmd = ctx->getProperty("DictOps_CMD"); + ctx->setProperty("DictOps_ACK", cmd); + if (cmd == 1) + break; + if (cmd == 3) + return false; + NdbSleep_MilliSleep(100); + } + return true; +} + +int +runRestarts(NDBT_Context* ctx, NDBT_Step* step) +{ + static int err_master[] = { // non-crashing + 0, + 7174 // send one fake START_PERMREF + }; + static int err_node[] = { + 0, + 7121, // crash on START_PERMCONF + 7130 // crash on START_MECONF + }; + const uint err_master_cnt = sizeof(err_master)/sizeof(err_master[0]); + const uint err_node_cnt = sizeof(err_node)/sizeof(err_node[0]); + + myRandom48Init(NdbTick_CurrentMillisecond()); + NdbRestarter restarter; + int result = NDBT_OK; + const int loops = ctx->getNumLoops(); + + for (int l = 0; l < loops && result == NDBT_OK; l++) { + g_info << "1: === loop " << l << " ===" << endl; + + // assuming 2-way replicated + + int numnodes = restarter.getNumDbNodes(); + CHECK(numnodes >= 1); + if (numnodes == 1) + break; + + int masterNodeId = restarter.getMasterNodeId(); + CHECK(masterNodeId != -1); + + // for more complex cases need more restarter support methods + + int nodeIdList[2] = { 0, 0 }; + int nodeIdCnt = 0; + + if (numnodes >= 2) { + int rand = myRandom48(numnodes); + int nodeId = restarter.getRandomNotMasterNodeId(rand); + CHECK(nodeId != -1); + nodeIdList[nodeIdCnt++] = nodeId; + } + + if (numnodes >= 4) { + int rand = myRandom48(numnodes); + int nodeId = restarter.getRandomNodeOtherNodeGroup(nodeIdList[0], rand); + CHECK(nodeId != -1); + if (nodeId != masterNodeId) + nodeIdList[nodeIdCnt++] = nodeId; + } + + g_info << "1: master=" << masterNodeId << " nodes=" << nodeIdList[0] << "," << nodeIdList[1] << endl; + + const unsigned maxsleep = 2000; //ms + + bool NF_ops = ctx->getProperty("Restart_NF_ops"); + uint NF_type = ctx->getProperty("Restart_NF_type"); + bool NR_ops = ctx->getProperty("Restart_NR_ops"); + bool NR_error = ctx->getProperty("Restart_NR_error"); + + g_info << "1: " << (NF_ops ? "run" : "pause") << " dict ops" << endl; + if (! send_dict_ops_cmd(ctx, NF_ops ? 1 : 2)) + break; + NdbSleep_MilliSleep(myRandom48(maxsleep)); + + { + int i = 0; + while (i < nodeIdCnt) { + int nodeId = nodeIdList[i++]; + + bool nostart = true; + bool abort = NF_type == 0 ? myRandom48(2) : (NF_type == 2); + bool initial = myRandom48(2); + + char flags[40]; + strcpy(flags, "flags: nostart"); + if (abort) + strcat(flags, ",abort"); + if (initial) + strcat(flags, ",initial"); + + g_info << "1: restart " << nodeId << " " << flags << endl; + CHECK(restarter.restartOneDbNode(nodeId, initial, nostart, abort) == 0); + } + } + + g_info << "1: wait for nostart" << endl; + CHECK(restarter.waitNodesNoStart(nodeIdList, nodeIdCnt) == 0); + NdbSleep_MilliSleep(myRandom48(maxsleep)); + + g_info << "1: " << (NR_ops ? "run" : "pause") << " dict ops" << endl; + if (! send_dict_ops_cmd(ctx, NR_ops ? 1 : 2)) + break; + NdbSleep_MilliSleep(myRandom48(maxsleep)); + + g_info << "1: start nodes" << endl; + CHECK(restarter.startNodes(nodeIdList, nodeIdCnt) == 0); + + if (NR_error) { + { + int rand = myRandom48(err_master_cnt); + int err = err_master[rand]; + if (err != 0) { + g_info << "1: insert master error " << err << endl; + CHECK(restarter.insertErrorInNode(masterNodeId, err) == 0); + } + } + + // limitation: cannot have 2 node restarts and crash_insert + // one node may die for real (NF during startup) + + int i = 0; + while (i < nodeIdCnt && nodeIdCnt == 1) { + int nodeId = nodeIdList[i++]; + + int rand = myRandom48(err_node_cnt); + int err = err_node[rand]; + if (err != 0) { + g_info << "1: insert node " << nodeId << " error " << err << endl; + CHECK(restarter.insertErrorInNode(nodeId, err) == 0); + } + } + } + NdbSleep_MilliSleep(myRandom48(maxsleep)); + + g_info << "1: wait cluster started" << endl; + CHECK(restarter.waitClusterStarted() == 0); + NdbSleep_MilliSleep(myRandom48(maxsleep)); + + g_info << "1: restart done" << endl; + } + + g_info << "1: stop dict ops" << endl; + send_dict_ops_cmd(ctx, 3); + + return result; +} + +int +runDictOps(NDBT_Context* ctx, NDBT_Step* step) +{ + myRandom48Init(NdbTick_CurrentMillisecond()); + int result = NDBT_OK; + + for (int l = 0; result == NDBT_OK; l++) { + if (! recv_dict_ops_run(ctx)) + break; + + g_info << "2: === loop " << l << " ===" << endl; + + Ndb* pNdb = GETNDB(step); + NdbDictionary::Dictionary* pDic = pNdb->getDictionary(); + const NdbDictionary::Table* pTab = ctx->getTab(); + const char* tabName = pTab->getName(); + + const unsigned long maxsleep = 100; //ms + + g_info << "2: create table" << endl; + { + uint count = 0; + try_create: + count++; + if (pDic->createTable(*pTab) != 0) { + const NdbError err = pDic->getNdbError(); + if (count == 1) + g_err << "2: " << tabName << ": create failed: " << err << endl; + if (err.code != 711) { + result = NDBT_FAILED; + break; + } + NdbSleep_MilliSleep(myRandom48(maxsleep)); + goto try_create; + } + } + NdbSleep_MilliSleep(myRandom48(maxsleep)); + + g_info << "2: verify create" << endl; + const NdbDictionary::Table* pTab2 = pDic->getTable(tabName); + if (pTab2 == NULL) { + const NdbError err = pDic->getNdbError(); + g_err << "2: " << tabName << ": verify create: " << err << endl; + result = NDBT_FAILED; + break; + } + NdbSleep_MilliSleep(myRandom48(maxsleep)); + + // replace by the Retrieved table + pTab = pTab2; + + int records = myRandom48(ctx->getNumRecords()); + g_info << "2: load " << records << " records" << endl; + HugoTransactions hugoTrans(*pTab); + if (hugoTrans.loadTable(pNdb, records) != 0) { + // XXX get error code from hugo + g_err << "2: " << tabName << ": load failed" << endl; + result = NDBT_FAILED; + break; + } + NdbSleep_MilliSleep(myRandom48(maxsleep)); + + g_info << "2: drop" << endl; + { + uint count = 0; + try_drop: + count++; + if (pDic->dropTable(tabName) != 0) { + const NdbError err = pDic->getNdbError(); + if (count == 1) + g_err << "2: " << tabName << ": drop failed: " << err << endl; + if (err.code != 711) { + result = NDBT_FAILED; + break; + } + NdbSleep_MilliSleep(myRandom48(maxsleep)); + goto try_drop; + } + } + NdbSleep_MilliSleep(myRandom48(maxsleep)); + + g_info << "2: verify drop" << endl; + const NdbDictionary::Table* pTab3 = pDic->getTable(tabName); + if (pTab3 != NULL) { + g_err << "2: " << tabName << ": verify drop: table exists" << endl; + result = NDBT_FAILED; + break; + } + if (pDic->getNdbError().code != 709) { + const NdbError err = pDic->getNdbError(); + g_err << "2: " << tabName << ": verify drop: " << err << endl; + result = NDBT_FAILED; + break; + } + NdbSleep_MilliSleep(myRandom48(maxsleep)); + } + + return result; +} + NDBT_TESTSUITE(testDict); TESTCASE("CreateAndDrop", "Try to create and drop the table loop number of times\n"){ @@ -1655,6 +1931,34 @@ TESTCASE("FailAddFragment", "Fail add fragment or attribute in ACC or TUP or TUX\n"){ INITIALIZER(runFailAddFragment); } +TESTCASE("Restart_NF1", + "DICT ops during node graceful shutdown (not master)"){ + TC_PROPERTY("Restart_NF_ops", 1); + TC_PROPERTY("Restart_NF_type", 1); + STEP(runRestarts); + STEP(runDictOps); +} +TESTCASE("Restart_NF2", + "DICT ops during node shutdown abort (not master)"){ + TC_PROPERTY("Restart_NF_ops", 1); + TC_PROPERTY("Restart_NF_type", 2); + STEP(runRestarts); + STEP(runDictOps); +} +TESTCASE("Restart_NR1", + "DICT ops during node startup (not master)"){ + TC_PROPERTY("Restart_NR_ops", 1); + STEP(runRestarts); + STEP(runDictOps); +} +TESTCASE("Restart_NR2", + "DICT ops during node startup with crash inserts (not master)"){ + TC_PROPERTY("Restart_NR_ops", 1); + TC_PROPERTY("Restart_NR_error", 1); + STEP(runRestarts); + STEP(runDictOps); +} + NDBT_TESTSUITE_END(testDict); int main(int argc, const char** argv){ -- cgit v1.2.1 From a182963cc04b7ff0ca27abe33a075ac06e4e0c2e Mon Sep 17 00:00:00 2001 From: unknown Date: Sun, 11 Jun 2006 20:46:47 +0200 Subject: ndb - bug#18781 (5.0) handle rolling upgrade, minor fixes, logging, docs ndb/src/kernel/blocks/dbdict/DictLock.txt: NR signals ndb/src/kernel/blocks/dbdict/Dbdict.cpp: call removeStaleDictLocks at right place, comment why it works more checks, better logging ndb/src/kernel/blocks/dbdict/Dbdict.hpp: call removeStaleDictLocks at right place, comment why it works more checks, better logging ndb/include/kernel/signaldata/DictLock.hpp: 2 more REFs ndb/include/ndb_version.h.in: DICT LOCK appeared in 5.0.23 ndb/src/kernel/blocks/dbdih/DbdihMain.cpp: DICT LOCK rolling upgrade from version < 5.0.23 ndb/src/kernel/blocks/ERROR_codes.txt: more DICT LOCK related testing ndb/test/ndbapi/testDict.cpp: more DICT LOCK related testing --- ndb/test/ndbapi/testDict.cpp | 67 +++++++++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 25 deletions(-) (limited to 'ndb/test') diff --git a/ndb/test/ndbapi/testDict.cpp b/ndb/test/ndbapi/testDict.cpp index 2cfb78f143e..397f41b3d4e 100644 --- a/ndb/test/ndbapi/testDict.cpp +++ b/ndb/test/ndbapi/testDict.cpp @@ -1590,17 +1590,18 @@ recv_dict_ops_run(NDBT_Context* ctx) int runRestarts(NDBT_Context* ctx, NDBT_Step* step) { - static int err_master[] = { // non-crashing - 0, - 7174 // send one fake START_PERMREF + static int errlst_master[] = { // non-crashing + 7175, // send one fake START_PERMREF + 0 }; - static int err_node[] = { - 0, - 7121, // crash on START_PERMCONF - 7130 // crash on START_MECONF + static int errlst_node[] = { + 7174, // crash before sending DICT_LOCK_REQ + 7176, // pretend master does not support DICT lock + 7121, // crash at receive START_PERMCONF + 0 }; - const uint err_master_cnt = sizeof(err_master)/sizeof(err_master[0]); - const uint err_node_cnt = sizeof(err_node)/sizeof(err_node[0]); + const uint errcnt_master = sizeof(errlst_master)/sizeof(errlst_master[0]); + const uint errcnt_node = sizeof(errlst_node)/sizeof(errlst_node[0]); myRandom48Init(NdbTick_CurrentMillisecond()); NdbRestarter restarter; @@ -1632,7 +1633,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) nodeIdList[nodeIdCnt++] = nodeId; } - if (numnodes >= 4) { + if (numnodes >= 4 && myRandom48(2) == 0) { int rand = myRandom48(numnodes); int nodeId = restarter.getRandomNodeOtherNodeGroup(nodeIdList[0], rand); CHECK(nodeId != -1); @@ -1642,6 +1643,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) g_info << "1: master=" << masterNodeId << " nodes=" << nodeIdList[0] << "," << nodeIdList[1] << endl; + const uint timeout = 60; //secs for node wait const unsigned maxsleep = 2000; //ms bool NF_ops = ctx->getProperty("Restart_NF_ops"); @@ -1655,9 +1657,8 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) NdbSleep_MilliSleep(myRandom48(maxsleep)); { - int i = 0; - while (i < nodeIdCnt) { - int nodeId = nodeIdList[i++]; + for (int i = 0; i < nodeIdCnt; i++) { + int nodeId = nodeIdList[i]; bool nostart = true; bool abort = NF_type == 0 ? myRandom48(2) : (NF_type == 2); @@ -1676,9 +1677,31 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) } g_info << "1: wait for nostart" << endl; - CHECK(restarter.waitNodesNoStart(nodeIdList, nodeIdCnt) == 0); + CHECK(restarter.waitNodesNoStart(nodeIdList, nodeIdCnt, timeout) == 0); NdbSleep_MilliSleep(myRandom48(maxsleep)); + int err_master = 0; + int err_node[2] = { 0, 0 }; + + if (NR_error) { + err_master = errlst_master[l % errcnt_master]; + + // limitation: cannot have 2 node restarts and crash_insert + // one node may die for real (NF during startup) + + for (int i = 0; i < nodeIdCnt && nodeIdCnt == 1; i++) { + err_node[i] = errlst_node[l % errcnt_node]; + + // 7176 - no DICT lock protection + + if (err_node[i] == 7176) { + g_info << "1: no dict ops due to error insert " + << err_node[i] << endl; + NR_ops = false; + } + } + } + g_info << "1: " << (NR_ops ? "run" : "pause") << " dict ops" << endl; if (! send_dict_ops_cmd(ctx, NR_ops ? 1 : 2)) break; @@ -1689,23 +1712,17 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) if (NR_error) { { - int rand = myRandom48(err_master_cnt); - int err = err_master[rand]; + int err = err_master; if (err != 0) { g_info << "1: insert master error " << err << endl; CHECK(restarter.insertErrorInNode(masterNodeId, err) == 0); } } - // limitation: cannot have 2 node restarts and crash_insert - // one node may die for real (NF during startup) + for (int i = 0; i < nodeIdCnt; i++) { + int nodeId = nodeIdList[i]; - int i = 0; - while (i < nodeIdCnt && nodeIdCnt == 1) { - int nodeId = nodeIdList[i++]; - - int rand = myRandom48(err_node_cnt); - int err = err_node[rand]; + int err = err_node[i]; if (err != 0) { g_info << "1: insert node " << nodeId << " error " << err << endl; CHECK(restarter.insertErrorInNode(nodeId, err) == 0); @@ -1715,7 +1732,7 @@ runRestarts(NDBT_Context* ctx, NDBT_Step* step) NdbSleep_MilliSleep(myRandom48(maxsleep)); g_info << "1: wait cluster started" << endl; - CHECK(restarter.waitClusterStarted() == 0); + CHECK(restarter.waitClusterStarted(timeout) == 0); NdbSleep_MilliSleep(myRandom48(maxsleep)); g_info << "1: restart done" << endl; -- cgit v1.2.1