Diffstat (limited to 'storage')
 storage/ndb/include/kernel/signaldata/BackupContinueB.hpp |   3
 storage/ndb/include/mgmapi/mgmapi_config_parameters.h     |   4
 storage/ndb/src/kernel/blocks/backup/Backup.cpp           | 175
 storage/ndb/src/kernel/blocks/backup/Backup.hpp           |  19
 storage/ndb/src/kernel/blocks/backup/BackupInit.cpp       |  22
 storage/ndb/src/mgmsrv/ConfigInfo.cpp                     |  54
 storage/ndb/src/mgmsrv/InitConfigFileParser.cpp           |  12
 7 files changed, 242 insertions(+), 47 deletions(-)
diff --git a/storage/ndb/include/kernel/signaldata/BackupContinueB.hpp b/storage/ndb/include/kernel/signaldata/BackupContinueB.hpp
index fe3f48444ec..9035c6f8140 100644
--- a/storage/ndb/include/kernel/signaldata/BackupContinueB.hpp
+++ b/storage/ndb/include/kernel/signaldata/BackupContinueB.hpp
@@ -32,7 +32,8 @@ private:
     BUFFER_FULL_SCAN = 2,
     BUFFER_FULL_FRAG_COMPLETE = 3,
     BUFFER_FULL_META = 4,
-    BACKUP_FRAGMENT_INFO = 5
+    BACKUP_FRAGMENT_INFO = 5,
+    RESET_DISK_SPEED_COUNTER = 6
   };
 };
diff --git a/storage/ndb/include/mgmapi/mgmapi_config_parameters.h b/storage/ndb/include/mgmapi/mgmapi_config_parameters.h
index 78d34b31bbb..d1feaa1a7d3 100644
--- a/storage/ndb/include/mgmapi/mgmapi_config_parameters.h
+++ b/storage/ndb/include/mgmapi/mgmapi_config_parameters.h
@@ -92,6 +92,10 @@
 #define CFG_DB_DISK_PAGE_BUFFER_MEMORY 160
 #define CFG_DB_STRING_MEMORY           161
 
+#define CFG_DB_DISK_SYNCH_SIZE         163
+#define CFG_DB_CHECKPOINT_SPEED        164
+#define CFG_DB_CHECKPOINT_SPEED_SR     165
+
 #define CFG_DB_SGA                     198 /* super pool mem */
 #define CFG_DB_DATA_MEM_2              199 /* used in special build in 5.1 */
diff --git a/storage/ndb/src/kernel/blocks/backup/Backup.cpp b/storage/ndb/src/kernel/blocks/backup/Backup.cpp
index 2e8d8b548ce..d170b3f5a6a 100644
--- a/storage/ndb/src/kernel/blocks/backup/Backup.cpp
+++ b/storage/ndb/src/kernel/blocks/backup/Backup.cpp
@@ -84,6 +84,16 @@ Backup::execSTTOR(Signal* signal)
   const Uint32 startphase  = signal->theData[1];
   const Uint32 typeOfStart = signal->theData[7];
 
+  if (startphase == 1)
+  {
+    m_curr_disk_write_speed = c_defaults.m_disk_write_speed_sr;
+    m_overflow_disk_write = 0;
+    m_reset_disk_speed_time = NdbTick_CurrentMillisecond();
+    m_reset_delay_used = Backup::DISK_SPEED_CHECK_DELAY;
+    signal->theData[0] = BackupContinueB::RESET_DISK_SPEED_COUNTER;
+    sendSignalWithDelay(BACKUP_REF, GSN_CONTINUEB, signal,
+                        Backup::DISK_SPEED_CHECK_DELAY, 1);
+  }
   if (startphase == 3) {
     jam();
     g_TypeOfStart = typeOfStart;
@@ -92,6 +102,11 @@ Backup::execSTTOR(Signal* signal)
     return;
   }//if
 
+  if (startphase == 7)
+  {
+    m_curr_disk_write_speed = c_defaults.m_disk_write_speed;
+  }
+
   if(startphase == 7 && g_TypeOfStart == NodeState::ST_INITIAL_START &&
      c_masterNodeId == getOwnNodeId()){
     jam();
@@ -170,6 +185,42 @@ Backup::execCONTINUEB(Signal* signal)
   const Uint32 Tdata2 = signal->theData[2];
   
   switch(Tdata0) {
+  case BackupContinueB::RESET_DISK_SPEED_COUNTER:
+  {
+    /*
+      Adjust for up to a 10 millisecond delay of this signal. Longer
+      delays will not be handled; in that case the system is most
+      likely under too high a load and it won't matter very much that
+      we decrease the speed of checkpoints.
+
+      We use a technique where we allow an overflow write in one
+      period. This overflow will be removed from the next period
+      such that the load will on average be as specified.
+    */
+    int delay_time = m_reset_delay_used;
+    NDB_TICKS curr_time = NdbTick_CurrentMillisecond();
+    int sig_delay = curr_time - m_reset_disk_speed_time;
+
+    m_words_written_this_period = m_overflow_disk_write;
+    m_overflow_disk_write = 0;
+    m_reset_disk_speed_time = curr_time;
+
+    if (sig_delay > delay_time + 10)
+      delay_time = Backup::DISK_SPEED_CHECK_DELAY - 10;
+    else if (sig_delay < delay_time - 10)
+      delay_time = Backup::DISK_SPEED_CHECK_DELAY + 10;
+    else
+      delay_time = Backup::DISK_SPEED_CHECK_DELAY - (sig_delay - delay_time);
+    m_reset_delay_used= delay_time;
+    signal->theData[0] = BackupContinueB::RESET_DISK_SPEED_COUNTER;
+    sendSignalWithDelay(BACKUP_REF, GSN_CONTINUEB, signal, delay_time, 1);
+#if 0
+    ndbout << "Signal delay was = " << sig_delay;
+    ndbout << " Current time = " << curr_time << endl;
+    ndbout << " Delay time will be = " << delay_time << endl << endl;
+#endif
+    break;
+  }
   case BackupContinueB::BACKUP_FRAGMENT_INFO:
   {
     const Uint32 ptr_I = Tdata1;
@@ -202,8 +253,8 @@ Backup::execCONTINUEB(Signal* signal)
     fragInfo->FragmentNo = htonl(fragPtr_I);
     fragInfo->NoOfRecordsLow = htonl(fragPtr.p->noOfRecords & 0xFFFFFFFF);
     fragInfo->NoOfRecordsHigh = htonl(fragPtr.p->noOfRecords >> 32);
-    fragInfo->FilePosLow = htonl(0 & 0xFFFFFFFF);
-    fragInfo->FilePosHigh = htonl(0 >> 32);
+    fragInfo->FilePosLow = htonl(0);
+    fragInfo->FilePosHigh = htonl(0);
 
     filePtr.p->operation.dataBuffer.updateWritePtr(sz);
 
@@ -938,7 +989,7 @@ Backup::execBACKUP_REQ(Signal* signal)
     return;
   }//if
 
-  if (m_diskless)
+  if (c_defaults.m_diskless)
   {
     sendBackupRef(senderRef, flags, signal, senderData,
                   BackupRef::CannotBackupDiskless);
@@ -2610,9 +2661,10 @@ Backup::openFiles(Signal* signal, BackupRecordPtr ptr)
     FsOpenReq::OM_WRITEONLY | 
     FsOpenReq::OM_TRUNCATE |
     FsOpenReq::OM_CREATE | 
-    FsOpenReq::OM_APPEND;
+    FsOpenReq::OM_APPEND |
+    FsOpenReq::OM_AUTOSYNC;
   FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
-  
+  req->auto_sync_size = c_defaults.m_disk_synch_size;
   /**
    * Ctl file
    */
@@ -3881,6 +3933,69 @@ Backup::execFSAPPENDCONF(Signal* signal)
   checkFile(signal, filePtr);
 }
 
+/*
+  This routine handles two problems with writing to disk during local
+  checkpoints and backups. The first problem is that we need to limit
+  the writing to ensure that we don't use too much CPU and disk resources
+  for backups and checkpoints. The perfect solution to this is to use
+  a dynamic algorithm that adapts to the environment. Until we have
+  implemented this we can satisfy ourselves with an algorithm that
+  uses a configurable limit.
+
+  The second problem is that on Linux we can get severe problems if we
+  write very much to the disk without syncing. In the worst case we
+  can have gigabytes of data in the Linux page cache before we reach
+  the limit of how much we can write. If this happens the performance
+  will drop significantly when we reach this limit since the Linux flush
+  daemon will spend a few minutes on writing out the page cache to disk.
+  To avoid this we ensure that a file never has more than a certain
+  amount of data outstanding before a sync. This amount is also
+  configurable.
+*/
+bool
+Backup::ready_to_write(bool ready, Uint32 sz, bool eof, BackupFile *fileP)
+{
+#if 0
+  ndbout << "ready_to_write: ready = " << ready << " eof = " << eof;
+  ndbout << " sz = " << sz << endl;
+  ndbout << "words this period = " << m_words_written_this_period;
+  ndbout << endl << "overflow disk write = " << m_overflow_disk_write;
+  ndbout << endl << "Current Millisecond is = ";
+  ndbout << NdbTick_CurrentMillisecond() << endl;
+#endif
+  if ((ready || eof) &&
+      m_words_written_this_period <= m_curr_disk_write_speed)
+  {
+    /*
+      We have a buffer ready to write, or we have reached end of
+      file and thus must write the last data before closing the
+      file.
+      We have already checked that we are allowed to write at this
+      moment. We only worry about the history of the last 100
+      milliseconds. What happened before that is of no interest since
+      a disk write that was issued more than 100 milliseconds ago
+      should be completed by now.
+    */
+    int overflow;
+    m_words_written_this_period += sz;
+    overflow = m_words_written_this_period - m_curr_disk_write_speed;
+    if (overflow > 0)
+      m_overflow_disk_write = overflow;
+#if 0
+    ndbout << "Will write with " << endl;
+    ndbout << endl;
+#endif
+    return true;
+  }
+  else
+  {
+#if 0
+    ndbout << "Will not write now" << endl << endl;
+#endif
+    return false;
+  }
+}
+
 void
 Backup::checkFile(Signal* signal, BackupFilePtr filePtr)
 {
@@ -3890,35 +4005,23 @@ Backup::checkFile(Signal* signal, BackupFilePtr filePtr)
 #endif
 
   OperationRecord & op = filePtr.p->operation;
-  
-  Uint32 * tmp, sz; bool eof;
-  if(op.dataBuffer.getReadPtr(&tmp, &sz, &eof))
+  Uint32 *tmp = NULL;
+  Uint32 sz = 0;
+  bool eof = FALSE;
+  bool ready = op.dataBuffer.getReadPtr(&tmp, &sz, &eof);
+#if 0
+  ndbout << "Ptr to data = " << hex << tmp << endl;
+#endif
+  if (!ready_to_write(ready, sz, eof, filePtr.p))
   {
     jam();
-
-    jam();
-    FsAppendReq * req = (FsAppendReq *)signal->getDataPtrSend();
-    req->filePointer   = filePtr.p->filePointer;
-    req->userPointer   = filePtr.i;
-    req->userReference = reference();
-    req->varIndex      = 0;
-    req->offset        = tmp - c_startOfPages;
-    req->size          = sz;
-
-    sendSignal(NDBFS_REF, GSN_FSAPPENDREQ, signal,
-               FsAppendReq::SignalLength, JBA);
-    return;
-  }
-
-  if(!eof) {
-    jam();
     signal->theData[0] = BackupContinueB::BUFFER_UNDERFLOW;
     signal->theData[1] = filePtr.i;
-    sendSignalWithDelay(BACKUP_REF, GSN_CONTINUEB, signal, 50, 2);
+    sendSignalWithDelay(BACKUP_REF, GSN_CONTINUEB, signal, 20, 2);
     return;
-  }//if
-
-  if(sz > 0) {
+  }
+  else if (sz > 0)
+  {
     jam();
     FsAppendReq * req = (FsAppendReq *)signal->getDataPtrSend();
     req->filePointer   = filePtr.p->filePointer;
@@ -3926,13 +4029,14 @@ Backup::checkFile(Signal* signal, BackupFilePtr filePtr)
   req->userReference = reference();
   req->varIndex      = 0;
   req->offset        = tmp - c_startOfPages;
-  req->size          = sz; // Round up
+  req->size          = sz;
+  req->synch_flag    = 0;
 
   sendSignal(NDBFS_REF, GSN_FSAPPENDREQ, signal,
              FsAppendReq::SignalLength, JBA);
   return;
-  }//if
-
+  }
+
 #ifdef DEBUG_ABORT
   Uint32 running= filePtr.p->fileRunning;
   Uint32 closing= filePtr.p->fileClosing;
@@ -4214,16 +4318,15 @@ Backup::closeFiles(Signal* sig, BackupRecordPtr ptr)
       continue;
     }//if
 
+    filePtr.p->operation.dataBuffer.eof();
     if(filePtr.p->fileRunning == 1){
       jam();
 #ifdef DEBUG_ABORT
       ndbout_c("Close files fileRunning == 1, filePtr.i=%u", filePtr.i);
 #endif
-      filePtr.p->operation.dataBuffer.eof();
     } else {
       jam();
       filePtr.p->fileClosing = 1;
-      filePtr.p->operation.dataBuffer.eof();
       checkFile(sig, filePtr); // make sure we write everything before closing
 
       FsCloseReq * req = (FsCloseReq *)sig->getDataPtrSend();
@@ -4712,8 +4815,10 @@ Backup::lcp_open_file(Signal* signal, BackupRecordPtr ptr)
     FsOpenReq::OM_WRITEONLY | 
     FsOpenReq::OM_TRUNCATE |
     FsOpenReq::OM_CREATE | 
-    FsOpenReq::OM_APPEND;
+    FsOpenReq::OM_APPEND |
+    FsOpenReq::OM_AUTOSYNC;
   FsOpenReq::v2_setCount(req->fileNumber, 0xFFFFFFFF);
+  req->auto_sync_size = c_defaults.m_disk_synch_size;
   
   TablePtr tabPtr;
   FragmentPtr fragPtr;
diff --git a/storage/ndb/src/kernel/blocks/backup/Backup.hpp b/storage/ndb/src/kernel/blocks/backup/Backup.hpp
index afacf01ab2f..4f54918ed9d 100644
--- a/storage/ndb/src/kernel/blocks/backup/Backup.hpp
+++ b/storage/ndb/src/kernel/blocks/backup/Backup.hpp
@@ -33,6 +33,7 @@
 #include <blocks/mutexes.hpp>
 
 #include <NdbTCP.h>
+#include <NdbTick.h>
 #include <Array.hpp>
 
 /**
@@ -522,6 +523,11 @@ public:
     Uint32 m_minWriteSize;
     Uint32 m_maxWriteSize;
     Uint32 m_lcp_buffer_size;
+
+    Uint32 m_disk_write_speed_sr;
+    Uint32 m_disk_write_speed;
+    Uint32 m_disk_synch_size;
+    Uint32 m_diskless;
   };
   
   /**
@@ -533,8 +539,17 @@ public:
   NdbNodeBitmask c_aliveNodes;
   DLList<BackupRecord> c_backups;
   Config c_defaults;
-  Uint32 m_diskless;
 
+  /*
+    Variables that control checkpoint to disk speed
+  */
+  Uint32 m_curr_disk_write_speed;
+  Uint32 m_words_written_this_period;
+  Uint32 m_overflow_disk_write;
+  Uint32 m_reset_delay_used;
+  NDB_TICKS m_reset_disk_speed_time;
+  static const int DISK_SPEED_CHECK_DELAY = 100;
+
   STATIC_CONST(NO_OF_PAGES_META_FILE = MAX_WORDS_META_FILE/BACKUP_WORDS_PER_PAGE);
 
   /**
@@ -631,6 +646,8 @@ public:
   void lcp_open_file_done(Signal*, BackupRecordPtr);
   void lcp_close_file_conf(Signal* signal, BackupRecordPtr);
   void lcp_send_end_lcp_conf(Signal* signal, BackupRecordPtr);
+
+  bool ready_to_write(bool ready, Uint32 sz, bool eof, BackupFile *fileP);
 };
 
 inline
diff --git a/storage/ndb/src/kernel/blocks/backup/BackupInit.cpp b/storage/ndb/src/kernel/blocks/backup/BackupInit.cpp
index 38a60ac04d6..4cbe0c32a29 100644
--- a/storage/ndb/src/kernel/blocks/backup/BackupInit.cpp
+++ b/storage/ndb/src/kernel/blocks/backup/BackupInit.cpp
@@ -146,8 +146,28 @@ Backup::execREAD_CONFIG_REQ(Signal* signal)
     m_ctx.m_config.getOwnConfigIterator();
   ndbrequire(p != 0);
 
+  c_defaults.m_disk_write_speed = 10 * (1024 * 1024);
+  c_defaults.m_disk_write_speed_sr = 100 * (1024 * 1024);
+  c_defaults.m_disk_synch_size = 4 * (1024 * 1024);
+
   Uint32 noBackups = 0, noTables = 0, noAttribs = 0, noFrags = 0;
-  ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS, &m_diskless));
+  ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DB_DISCLESS,
+                                        &c_defaults.m_diskless));
+  ndb_mgm_get_int_parameter(p, CFG_DB_CHECKPOINT_SPEED_SR,
+                            &c_defaults.m_disk_write_speed_sr);
+  ndb_mgm_get_int_parameter(p, CFG_DB_CHECKPOINT_SPEED,
+                            &c_defaults.m_disk_write_speed);
+  ndb_mgm_get_int_parameter(p, CFG_DB_DISK_SYNCH_SIZE,
+                            &c_defaults.m_disk_synch_size);
+
+  /*
+    We adjust the disk speed parameters from bytes per second to
+    words per 100 milliseconds. We convert the disk synch size from
+    bytes per second to words per second.
+  */
+  c_defaults.m_disk_write_speed /= (4 * 10);
+  c_defaults.m_disk_write_speed_sr /= (4 * 10);
+
   ndb_mgm_get_int_parameter(p, CFG_DB_PARALLEL_BACKUPS, &noBackups);
   //  ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DB_NO_TABLES, &noTables));
   ndbrequire(!ndb_mgm_get_int_parameter(p, CFG_DICT_TABLE, &noTables));
diff --git a/storage/ndb/src/mgmsrv/ConfigInfo.cpp b/storage/ndb/src/mgmsrv/ConfigInfo.cpp
index 6c172a29819..fb15e35ecc7 100644
--- a/storage/ndb/src/mgmsrv/ConfigInfo.cpp
+++ b/storage/ndb/src/mgmsrv/ConfigInfo.cpp
@@ -877,7 +877,7 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
   ConfigInfo::CI_USED,
   false,
   ConfigInfo::CI_INT,
-  "8",
+  "16",
   "3",
   STR_VALUE(MAX_INT_RNIL) },
 
@@ -952,8 +952,8 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
   CFG_DB_LCP_DISC_PAGES_TUP_SR,
   "NoOfDiskPagesToDiskDuringRestartTUP",
   DB_TOKEN,
-  "?",
-  ConfigInfo::CI_USED,
+  "DiskCheckpointSpeedSr",
+  ConfigInfo::CI_DEPRICATED,
   true,
   ConfigInfo::CI_INT,
   "40",
@@ -964,8 +964,8 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
   CFG_DB_LCP_DISC_PAGES_TUP,
   "NoOfDiskPagesToDiskAfterRestartTUP",
   DB_TOKEN,
-  "?",
-  ConfigInfo::CI_USED,
+  "DiskCheckpointSpeed",
+  ConfigInfo::CI_DEPRICATED,
   true,
   ConfigInfo::CI_INT,
   "40",
@@ -976,8 +976,8 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
   CFG_DB_LCP_DISC_PAGES_ACC_SR,
   "NoOfDiskPagesToDiskDuringRestartACC",
   DB_TOKEN,
-  "?",
-  ConfigInfo::CI_USED,
+  "DiskCheckpointSpeedSr",
+  ConfigInfo::CI_DEPRICATED,
   true,
   ConfigInfo::CI_INT,
   "20",
@@ -988,8 +988,8 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
   CFG_DB_LCP_DISC_PAGES_ACC,
   "NoOfDiskPagesToDiskAfterRestartACC",
   DB_TOKEN,
-  "?",
-  ConfigInfo::CI_USED,
+  "DiskCheckpointSpeed",
+  ConfigInfo::CI_DEPRICATED,
   true,
   ConfigInfo::CI_INT,
   "20",
@@ -1192,6 +1192,42 @@ const ConfigInfo::ParamInfo ConfigInfo::m_ParamInfo[] = {
     0, 0 },
 
   {
+    CFG_DB_DISK_SYNCH_SIZE,
+    "DiskSyncSize",
+    DB_TOKEN,
+    "Data written to a file before a synch is forced",
+    ConfigInfo::CI_USED,
+    false,
+    ConfigInfo::CI_INT,
+    "4M",
+    "32k",
+    STR_VALUE(MAX_INT_RNIL) },
+
+  {
+    CFG_DB_CHECKPOINT_SPEED,
+    "DiskCheckpointSpeed",
+    DB_TOKEN,
+    "Bytes per second allowed to be written by checkpoint",
+    ConfigInfo::CI_USED,
+    false,
+    ConfigInfo::CI_INT,
+    "10M",
+    "1M",
+    STR_VALUE(MAX_INT_RNIL) },
+
+  {
+    CFG_DB_CHECKPOINT_SPEED_SR,
+    "DiskCheckpointSpeedInRestart",
+    DB_TOKEN,
+    "Bytes per second allowed to be written by checkpoint during restart",
+    ConfigInfo::CI_USED,
+    false,
+    ConfigInfo::CI_INT,
+    "100M",
+    "1M",
+    STR_VALUE(MAX_INT_RNIL) },
+
+  {
     CFG_DB_BACKUP_MEM,
     "BackupMemory",
     DB_TOKEN,
diff --git a/storage/ndb/src/mgmsrv/InitConfigFileParser.cpp b/storage/ndb/src/mgmsrv/InitConfigFileParser.cpp
index 68a5f02f4c5..bf5cb9d726e 100644
--- a/storage/ndb/src/mgmsrv/InitConfigFileParser.cpp
+++ b/storage/ndb/src/mgmsrv/InitConfigFileParser.cpp
@@ -655,6 +655,18 @@ InitConfigFileParser::store_in_properties(Vector<struct my_option>& options,
                           m_info->getMax(ctx.m_currentInfo, fname));
         return false;
       }
+
+      ConfigInfo::Status status = m_info->getStatus(ctx.m_currentInfo, fname);
+      if (status == ConfigInfo::CI_DEPRICATED) {
+        const char * desc = m_info->getDescription(ctx.m_currentInfo, fname);
+        if(desc && desc[0]){
+          ctx.reportWarning("[%s] %s is depricated, use %s instead",
+                            ctx.fname, fname, desc);
+        } else if (desc == 0){
+          ctx.reportWarning("[%s] %s is depricated", ctx.fname, fname);
+        }
+      }
+
       if (options[i].var_type == GET_INT)
         ctx.m_currentSection->put(options[i].name, (Uint32)value_int);
       else
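
The core of the patch is the write-speed quota in Backup.cpp: m_words_written_this_period counts the words handed to NDBFS in the current 100 ms period, and ready_to_write() only checks the counter against m_curr_disk_write_speed before adding, so a single buffer may overshoot the quota; the overshoot is saved in m_overflow_disk_write. The RESET_DISK_SPEED_COUNTER CONTINUEB signal then starts the next period pre-charged with that overflow, so the long-run average stays at the configured rate, and it also nudges its own delay by up to 10 ms against the measured scheduling jitter so the periods average out to DISK_SPEED_CHECK_DELAY. Below is a minimal standalone sketch of the carry-over logic only; the WriteQuota type, the numbers and the main() driver are invented for illustration and are not part of the NDB code.

#include <cstdint>
#include <iostream>

// Simplified stand-in for the per-period quota kept by the Backup block.
struct WriteQuota {
  uint32_t quota_per_period;     // words allowed per 100 ms period
  uint32_t written_this_period;  // words handed to the file system this period
  uint32_t overflow;             // overshoot carried into the next period

  explicit WriteQuota(uint32_t quota)
    : quota_per_period(quota), written_this_period(0), overflow(0) {}

  // Mirrors the shape of Backup::ready_to_write(): the check happens *before*
  // the write is counted, so one request may overshoot; the overshoot is
  // remembered and charged to the next period.
  bool ready_to_write(bool ready, uint32_t sz, bool eof) {
    if ((ready || eof) && written_this_period <= quota_per_period) {
      written_this_period += sz;
      if (written_this_period > quota_per_period)
        overflow = written_this_period - quota_per_period;
      return true;
    }
    return false;
  }

  // Corresponds to the RESET_DISK_SPEED_COUNTER signal fired every ~100 ms:
  // the new period starts pre-charged with last period's overflow.
  void reset_period() {
    written_this_period = overflow;
    overflow = 0;
  }
};

int main() {
  WriteQuota q(262144);  // 10 MB/s => 10*1024*1024 / 4 / 10 = 262144 words per 100 ms
  std::cout << q.ready_to_write(true, 100000, false) << "\n";  // 1: within quota
  std::cout << q.ready_to_write(true, 200000, false) << "\n";  // 1: allowed, overshoots by 37856 words
  std::cout << q.ready_to_write(true, 100000, false) << "\n";  // 0: quota used up for this period
  q.reset_period();  // next period starts with the 37856-word overflow already counted
  return 0;
}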
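
On the unit conversion in BackupInit.cpp: the configuration values are given in bytes per second, while the quota above is kept in 32-bit words per 100 ms period, hence the division by (4 * 10). With the default DiskCheckpointSpeed of 10M this works out to 10 * 1024 * 1024 / 40 = 262144 words, i.e. about 1 MB of checkpoint data allowed per 100 ms period; with DiskCheckpointSpeedInRestart of 100M the restart-time quota is ten times that.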
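
The OM_AUTOSYNC open flag and req->auto_sync_size address the second problem described in the comment above ready_to_write(): without periodic syncs, Linux may buffer a very large amount of dirty data for a backup or LCP file and then stall while the flush daemon writes it out, so NDBFS is asked to force a sync once roughly DiskSyncSize bytes have been appended without one. The sketch below only illustrates that idea, assuming plain POSIX I/O; the AutoSyncFile class and its interface are invented here, and the real bookkeeping lives inside the NDB file system block.

#include <unistd.h>
#include <cstddef>

// Illustrative "sync after N bytes" wrapper around an already-open descriptor.
class AutoSyncFile {
  int    fd;               // open file descriptor for the backup/LCP file
  size_t auto_sync_size;   // e.g. 4 MB, corresponding to DiskSyncSize
  size_t unsynced;         // bytes written since the last sync
public:
  AutoSyncFile(int fd_, size_t sync_size)
    : fd(fd_), auto_sync_size(sync_size), unsynced(0) {}

  ssize_t append(const void *buf, size_t len) {
    ssize_t n = ::write(fd, buf, len);
    if (n > 0) {
      unsynced += static_cast<size_t>(n);
      if (unsynced >= auto_sync_size) {
        ::fsync(fd);       // bound the dirty page cache the OS can accumulate
        unsynced = 0;
      }
    }
    return n;
  }
};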
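
The three new parameters are ordinary data-node settings in config.ini, and the four old NoOfDiskPagesToDisk* parameters are kept only as deprecated aliases: for a CI_DEPRICATED parameter, InitConfigFileParser now prints a warning naming the replacement, which is carried in the parameter's description field. The fragment below is only an illustration using the defaults from ConfigInfo.cpp; the [NDBD DEFAULT] section name follows the usual management-server convention and is not taken from this diff.

[NDBD DEFAULT]
# Force a sync once this much unsynced data has been written to a
# checkpoint/backup file
DiskSyncSize=4M
# Checkpoint/backup write rate during normal operation (bytes per second)
DiskCheckpointSpeed=10M
# Checkpoint write rate during a node or system restart (bytes per second)
DiskCheckpointSpeedInRestart=100M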