diff options
Diffstat (limited to 'storage/maria/ma_checkpoint.c')
-rw-r--r-- | storage/maria/ma_checkpoint.c | 459 |
1 files changed, 459 insertions, 0 deletions
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c new file mode 100644 index 00000000000..83312ce37b8 --- /dev/null +++ b/storage/maria/ma_checkpoint.c @@ -0,0 +1,459 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3071 Maria checkpoint + First version written by Guilhem Bichot on 2006-04-27. + Does not compile yet. +*/ + +/* Here is the implementation of this module */ + +/* + Summary: + - there are asynchronous checkpoints (a writer to the log notices that it's + been a long time since we last checkpoint-ed, so posts a request for a + background thread to do a checkpoint; does not care about the success of the + checkpoint). Then the checkpoint is done by the checkpoint thread, at an + unspecified moment ("later") (==soon, of course). + - there are synchronous checkpoints: a thread requests a checkpoint to + happen now and wants to know when it finishes and if it succeeded; then the + checkpoint is done by that same thread. 
+*/ + +#include "page_cache.h" +#include "least_recently_dirtied.h" +#include "transaction.h" +#include "share.h" +#include "log.h" + +/* LSN value used to report "no valid LSN" (returned by checkpoint_indirect() on error); could also be called LSN_ERROR */ +#define LSN_IMPOSSIBLE ((LSN)0) +#define LSN_MAX ((LSN)ULONGLONG_MAX) /* passed to flush_all_LRD_to_lsn() by FULL checkpoints to flush the whole LRD */ + +/* + this transaction is used for any system work (purge, checkpoint writing + etc), that is, background threads. It will not be declared/initialized here + in the final version. +*/ +st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,...}; + +/* the three variables below are protected by the log's mutex */ +/* + The maximum rec_lsn in the LRD when last checkpoint was run, serves for the + MEDIUM checkpoint. +*/ +LSN max_rec_lsn_at_last_checkpoint= 0; +CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE; /* level of the asynchronous checkpoint waiting to run or running; reset to NONE by execute_asynchronous_checkpoint_if_any() when done */ +CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE; /* level of the synchronous checkpoint being run by execute_synchronous_checkpoint(); NONE otherwise */ + +/* + Used by MySQL client threads requesting a checkpoint (like "ALTER MARIA + ENGINE DO CHECKPOINT"), and probably by maria_panic(), and at the end of the + UNDO recovery phase. 
+*/ +my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level) +{ + DBUG_ENTER("execute_synchronous_checkpoint"); + DBUG_ASSERT(level > NONE); + + lock(log_mutex); + while ((synchronous_checkpoint_in_progress != NONE) || + (next_asynchronous_checkpoint_to_do != NONE)) + wait_on_checkpoint_done_cond(); + + synchronous_checkpoint_in_progress= level; + execute_checkpoint(level); + safemutex_assert_owner(log_mutex); + synchronous_checkpoint_in_progress= NONE; + unlock(log_mutex); + broadcast(checkpoint_done_cond); +} + +/* Picks a checkpoint request, if there is one, and executes it */ +my_bool execute_asynchronous_checkpoint_if_any() +{ + CHECKPOINT_LEVEL level; + DBUG_ENTER("execute_asynchronous_checkpoint"); + + lock(log_mutex); + if (likely(next_asynchronous_checkpoint_to_do == NONE)) + { + unlock(log_mutex); + DBUG_RETURN(FALSE); + } + + while (synchronous_checkpoint_in_progress) + wait_on_checkpoint_done_cond(); + +do_checkpoint: + level= next_asynchronous_checkpoint_to_do; + DBUG_ASSERT(level > NONE); + execute_checkpoint(level); + safemutex_assert_owner(log_mutex); + if (next_asynchronous_checkpoint_to_do > level) + goto do_checkpoint; /* one more request was posted */ + else + { + DBUG_ASSERT(next_asynchronous_checkpoint_to_do == level); + next_asynchronous_checkpoint_to_do= NONE; /* all work done */ + } + unlock(log_mutex); + broadcast(checkpoint_done_cond); +} + + +/* + Does the actual checkpointing. Called by + execute_synchronous_checkpoint() and + execute_asynchronous_checkpoint_if_any(). 
+*/ +my_bool execute_checkpoint(CHECKPOINT_LEVEL level) +{ + LSN candidate_max_rec_lsn_at_last_checkpoint; + /* to avoid { lock + no-op + unlock } in the common (==indirect) case */ + my_bool need_log_mutex; + + DBUG_ENTER("execute_checkpoint"); + + safemutex_assert_owner(log_mutex); + copy_of_max_rec_lsn_at_last_checkpoint= max_rec_lsn_at_last_checkpoint; + + if (unlikely(need_log_mutex= (level > INDIRECT))) + { + /* much I/O work to do, release log mutex */ + unlock(log_mutex); + + switch (level) + { + case FULL: + /* flush all pages up to the current end of the LRD */ + flush_all_LRD_to_lsn(LSN_MAX); + /* this will go full speed (normal scheduling, no sleep) */ + break; + case MEDIUM: + /* + flush all pages which were already dirty at last checkpoint: + ensures that recovery will never start from before the next-to-last + checkpoint (two-checkpoint rule). + It is max, not min as the WL says (TODO update WL). + */ + flush_all_LRD_to_lsn(copy_of_max_rec_lsn_at_last_checkpoint); + /* this will go full speed (normal scheduling, no sleep) */ + break; + } + } + + candidate_max_rec_lsn_at_last_checkpoint= checkpoint_indirect(need_log_mutex); + + lock(log_mutex); + /* + this portion cannot be done as a hook in write_log_record() for the + LOGREC_CHECKPOINT type because: + - at that moment we still have not written to the control file so cannot + mark the request as done; this could be solved by writing to the control + file in the hook but that would be an I/O under the log's mutex, bad. + - it would not be nice organisation of code (I tried it :). 
+ */ + if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE) + { + /* checkpoint succeeded */ + maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint; + written_since_last_checkpoint= (my_off_t)0; + DBUG_RETURN(FALSE); + } + /* + keep mutex locked because callers will want to clear mutex-protected + status variables + */ + DBUG_RETURN(TRUE); +} + + +/* + Does an indirect checpoint (collects data from data structures, writes into + a checkpoint log record). + Returns the largest LSN of the LRD when the checkpoint happened (this is a + fuzzy definition), or LSN_IMPOSSIBLE on error. That LSN is used for the + "two-checkpoint rule" (MEDIUM checkpoints). +*/ +LSN checkpoint_indirect(my_bool need_log_mutex) +{ + DBUG_ENTER("checkpoint_indirect"); + + int error= 0; + /* checkpoint record data: */ + LSN checkpoint_start_lsn; + LEX_STRING string1={0,0}, string2={0,0}, string3={0,0}; + LEX_STRING *string_array[4]; + char *ptr; + LSN checkpoint_lsn; + LSN candidate_max_rec_lsn_at_last_checkpoint= 0; + list_element *el; /* to scan lists */ + ulong stored_LRD_size= 0; + + + DBUG_ASSERT(sizeof(byte *) <= 8); + DBUG_ASSERT(sizeof(LSN) <= 8); + + if (need_log_mutex) + lock(log_mutex); /* maybe this will clash with log_read_end_lsn() */ + checkpoint_start_lsn= log_read_end_lsn(); + unlock(log_mutex); + + DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn)); + + /* STEP 1: fetch information about dirty pages */ + + /* + We lock the entire cache but will be quick, just reading/writing a few MBs + of memory at most. + */ + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + + /* + This is an over-estimation, as in theory blocks_changed may contain + non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the + checkpoint record; the true number of page-LRD-info we'll store into the + record is stored_LRD_size. 
+ */ + string1.length= 8+8+(8+8)*pagecache->blocks_changed; + if (NULL == (string1.str= my_malloc(string1.length))) + goto err; + ptr= string1.str; + int8store(ptr, checkpoint_start_lsn); + ptr+= 8+8; /* don't store stored_LRD_size now, wait */ + if (pagecache->blocks_changed > 0) + { + /* + There are different ways to scan the dirty blocks; + flush_all_key_blocks() uses a loop over pagecache->used_last->next_used, + and for each element of the loop, loops into + pagecache->changed_blocks[FILE_HASH(file of the element)]. + This has the drawback that used_last includes non-dirty blocks, and it's + two loops over many elements. Here we try something simpler. + If there are no blocks in changed_blocks[file_hash], we should hit + zeroes and skip them. + */ + uint file_hash; + for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[file_hash] ; + block; + block= block->next_changed) + { + DBUG_ASSERT(block->hash_link != NULL); + DBUG_ASSERT(block->status & BLOCK_CHANGED); + if (block->type != PAGECACHE_LSN_PAGE) + { + /* no need to store it in the checkpoint record */ + continue; + } + /* Q: two "block"s cannot have the same "hash_link", right? */ + int8store(ptr, block->hash_link->pageno); + ptr+= 8; + /* I assume rec_lsn will be member of "block", not of "hash_link" */ + int8store(ptr, block->rec_lsn); + ptr+= 8; + stored_LRD_size++; + set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint, + block->rec_lsn); + } + } + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed); + int8store(string1.str+8, stored_LRD_size); + string1.length= 8+8+(8+8)*stored_LRD_size; + + /* STEP 2: fetch information about transactions */ + + /* + If trx are in more than one list (e.g. three: + running transactions, committed transactions, purge queue), we can either + take mutexes of all three together or do crabbing. 
+ But if an element can move from list 1 to list 3 without passing through + list 2, crabbing is dangerous. + Hopefully it's ok to take 3 mutexes together... + Otherwise I'll have to make sure I miss no important trx and I handle dups. + */ + lock(global_transactions_list_mutex); /* or 3 mutexes if there are 3 */ + string2.length= 8+(8+8)*trx_list->count; + if (NULL == (string2.str= my_malloc(string2.length))) + goto err; + ptr= string2.str; + int8store(ptr, trx_list->count); + ptr+= 8; + for (el= trx_list->first; el; el= el->next) + { + /* possibly latch el.rwlock */ + *ptr= el->state; + ptr++; + int7store(ptr, el->long_trans_id); + ptr+= 7; + int2store(ptr, el->short_trans_id); + ptr+= 2; + int8store(ptr, el->undo_lsn); + ptr+= 8; + int8store(ptr, el->undo_purge_lsn); + ptr+= 8; + /* + if no latch, use double variable of type ULONGLONG_CONSISTENT in + st_transaction, or even no need if Intel >=486 + */ + int8store(ptr, el->first_undo_lsn); + ptr+= 8; + /* possibly unlatch el.rwlock */ + } + unlock(global_transactions_list_mutex); + + /* STEP 3: fetch information about table files */ + + /* This global mutex is in fact THR_LOCK_maria (see ma_open()) */ + lock(global_share_list_mutex); + string3.length= 8+(8+8)*share_list->count; + if (NULL == (string3.str= my_malloc(string3.length))) + goto err; + ptr= string3.str; + /* possibly latch each MARIA_SHARE, one by one, like this: */ + pthread_mutex_lock(&share->intern_lock); + /* + We'll copy the file id (a bit like share->kfile), the file name + (like share->unique_file_name[_length]). 
+ */ + make_copy_of_global_share_list_to_array; + pthread_mutex_unlock(&share->intern_lock); + unlock(global_share_list_mutex); + + /* work on copy */ + int8store(ptr, elements_in_array); + ptr+= 8; + for (el in array) + { + int8store(ptr, array[...].file_id); + ptr+= 8; + memcpy(ptr, array[...].file_name, ...); + ptr+= ...; + /* + these two are long ops (involving disk I/O) that's why we copied the + list, to not keep the list locked for long: + */ + flush_bitmap_pages(el); + /* TODO: and also autoinc counter, logical file end, free page list */ + + /* + fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per + second, so if you have touched 1000 files it's 7 seconds). + */ + force_file(el); + } + + /* LAST STEP: now write the checkpoint log record */ + + string_array[0]= string1; + string_array[1]= string2; + string_array[2]= string3; + string_array[3]= NULL; + + checkpoint_lsn= log_write_record(LOGREC_CHECKPOINT, + &system_trans, string_array); + + /* + Do nothing between the log write and the control file write, for the + "repair control file" tool to be possible one day. + */ + + if (LSN_IMPOSSIBLE == checkpoint_lsn) + goto err; + + if (0 != control_file_write_and_force(checkpoint_lsn, NULL)) + goto err; + + goto end; + +err: + print_error_to_error_log(the_error_message); + candidate_max_rec_lsn_at_last_checkpoint= LSN_IMPOSSIBLE; + +end: + my_free(buffer1.str, MYF(MY_ALLOW_ZERO_PTR)); + my_free(buffer2.str, MYF(MY_ALLOW_ZERO_PTR)); + my_free(buffer3.str, MYF(MY_ALLOW_ZERO_PTR)); + + DBUG_RETURN(candidate_max_rec_lsn_at_last_checkpoint); +} + + + +/* + Here's what should be put in log_write_record() in the log handler: +*/ +log_write_record(...) 
+{ + ...; + lock(log_mutex); + ...; + write_to_log(length); + written_since_last_checkpoint+= length; + if (written_since_last_checkpoint > + MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS) + { + /* + ask one system thread (the "LRD background flusher and checkpointer + thread" WL#3261) to do a checkpoint + */ + request_asynchronous_checkpoint(INDIRECT); + } + ...; + unlock(log_mutex); + ...; +} + +/* + Requests a checkpoint from the background thread, *asynchronously* + (requestor does not wait for completion, and does not even later check the + result). + In real life it will be called by log_write_record(). +*/ +void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level); +{ + safemutex_assert_owner(log_mutex); + + DBUG_ASSERT(level > NONE); + if (checkpoint_request < level) + { + /* no equal or stronger running or to run, we post request */ + /* + note that thousands of requests for checkpoints are going to come all + at the same time (when the log bound + MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS is passed), so it may not be a + good idea for each of them to broadcast a cond to wake up the background + checkpoint thread. We just don't broacast a cond, the checkpoint thread + (see least_recently_dirtied.c) will notice our request in max a few + seconds. + */ + checkpoint_request= level; /* post request */ + } + + /* + If there was an error, only an error + message to the error log will say it; normal, for a checkpoint triggered + by a log write, we probably don't want the client's log write to throw an + error, as the log write succeeded and a checkpoint failure is not + critical: the failure in this case is more for the DBA to know than for + the end user. + */ +} |