diff options
author | unknown <guilhem@gbichot3.local> | 2007-02-28 17:50:51 +0100 |
---|---|---|
committer | unknown <guilhem@gbichot3.local> | 2007-02-28 17:50:51 +0100 |
commit | ea57b3d4a066a5507a7e322b53e3acab24a2855e (patch) | |
tree | a0703039f00da454a2a91c8b14835d45a6146209 /storage/maria/ma_checkpoint.c | |
parent | ae72e394502e13b854b6e9bb00889fa6b69a7ed9 (diff) | |
parent | fdf847fb62a0fcdf0edf25d6c8654b19eaa9a9ad (diff) | |
download | mariadb-git-ea57b3d4a066a5507a7e322b53e3acab24a2855e.tar.gz |
Merge gbichot3.local:/home/mysql_src/mysql-5.1-for-maria
into gbichot3.local:/home/mysql_src/mysql-maria
BitKeeper/etc/ignore:
auto-union
BUILD/SETUP.sh:
Auto merged
client/mysqldump.c:
Auto merged
config/ac-macros/plugins.m4:
Auto merged
configure.in:
Auto merged
include/Makefile.am:
Auto merged
include/atomic/nolock.h:
Auto merged
include/atomic/rwlock.h:
Auto merged
include/atomic/x86-gcc.h:
Auto merged
include/atomic/x86-msvc.h:
Auto merged
include/ft_global.h:
Auto merged
include/keycache.h:
Auto merged
include/m_string.h:
Auto merged
include/my_atomic.h:
Auto merged
include/my_base.h:
Auto merged
include/my_dbug.h:
Auto merged
include/my_global.h:
Auto merged
include/my_handler.h:
Auto merged
include/my_sys.h:
Auto merged
include/myisam.h:
Auto merged
libmysql/CMakeLists.txt:
Auto merged
libmysqld/Makefile.am:
Auto merged
mysql-test/mysql-test-run.pl:
Auto merged
mysql-test/r/events_logs_tests.result:
Auto merged
mysql-test/t/events_logs_tests.test:
Auto merged
mysys/Makefile.am:
Auto merged
mysys/array.c:
Auto merged
mysys/mf_keycache.c:
Auto merged
mysys/mf_keycaches.c:
Auto merged
mysys/my_atomic.c:
Auto merged
mysys/my_bit.c:
Auto merged
mysys/my_bitmap.c:
Auto merged
mysys/my_create.c:
Auto merged
mysys/my_delete.c:
Auto merged
mysys/my_getsystime.c:
Auto merged
mysys/my_handler.c:
Auto merged
mysys/my_init.c:
Auto merged
mysys/my_open.c:
Auto merged
mysys/my_pread.c:
Auto merged
mysys/my_rename.c:
Auto merged
mysys/my_symlink.c:
Auto merged
mysys/my_sync.c:
Auto merged
plugin/daemon_example/daemon_example.cc:
Auto merged
sql/Makefile.am:
Auto merged
sql/filesort.cc:
Auto merged
sql/gen_lex_hash.cc:
Auto merged
sql/ha_ndbcluster.cc:
Auto merged
sql/handler.h:
Auto merged
sql/item_func.cc:
Auto merged
sql/item_func.h:
Auto merged
sql/log.cc:
Auto merged
sql/mysql_priv.h:
Auto merged
sql/set_var.h:
Auto merged
sql/sql_class.h:
Auto merged
sql/sql_parse.cc:
Auto merged
sql/sql_select.cc:
Auto merged
sql/sql_sort.h:
Auto merged
sql/sql_test.cc:
Auto merged
sql/uniques.cc:
Auto merged
sql/unireg.cc:
Auto merged
storage/Makefile.am:
Auto merged
storage/csv/ha_tina.cc:
Auto merged
storage/myisam/Makefile.am:
Auto merged
storage/myisam/ft_boolean_search.c:
Auto merged
storage/myisam/ft_nlq_search.c:
Auto merged
storage/myisam/ft_parser.c:
Auto merged
storage/myisam/ft_static.c:
Auto merged
storage/myisam/ft_stopwords.c:
Auto merged
storage/myisam/ft_update.c:
Auto merged
storage/myisam/fulltext.h:
Auto merged
storage/myisam/ha_myisam.h:
Auto merged
storage/myisam/mi_check.c:
Auto merged
storage/myisam/mi_create.c:
Auto merged
storage/myisam/mi_delete.c:
Auto merged
storage/myisam/mi_delete_all.c:
Auto merged
storage/myisam/mi_dynrec.c:
Auto merged
storage/myisam/mi_key.c:
Auto merged
storage/myisam/mi_log.c:
Auto merged
storage/myisam/mi_open.c:
Auto merged
storage/myisam/mi_packrec.c:
Auto merged
storage/myisam/mi_range.c:
Auto merged
storage/myisam/mi_rsamepos.c:
Auto merged
storage/myisam/mi_search.c:
Auto merged
storage/myisam/mi_test1.c:
Auto merged
storage/myisam/mi_test2.c:
Auto merged
storage/myisam/mi_unique.c:
Auto merged
storage/myisam/mi_update.c:
Auto merged
storage/myisam/mi_write.c:
Auto merged
storage/myisam/myisamchk.c:
Auto merged
storage/myisam/myisamlog.c:
Auto merged
storage/myisam/myisampack.c:
Auto merged
storage/myisam/rt_index.c:
Auto merged
storage/myisam/sort.c:
Auto merged
storage/myisammrg/ha_myisammrg.h:
Auto merged
unittest/mytap/tap.c:
Auto merged
mysql-test/r/view.result:
manual merge
mysql-test/t/view.test:
manual merge
Makefile.am:
manual merge
mysql-test/t/disabled.def:
manual merge
sql/mysqld.cc:
manual merge
sql/set_var.cc:
manual merge
sql/udf_example.c:
manual merge
storage/myisam/ha_myisam.cc:
manual merge
storage/myisam/myisamdef.h:
manual merge
storage/ndb/src/mgmapi/mgmapi.cpp:
manual merge
unittest/Makefile.am:
manual merge
unittest/mysys/Makefile.am:
manual merge
unittest/mysys/my_atomic-t.c:
manual merge
Diffstat (limited to 'storage/maria/ma_checkpoint.c')
-rw-r--r-- | storage/maria/ma_checkpoint.c | 429 |
1 files changed, 429 insertions, 0 deletions
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c new file mode 100644 index 00000000000..02d887f758a --- /dev/null +++ b/storage/maria/ma_checkpoint.c @@ -0,0 +1,429 @@ +/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + WL#3071 Maria checkpoint + First version written by Guilhem Bichot on 2006-04-27. + Does not compile yet. +*/ + +/* Here is the implementation of this module */ + +/* + Summary: + - there are asynchronous checkpoints (a writer to the log notices that it's + been a long time since we last checkpoint-ed, so posts a request for a + background thread to do a checkpoint; does not care about the success of the + checkpoint). Then the checkpoint is done by the checkpoint thread, at an + unspecified moment ("later") (==soon, of course). + - there are synchronous checkpoints: a thread requests a checkpoint to + happen now and wants to know when it finishes and if it succeeded; then the + checkpoint is done by that same thread. +*/ + +#include "page_cache.h" +#include "least_recently_dirtied.h" +#include "transaction.h" +#include "share.h" +#include "log.h" + +#define LSN_IMPOSSIBLE ((LSN)0) /* could also be called LSN_ERROR */ +#define LSN_MAX ((LSN)ULONGLONG_MAX) + +/* + this transaction is used for any system work (purge, checkpoint writing + etc), that is, background threads. It will not be declared/initialized here + in the final version. +*/ +st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,...}; + +/* those three are protected by the log's mutex */ +/* + The maximum rec_lsn in the LRD when last checkpoint was run, serves for the + MEDIUM checkpoint. +*/ +LSN max_rec_lsn_at_last_checkpoint= 0; +/* last submitted checkpoint request; cleared when starts */ +CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE; +CHECKPOINT_LEVEL checkpoint_in_progress= NONE; + +static inline ulonglong read_non_atomic(ulonglong volatile *x); + +/* + Used by MySQL client threads requesting a checkpoint (like "ALTER MARIA + ENGINE DO CHECKPOINT"), and probably by maria_panic(), and at the end of the + UNDO recovery phase. +*/ +my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level) +{ + my_bool result; + DBUG_ENTER("execute_synchronous_checkpoint"); + DBUG_ASSERT(level > NONE); + + lock(log_mutex); + while (checkpoint_in_progress != NONE) + wait_on_checkpoint_done_cond(); + + result= execute_checkpoint(level); + DBUG_RETURN(result); +} + +/* + If no checkpoint is running, and there is a pending asynchronous checkpoint + request, executes it. + Is safe if multiple threads call it, though in first version only one will. + It's intended to be used by a thread which regularly calls this function; + this is why, if there is a request, it does not wait in a loop for + synchronous checkpoints to be finished, but just exits (because the thread + may want to do something useful meanwhile (flushing dirty pages for example) + instead of waiting). +*/ +my_bool execute_asynchronous_checkpoint_if_any() +{ + my_bool result; + CHECKPOINT_LEVEL level; + DBUG_ENTER("execute_asynchronous_checkpoint"); + + /* first check without mutex, ok to see old data */ + if (likely((next_asynchronous_checkpoint_to_do == NONE) || + (checkpoint_in_progress != NONE))) + DBUG_RETURN(FALSE); + + lock(log_mutex); + if (likely((next_asynchronous_checkpoint_to_do == NONE) || + (checkpoint_in_progress != NONE))) + { + unlock(log_mutex); + DBUG_RETURN(FALSE); + } + + result= execute_checkpoint(next_asynchronous_checkpoint_to_do); + DBUG_RETURN(result); +} + + +/* + Does the actual checkpointing. Called by + execute_synchronous_checkpoint() and + execute_asynchronous_checkpoint_if_any(). +*/ +my_bool execute_checkpoint(CHECKPOINT_LEVEL level) +{ + my_bool result; + DBUG_ENTER("execute_checkpoint"); + + safemutex_assert_owner(log_mutex); + if (next_asynchronous_checkpoint_to_do <= level) + next_asynchronous_checkpoint_to_do= NONE; + checkpoint_in_progress= level; + + if (unlikely(level > INDIRECT)) + { + LSN copy_of_max_rec_lsn_at_last_checkpoint= + max_rec_lsn_at_last_checkpoint; + /* much I/O work to do, release log mutex */ + unlock(log_mutex); + + switch (level) + { + case FULL: + /* flush all pages up to the current end of the LRD */ + flush_all_LRD_to_lsn(LSN_MAX); + /* this will go full speed (normal scheduling, no sleep) */ + break; + case MEDIUM: + /* + flush all pages which were already dirty at last checkpoint: + ensures that recovery will never start from before the next-to-last + checkpoint (two-checkpoint rule). + */ + flush_all_LRD_to_lsn(copy_of_max_rec_lsn_at_last_checkpoint); + /* this will go full speed (normal scheduling, no sleep) */ + break; + } + lock(log_mutex); + } + + result= execute_checkpoint_indirect(); + checkpoint_in_progress= NONE; + unlock(log_mutex); + broadcast(checkpoint_done_cond); + DBUG_RETURN(result); +} + + +/* + Does an indirect checpoint (collects data from data structures, writes into + a checkpoint log record). + Starts and ends while having log's mutex (released in the middle). +*/ +my_bool execute_checkpoint_indirect() +{ + int error= 0, i; + /* checkpoint record data: */ + LSN checkpoint_start_lsn; + char checkpoint_start_lsn_char[8]; + LEX_STRING strings[6]= + {checkpoint_start_lsn_char, 8}, {0,0}, {0,0}, {0,0}, {0,0}, {0,0} }; + char *ptr; + LSN checkpoint_lsn; + LSN candidate_max_rec_lsn_at_last_checkpoint; + DBUG_ENTER("execute_checkpoint_indirect"); + + DBUG_ASSERT(sizeof(byte *) <= 8); + DBUG_ASSERT(sizeof(LSN) <= 8); + + safemutex_assert_owner(log_mutex); + + /* STEP 1: record current end-of-log LSN */ + checkpoint_start_lsn= log_read_end_lsn(); + if (LSN_IMPOSSIBLE == checkpoint_start_lsn) /* error */ + DBUG_RETURN(TRUE); + unlock(log_mutex); + + DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn)); + int8store(strings[0].str, checkpoint_start_lsn); + + /* STEP 2: fetch information about dirty pages */ + + if (pagecache_collect_changed_blocks_with_LSN(pagecache, &strings[1], + &candidate_max_rec_lsn_at_last_checkpoint)) + goto err; + + /* STEP 3: fetch information about transactions */ + if (trnman_collect_transactions(&strings[2], &strings[3])) + goto err; + + /* STEP 4: fetch information about table files */ + + { + /* This global mutex is in fact THR_LOCK_maria (see ma_open()) */ + lock(global_share_list_mutex); + strings[4].length= 8+(8+8)*share_list->count; + if (NULL == (strings[4].str= my_malloc(strings[4].length))) + goto err; + ptr= string3.str; + /* + Note that maria_open_list is a list of MARIA_HA*, while we would prefer + a list of MARIA_SHARE* here (we are interested in the short id, + unique file name, members of MARIA_SHARE*, and in file descriptors, + which will in the end be in MARIA_SHARE*). + */ + for (iterate on the maria_open_list) + { + /* latch each MARIA_SHARE, one by one, like this: */ + pthread_mutex_lock(&share->intern_lock); + /* + TODO: + we need to prevent the share from going away while we later flush and + force it without holding THR_LOCK_maria. For example if the share is + free()d by maria_close() we'll have a problem. Or if the share's file + descriptor is closed by maria_close() we will not be able to my_sync() + it. + */ + pthread_mutex_unlock(&share->intern_lock); + store the share pointer into a private array; + } + unlock(global_share_list_mutex); + + /* work on copy */ + int8store(ptr, elements_in_array); + ptr+= 8; + for (el in array) + { + int8store(ptr, array[...].short_id); + ptr+= 8; + memcpy(ptr, array[...].unique_file_name[_length], ...); + ptr+= ...; + /* maybe we need to lock share->intern_lock here */ + /* + these two are long ops (involving disk I/O) that's why we copied the + list, to not keep the list locked for long: + */ + flush_bitmap_pages(el); + /* TODO: and also autoinc counter, logical file end, free page list */ + + /* + fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per + second, so if you have touched 1000 files it's 7 seconds). + */ + force_file(el); + } + } + + /* LAST STEP: now write the checkpoint log record */ + + checkpoint_lsn= log_write_record(LOGREC_CHECKPOINT, + &system_trans, strings); + + /* + Do nothing between the log write and the control file write, for the + "repair control file" tool to be possible one day. + */ + + if (LSN_IMPOSSIBLE == checkpoint_lsn) + goto err; + + if (0 != control_file_write_and_force(checkpoint_lsn, NULL)) + goto err; + + /* + Note that we should not alter memory structures until we have successfully + written the checkpoint record and control file. + Btw, a log write failure is serious: + - if we know how many bytes we managed to write, we should try to write + more, keeping the log's mutex (MY_FULL_IO) + - if we don't know, this log record is corrupted and we have no way to + "de-corrupt" it, so it will stay corrupted, and as the log is sequential, + any log record written after it will not be reachable (for example if we + would write UNDOs and crash, we would not be able to read the log and so + not be able to rollback), so we should stop the engine now (holding the + log's mutex) and do a recovery. + */ + goto end; + +err: + print_error_to_error_log(the_error_message); + candidate_max_rec_lsn_at_last_checkpoint= LSN_IMPOSSIBLE; + +end: + + for (i= 1; i<6; i++) + my_free(strings[i].str, MYF(MY_ALLOW_ZERO_PTR)); + + /* + this portion cannot be done as a hook in write_log_record() for the + LOGREC_CHECKPOINT type because: + - at that moment we still have not written to the control file so cannot + mark the request as done; this could be solved by writing to the control + file in the hook but that would be an I/O under the log's mutex, bad. + - it would not be nice organisation of code (I tried it :). + */ + if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE) + { + /* checkpoint succeeded */ + /* + TODO: compute log's low water mark (how to do that with our fuzzy + ARIES-like reads of data structures? TODO think about it :). + */ + lock(log_mutex); + /* That LSN is used for the "two-checkpoint rule" (MEDIUM checkpoints) */ + maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint; + DBUG_RETURN(FALSE); + } + lock(log_mutex); + DBUG_RETURN(TRUE); + /* + keep mutex locked upon exit because callers will want to clear + mutex-protected status variables + */ +} + + + +/* + Here's what should be put in log_write_record() in the log handler: +*/ +log_write_record(...) +{ + ...; + lock(log_mutex); + ...; + write_to_log(length); + written_since_last_checkpoint+= length; + if (written_since_last_checkpoint > + MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS) + { + /* + ask one system thread (the "LRD background flusher and checkpointer + thread" WL#3261) to do a checkpoint + */ + request_asynchronous_checkpoint(INDIRECT); + /* prevent similar redundant requests */ + written_since_last_checkpoint= (my_off_t)0; + } + ...; + unlock(log_mutex); + ...; +} + +/* + Requests a checkpoint from the background thread, *asynchronously* + (requestor does not wait for completion, and does not even later check the + result). + In real life it will be called by log_write_record(). +*/ +void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level); +{ + safemutex_assert_owner(log_mutex); + + DBUG_ASSERT(level > NONE); + if ((next_asynchronous_checkpoint_to_do < level) && + (checkpoint_in_progress < level)) + { + /* no equal or stronger running or to run, we post request */ + /* + We just don't broacast a cond, the checkpoint thread + (see ma_least_recently_dirtied.c) will notice our request in max a few + seconds. + */ + next_asynchronous_checkpoint_to_do= level; /* post request */ + } + + /* + If there was an error, only an error + message to the error log will say it; normal, for a checkpoint triggered + by a log write, we probably don't want the client's log write to throw an + error, as the log write succeeded and a checkpoint failure is not + critical: the failure in this case is more for the DBA to know than for + the end user. + */ +} + + +/* + If a 64-bit variable transitions from both halves being zero to both halves + being non-zero, and never changes after that (like the transaction's + first_undo_lsn), this function can be used to do a read of it (without + mutex, without atomic load) which always produces a correct (though maybe + slightly old) value (even on 32-bit CPUs). + The prototype will change with Sanja's new LSN type. +*/ +static inline ulonglong read_non_atomic(ulonglong volatile *x) +{ +#if ( SIZEOF_CHARP >= 8 ) + /* 64-bit CPU (right?), 64-bit reads are atomic */ + return *x; +#else + /* + 32-bit CPU, 64-bit reads may give a mixed of old half and new half (old + low bits and new high bits, or the contrary). + As the variable we read transitions from both halves being zero to both + halves being non-zero, and never changes then, we can detect atomicity + problems: + */ + ulonglong y; + for (;;) /* loop until no atomicity problems */ + { + y= *x; + if (likely(((0 == y) || + ((0 != (y >> 32)) && (0 != (y << 32))))) + return y; + /* Worth seeing it! */ + DBUG_PRINT("info",("atomicity problem")); + } +#endif +} |