diff options
author | unknown <guilhem@gbichot3.local> | 2006-12-16 18:10:47 +0100 |
---|---|---|
committer | unknown <guilhem@gbichot3.local> | 2006-12-16 18:10:47 +0100 |
commit | fa05e9c9f426a19f016897ec57c047c277bf52c7 (patch) | |
tree | acc9c5f4294894b87069de2cf9ed3fc78bf64356 | |
parent | ad29d5520b1ba379a75adc447f301851ff4588a4 (diff) | |
download | mariadb-git-fa05e9c9f426a19f016897ec57c047c277bf52c7.tar.gz |
WL#3071 - Maria checkpoint
Adding rec_lsn to Maria's page cache. Misc fixes to Checkpoint.
mysys/mf_pagecache.c:
adding rec_lsn, the LSN when a page first became dirty.
It is set when unlocking a page (TODO: should also be set when
the unlocking is an implicit part of pagecache_write()).
It is reset in link_to_file_list() and free_block()
(one of which is used every time we flush a block).
It is a ulonglong and not LSN, because it is used in comparisons,
for which ulonglong is better suited than a struct.
storage/maria/ma_checkpoint.c:
misc fixes to Checkpoint (updates now that the transaction manager
and the page cache are better known)
storage/maria/ma_close.c:
an important note for the future.
storage/maria/ma_least_recently_dirtied.c:
comment
-rwxr-xr-x | mysys/mf_pagecache.c | 12 | ||||
-rw-r--r-- | storage/maria/ma_checkpoint.c | 433 | ||||
-rw-r--r-- | storage/maria/ma_close.c | 6 | ||||
-rw-r--r-- | storage/maria/ma_least_recently_dirtied.c | 5 |
4 files changed, 277 insertions, 179 deletions
diff --git a/mysys/mf_pagecache.c b/mysys/mf_pagecache.c index 3e3484d5efb..807a3ea520a 100755 --- a/mysys/mf_pagecache.c +++ b/mysys/mf_pagecache.c @@ -295,6 +295,7 @@ struct st_pagecache_block_link enum pagecache_page_type type; /* type of the block */ uint hits_left; /* number of hits left until promotion */ ulonglong last_hit_time; /* timestamp of the last hit */ + ulonglong rec_lsn; /* LSN when first became dirty */ KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */ }; @@ -1202,6 +1203,7 @@ static void link_to_file_list(PAGECACHE *pagecache, if (block->status & BLOCK_CHANGED) { block->status&= ~BLOCK_CHANGED; + block->rec_lsn= 0; pagecache->blocks_changed--; pagecache->global_blocks_changed--; } @@ -2509,6 +2511,8 @@ void pagecache_unlock_page(PAGECACHE *pagecache, DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK && pin == PAGECACHE_UNPIN); /* TODO: insert LSN writing code */ + DBUG_ASSERT(first_REDO_LSN_for_page > 0); + set_if_bigger(block->rec_lsn, first_REDO_LSN_for_page); } #ifndef DBUG_OFF @@ -2671,6 +2675,8 @@ void pagecache_unlock(PAGECACHE *pagecache, DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK && pin == PAGECACHE_UNPIN); /* TODO: insert LSN writing code */ + DBUG_ASSERT(first_REDO_LSN_for_page > 0); + set_if_bigger(block->rec_lsn, first_REDO_LSN_for_page); } #ifndef DBUG_OFF @@ -3012,10 +3018,9 @@ restart: pagecache->blocks_changed--; pagecache->global_blocks_changed--; /* - free_block() will change the status of the block so no need to change - it here. + free_block() will change the status and rec_lsn of the block so no + need to change them here. 
*/ - } /* Cache is locked, so we can relese page before freeing it */ pagecache_make_lock_and_pin(pagecache, block, @@ -3328,6 +3333,7 @@ static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block) #ifndef DBUG_OFF block->type= PAGECACHE_EMPTY_PAGE; #endif + block->rec_lsn= 0; KEYCACHE_THREAD_TRACE("free block"); KEYCACHE_DBUG_PRINT("free_block", ("block is freed")); diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c index 83312ce37b8..717b6202559 100644 --- a/storage/maria/ma_checkpoint.c +++ b/storage/maria/ma_checkpoint.c @@ -40,8 +40,7 @@ #include "share.h" #include "log.h" -/* could also be called LSN_ERROR */ -#define LSN_IMPOSSIBLE ((LSN)0) +#define LSN_IMPOSSIBLE ((LSN)0) /* could also be called LSN_ERROR */ #define LSN_MAX ((LSN)ULONGLONG_MAX) /* @@ -57,9 +56,12 @@ st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,.. MEDIUM checkpoint. */ LSN max_rec_lsn_at_last_checkpoint= 0; +/* last submitted checkpoint request; cleared only when executed */ CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE; CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE; +static inline ulonglong read_non_atomic(ulonglong volatile *x); + /* Used by MySQL client threads requesting a checkpoint (like "ALTER MARIA ENGINE DO CHECKPOINT"), and probably by maria_panic(), and at the end of the @@ -67,6 +69,7 @@ CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE; */ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level) { + my_bool result; DBUG_ENTER("execute_synchronous_checkpoint"); DBUG_ASSERT(level > NONE); @@ -76,43 +79,52 @@ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level) wait_on_checkpoint_done_cond(); synchronous_checkpoint_in_progress= level; - execute_checkpoint(level); + result= execute_checkpoint(level); safemutex_assert_owner(log_mutex); synchronous_checkpoint_in_progress= NONE; unlock(log_mutex); broadcast(checkpoint_done_cond); + DBUG_RETURN(result); } -/* Picks a 
checkpoint request, if there is one, and executes it */ +/* + If no checkpoint is running, and there is a pending asynchronous checkpoint + request, executes it. + Is safe if multiple threads call it, though in first version only one will. + It's intended to be used by a thread which regularly calls this function; + this is why, if there is a request,it does not wait in a loop for + synchronous checkpoints to be finished, but just exits (because the thread + may want to do something useful meanwhile (flushing dirty pages for example) + instead of waiting). +*/ my_bool execute_asynchronous_checkpoint_if_any() { + my_bool result; CHECKPOINT_LEVEL level; DBUG_ENTER("execute_asynchronous_checkpoint"); lock(log_mutex); - if (likely(next_asynchronous_checkpoint_to_do == NONE)) + if (likely((next_asynchronous_checkpoint_to_do == NONE) || + (synchronous_checkpoint_in_progress != NONE))) { unlock(log_mutex); DBUG_RETURN(FALSE); } - while (synchronous_checkpoint_in_progress) - wait_on_checkpoint_done_cond(); - -do_checkpoint: level= next_asynchronous_checkpoint_to_do; DBUG_ASSERT(level > NONE); - execute_checkpoint(level); + result= execute_checkpoint(level); safemutex_assert_owner(log_mutex); - if (next_asynchronous_checkpoint_to_do > level) - goto do_checkpoint; /* one more request was posted */ - else + /* If only one thread calls this function, "<" can never happen below */ + if (next_asynchronous_checkpoint_to_do <= level) { - DBUG_ASSERT(next_asynchronous_checkpoint_to_do == level); - next_asynchronous_checkpoint_to_do= NONE; /* all work done */ + /* it's our request or weaker/equal ones, all work is done */ + next_asynchronous_checkpoint_to_do= NONE; } + /* otherwise if it is a stronger request, we'll deal with it at next call */ unlock(log_mutex); broadcast(checkpoint_done_cond); + DBUG_RETURN(result); } @@ -123,17 +135,14 @@ do_checkpoint: */ my_bool execute_checkpoint(CHECKPOINT_LEVEL level) { - LSN candidate_max_rec_lsn_at_last_checkpoint; - /* to avoid { lock + 
no-op + unlock } in the common (==indirect) case */ - my_bool need_log_mutex; - DBUG_ENTER("execute_checkpoint"); safemutex_assert_owner(log_mutex); - copy_of_max_rec_lsn_at_last_checkpoint= max_rec_lsn_at_last_checkpoint; - if (unlikely(need_log_mutex= (level > INDIRECT))) + if (unlikely(level > INDIRECT)) { + LSN copy_of_max_rec_lsn_at_last_checkpoint= + max_rec_lsn_at_last_checkpoint; /* much I/O work to do, release log mutex */ unlock(log_mutex); @@ -149,51 +158,29 @@ my_bool execute_checkpoint(CHECKPOINT_LEVEL level) flush all pages which were already dirty at last checkpoint: ensures that recovery will never start from before the next-to-last checkpoint (two-checkpoint rule). - It is max, not min as the WL says (TODO update WL). */ flush_all_LRD_to_lsn(copy_of_max_rec_lsn_at_last_checkpoint); /* this will go full speed (normal scheduling, no sleep) */ break; } + lock(log_mutex); } - candidate_max_rec_lsn_at_last_checkpoint= checkpoint_indirect(need_log_mutex); - - lock(log_mutex); - /* - this portion cannot be done as a hook in write_log_record() for the - LOGREC_CHECKPOINT type because: - - at that moment we still have not written to the control file so cannot - mark the request as done; this could be solved by writing to the control - file in the hook but that would be an I/O under the log's mutex, bad. - - it would not be nice organisation of code (I tried it :). 
- */ - if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE) - { - /* checkpoint succeeded */ - maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint; - written_since_last_checkpoint= (my_off_t)0; - DBUG_RETURN(FALSE); - } /* - keep mutex locked because callers will want to clear mutex-protected - status variables + keep mutex locked upon exit because callers will want to clear + mutex-protected status variables */ - DBUG_RETURN(TRUE); + DBUG_RETURN(execute_checkpoint_indirect()); } /* Does an indirect checpoint (collects data from data structures, writes into a checkpoint log record). - Returns the largest LSN of the LRD when the checkpoint happened (this is a - fuzzy definition), or LSN_IMPOSSIBLE on error. That LSN is used for the - "two-checkpoint rule" (MEDIUM checkpoints). + Starts and ends while having log's mutex (released in the middle). */ -LSN checkpoint_indirect(my_bool need_log_mutex) +my_bool execute_checkpoint_indirect() { - DBUG_ENTER("checkpoint_indirect"); - int error= 0; /* checkpoint record data: */ LSN checkpoint_start_lsn; @@ -202,163 +189,198 @@ LSN checkpoint_indirect(my_bool need_log_mutex) char *ptr; LSN checkpoint_lsn; LSN candidate_max_rec_lsn_at_last_checkpoint= 0; - list_element *el; /* to scan lists */ - ulong stored_LRD_size= 0; - + DBUG_ENTER("execute_checkpoint_indirect"); DBUG_ASSERT(sizeof(byte *) <= 8); DBUG_ASSERT(sizeof(LSN) <= 8); - if (need_log_mutex) - lock(log_mutex); /* maybe this will clash with log_read_end_lsn() */ + safemutex_assert_owner(log_mutex); checkpoint_start_lsn= log_read_end_lsn(); + if (LSN_IMPOSSIBLE == checkpoint_start_lsn) /* error */ + DBUG_RETURN(TRUE); unlock(log_mutex); DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn)); /* STEP 1: fetch information about dirty pages */ - - /* - We lock the entire cache but will be quick, just reading/writing a few MBs - of memory at most. 
- */ - pagecache_pthread_mutex_lock(&pagecache->cache_lock); - - /* - This is an over-estimation, as in theory blocks_changed may contain - non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the - checkpoint record; the true number of page-LRD-info we'll store into the - record is stored_LRD_size. - */ - string1.length= 8+8+(8+8)*pagecache->blocks_changed; - if (NULL == (string1.str= my_malloc(string1.length))) - goto err; - ptr= string1.str; - int8store(ptr, checkpoint_start_lsn); - ptr+= 8+8; /* don't store stored_LRD_size now, wait */ - if (pagecache->blocks_changed > 0) + /* note: this piece will move into mysys/mf_pagecache.c */ { + ulong stored_LRD_size= 0; + /* + We lock the entire cache but will be quick, just reading/writing a few MBs + of memory at most. + When we enter here, we must be sure that no "first_in_switch" situation + is happening or will happen (either we have to get rid of + first_in_switch in the code or, first_in_switch has to increment a + "danger" counter for Checkpoint to know it has to wait. + */ + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + + /* + This is an over-estimation, as in theory blocks_changed may contain + non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the + checkpoint record; the true number of page-LRD-info we'll store into the + record is stored_LRD_size. + */ /* - There are different ways to scan the dirty blocks; - flush_all_key_blocks() uses a loop over pagecache->used_last->next_used, - and for each element of the loop, loops into - pagecache->changed_blocks[FILE_HASH(file of the element)]. - This has the drawback that used_last includes non-dirty blocks, and it's - two loops over many elements. Here we try something simpler. - If there are no blocks in changed_blocks[file_hash], we should hit - zeroes and skip them. + TODO: Ingo says blocks_changed is not a reliable number (see his + document); ask him. 
*/ - uint file_hash; - for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++) + string1.length= 8+8+(8+8+8)*pagecache->blocks_changed; + if (NULL == (string1.str= my_malloc(string1.length))) + goto err; + ptr= string1.str; + int8store(ptr, checkpoint_start_lsn); + ptr+= 8+8; /* don't store stored_LRD_size now, wait */ + if (pagecache->blocks_changed > 0) { - PAGECACHE_BLOCK_LINK *block; - for (block= pagecache->changed_blocks[file_hash] ; - block; - block= block->next_changed) + uint file_hash; + for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++) { - DBUG_ASSERT(block->hash_link != NULL); - DBUG_ASSERT(block->status & BLOCK_CHANGED); - if (block->type != PAGECACHE_LSN_PAGE) + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[file_hash] ; + block; + block= block->next_changed) { - /* no need to store it in the checkpoint record */ - continue; + DBUG_ASSERT(block->hash_link != NULL); + DBUG_ASSERT(block->status & BLOCK_CHANGED); + if (block->type != PAGECACHE_LSN_PAGE) + { + continue; /* no need to store it in the checkpoint record */ + } + /* + In the current pagecache, rec_lsn is not set correctly: + 1) it is set on pagecache_unlock(), too late (a page is dirty + (BLOCK_CHANGED) since the first pagecache_write()). It may however + be not too late, because until unlock(), the page's update is not + committed, so it's ok that REDOs for it be skipped at Recovery + (which is what happens with an unset rec_lsn). Note that this + relies on the assumption that a transaction never commits while + holding locks on pages. + 2) sometimes the unlocking can be an implicit action of + pagecache_write(), without any call to pagecache_unlock(), then + rec_lsn is not set. That one is a critical problem. + TODO: fix this when Monty has explained how he writes BLOB pages. 
+ */ + if (0 == block->rec_lsn) + abort(); /* always fail in all builds, in case it's problem 2) */ + + int8store(ptr, block->hash_link->file.file); + ptr+= 8; + int8store(ptr, block->hash_link->pageno); + ptr+= 8; + int8store(ptr, block->rec_lsn); + ptr+= 8; + stored_LRD_size++; + DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed); + set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint, + block->rec_lsn); } - /* Q: two "block"s cannot have the same "hash_link", right? */ - int8store(ptr, block->hash_link->pageno); - ptr+= 8; - /* I assume rec_lsn will be member of "block", not of "hash_link" */ - int8store(ptr, block->rec_lsn); - ptr+= 8; - stored_LRD_size++; - set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint, - block->rec_lsn); } - } - pagecache_pthread_mutex_unlock(&pagecache->cache_lock); - DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed); - int8store(string1.str+8, stored_LRD_size); - string1.length= 8+8+(8+8)*stored_LRD_size; + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + int8store(string1.str+8, stored_LRD_size); + string1.length= 8+8+(8+8+8)*stored_LRD_size; + } /* STEP 2: fetch information about transactions */ - + /* note: this piece will move into trnman.c */ /* - If trx are in more than one list (e.g. three: - running transactions, committed transactions, purge queue), we can either - take mutexes of all three together or do crabbing. - But if an element can move from list 1 to list 3 without passing through - list 2, crabbing is dangerous. - Hopefully it's ok to take 3 mutexes together... - Otherwise I'll have to make sure I miss no important trx and I handle dups. + Transactions are in the "active list" (protected by a mutex) and in a + lock-free hash of "committed" (insertion protected by the same mutex, + deletion lock-free). 
*/ - lock(global_transactions_list_mutex); /* or 3 mutexes if there are 3 */ - string2.length= 8+(8+8)*trx_list->count; - if (NULL == (string2.str= my_malloc(string2.length))) - goto err; - ptr= string2.str; - int8store(ptr, trx_list->count); - ptr+= 8; - for (el= trx_list->first; el; el= el->next) { - /* possibly latch el.rwlock */ - *ptr= el->state; - ptr++; - int7store(ptr, el->long_trans_id); - ptr+= 7; - int2store(ptr, el->short_trans_id); - ptr+= 2; - int8store(ptr, el->undo_lsn); - ptr+= 8; - int8store(ptr, el->undo_purge_lsn); + TRN *trn; + ulong stored_trn_size= 0; + /* First, the active transactions */ + pthread_mutex_lock(LOCK_trn_list); + string2.length= 8+(7+2+8+8+8)*trnman_active_transactions; + if (NULL == (string2.str= my_malloc(string2.length))) + goto err; + ptr= string2.str; ptr+= 8; + for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next) + { + /* we would latch trn.rwlock if it existed */ + if (0 == trn->short_trid) /* trn is not inited, skip */ + continue; + /* state is not needed for now (only when we have prepared trx) */ + /* int7store does not exist but mi_int7store does */ + int7store(ptr, trn->trid); + ptr+= 7; + int2store(ptr, trn->short_trid); + ptr+= 2; + int8store(ptr, trn->undo_lsn); /* is an LSN 7 or 8 bytes really? */ + ptr+= 8; + int8store(ptr, trn->undo_purge_lsn); + ptr+= 8; + int8store(ptr, read_non_atomic(&trn->first_undo_lsn)); + ptr+= 8; + /* possibly unlatch el.rwlock */ + stored_trn_size++; + } + pthread_mutex_unlock(LOCK_trn_list); /* - if no latch, use double variable of type ULONGLONG_CONSISTENT in - st_transaction, or even no need if Intel >=486 + Now the committed ones. + We need a function which scans the hash's list of elements in a + lock-free manner (a bit like lfind(), starting from bucket 0), and for + each node (committed transaction) stores the transaction's + information (trid, undo_purge_lsn, first_undo_lsn) into a buffer. 
+ This big buffer is malloc'ed at the start, so the number of elements (or + an upper bound of it) found in the hash needs to be known in advance + (one solution is to keep LOCK_trn_list locked, ensuring that nodes are + only deleted). */ - int8store(ptr, el->first_undo_lsn); - ptr+= 8; - /* possibly unlatch el.rwlock */ + /* + TODO: if we see there exists no transaction (active and committed) we can + tell the lock-free structures to do some freeing (my_free()). + */ + int8store(string1.str, stored_trn_size); + string2.length= 8+(7+2+8+8+8)*stored_trn_size; } - unlock(global_transactions_list_mutex); /* STEP 3: fetch information about table files */ - /* This global mutex is in fact THR_LOCK_maria (see ma_open()) */ - lock(global_share_list_mutex); - string3.length= 8+(8+8)*share_list->count; - if (NULL == (string3.str= my_malloc(string3.length))) - goto err; - ptr= string3.str; - /* possibly latch each MARIA_SHARE, one by one, like this: */ - pthread_mutex_lock(&share->intern_lock); - /* - We'll copy the file id (a bit like share->kfile), the file name - (like share->unique_file_name[_length]). 
- */ - make_copy_of_global_share_list_to_array; - pthread_mutex_unlock(&share->intern_lock); - unlock(global_share_list_mutex); - - /* work on copy */ - int8store(ptr, elements_in_array); - ptr+= 8; - for (el in array) { - int8store(ptr, array[...].file_id); - ptr+= 8; - memcpy(ptr, array[...].file_name, ...); - ptr+= ...; + /* This global mutex is in fact THR_LOCK_maria (see ma_open()) */ + lock(global_share_list_mutex); + string3.length= 8+(8+8)*share_list->count; + if (NULL == (string3.str= my_malloc(string3.length))) + goto err; + ptr= string3.str; + /* possibly latch each MARIA_SHARE, one by one, like this: */ + pthread_mutex_lock(&share->intern_lock); /* - these two are long ops (involving disk I/O) that's why we copied the - list, to not keep the list locked for long: + We'll copy the file id (a bit like share->kfile), the file name + (like share->unique_file_name[_length]). */ - flush_bitmap_pages(el); - /* TODO: and also autoinc counter, logical file end, free page list */ + make_copy_of_global_share_list_to_array; + pthread_mutex_unlock(&share->intern_lock); + unlock(global_share_list_mutex); - /* - fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per - second, so if you have touched 1000 files it's 7 seconds). - */ - force_file(el); + /* work on copy */ + int8store(ptr, elements_in_array); + ptr+= 8; + for (el in array) + { + int8store(ptr, array[...].file_id); + ptr+= 8; + memcpy(ptr, array[...].file_name, ...); + ptr+= ...; + /* + these two are long ops (involving disk I/O) that's why we copied the + list, to not keep the list locked for long: + */ + /* TODO: what if the table pointer is gone/reused now? */ + flush_bitmap_pages(el); + /* TODO: and also autoinc counter, logical file end, free page list */ + + /* + fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per + second, so if you have touched 1000 files it's 7 seconds). 
+ */ + force_file(el); + } } /* LAST STEP: now write the checkpoint log record */ @@ -389,11 +411,38 @@ err: candidate_max_rec_lsn_at_last_checkpoint= LSN_IMPOSSIBLE; end: + my_free(buffer1.str, MYF(MY_ALLOW_ZERO_PTR)); my_free(buffer2.str, MYF(MY_ALLOW_ZERO_PTR)); my_free(buffer3.str, MYF(MY_ALLOW_ZERO_PTR)); - DBUG_RETURN(candidate_max_rec_lsn_at_last_checkpoint); + /* + this portion cannot be done as a hook in write_log_record() for the + LOGREC_CHECKPOINT type because: + - at that moment we still have not written to the control file so cannot + mark the request as done; this could be solved by writing to the control + file in the hook but that would be an I/O under the log's mutex, bad. + - it would not be nice organisation of code (I tried it :). + */ + if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE) + { + /* checkpoint succeeded */ + /* + TODO: compute log's low water mark (how to do that with our fuzzy + ARIES-like reads of data structures? TODO think about it :). + */ + lock(log_mutex); + /* That LSN is used for the "two-checkpoint rule" (MEDIUM checkpoints) */ + maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint; + written_since_last_checkpoint= (my_off_t)0; + DBUG_RETURN(FALSE); + } + lock(log_mutex); + DBUG_RETURN(TRUE); + /* + keep mutex locked upon exit because callers will want to clear + mutex-protected status variables + */ } @@ -433,7 +482,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level); safemutex_assert_owner(log_mutex); DBUG_ASSERT(level > NONE); - if (checkpoint_request < level) + if (next_asynchronous_checkpoint_to_do < level) { /* no equal or stronger running or to run, we post request */ /* @@ -445,7 +494,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level); (see least_recently_dirtied.c) will notice our request in max a few seconds. 
*/ - checkpoint_request= level; /* post request */ + next_asynchronous_checkpoint_to_do= level; /* post request */ } /* @@ -457,3 +506,37 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level); the end user. */ } + + +/* + If a 64-bit variable transitions from both halves being zero to both halves + being non-zero, and never changes after that (like the transaction's + first_undo_lsn), this function can be used to do a read of it (without + mutex, without atomic load) which always produces a correct (though maybe + slightly old) value (even on 32-bit CPUs). +*/ +static inline ulonglong read_non_atomic(ulonglong volatile *x) +{ +#if ( SIZEOF_CHARP >= 8 ) + /* 64-bit CPU (right?), 64-bit reads are atomic */ + return *x; +#else + /* + 32-bit CPU, 64-bit reads may give a mixed of old half and new half (old + low bits and new high bits, or the contrary). + As the variable we read transitions from both halves being zero to both + halves being non-zero, and never changes then, we can detect atomicity + problems: + */ + ulonglong y; + for (;;) /* loop until no atomicity problems */ + { + y= *x; + if (likely(((0 == y) || + ((0 != (y >> 32)) && (0 != (y << 32))))) + return y; + /* Worth seeing it! */ + DBUG_PRINT("info",("atomicity problem")); + } +#endif +} diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c index 5b940eaf4c3..73764cf444a 100644 --- a/storage/maria/ma_close.c +++ b/storage/maria/ma_close.c @@ -57,6 +57,12 @@ int maria_close(register MARIA_HA *info) info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED); } flag= !--share->reopen; + /* + RECOVERYTODO: + Below we are going to make the table unknown to future checkpoints, so it + needs to have fsync'ed itself entirely (bitmap, pages, etc) at this + point. 
+ */ maria_open_list=list_delete(maria_open_list,&info->open_list); pthread_mutex_unlock(&share->intern_lock); diff --git a/storage/maria/ma_least_recently_dirtied.c b/storage/maria/ma_least_recently_dirtied.c index b0b7fb1ef10..809442b4e97 100644 --- a/storage/maria/ma_least_recently_dirtied.c +++ b/storage/maria/ma_least_recently_dirtied.c @@ -94,7 +94,10 @@ pthread_handler_decl background_flush_and_checkpoint_thread() while (this_thread_not_killed) { if ((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1) == 0) - execute_asynchronous_checkpoint_if_any(); + { + /* note that we don't care of the checkpoint's success */ + (void)execute_asynchronous_checkpoint_if_any(); + } lock(global_LRD_mutex); flush_one_group_from_LRD(); safemutex_assert_not_owner(global_LRD_mutex); |