summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorunknown <guilhem@gbichot3.local>2006-12-16 18:10:47 +0100
committerunknown <guilhem@gbichot3.local>2006-12-16 18:10:47 +0100
commitfa05e9c9f426a19f016897ec57c047c277bf52c7 (patch)
treeacc9c5f4294894b87069de2cf9ed3fc78bf64356
parentad29d5520b1ba379a75adc447f301851ff4588a4 (diff)
downloadmariadb-git-fa05e9c9f426a19f016897ec57c047c277bf52c7.tar.gz
WL#3071 - Maria checkpoint
Adding rec_lsn to Maria's page cache. Misc fixes to Checkpoint. mysys/mf_pagecache.c: adding rec_lsn, the LSN when a page first became dirty. It is set when unlocking a page (TODO: should also be set when the unlocking is an implicit part of pagecache_write()). It is reset in link_to_file_list() and free_block() (one of which is used every time we flush a block). It is a ulonglong and not LSN, because its destination is comparisons for which ulonglong is better than a struct. storage/maria/ma_checkpoint.c: misc fixes to Checkpoint (updates now that the transaction manager and the page cache are more known) storage/maria/ma_close.c: an important note for the future. storage/maria/ma_least_recently_dirtied.c: comment
-rwxr-xr-xmysys/mf_pagecache.c12
-rw-r--r--storage/maria/ma_checkpoint.c433
-rw-r--r--storage/maria/ma_close.c6
-rw-r--r--storage/maria/ma_least_recently_dirtied.c5
4 files changed, 277 insertions, 179 deletions
diff --git a/mysys/mf_pagecache.c b/mysys/mf_pagecache.c
index 3e3484d5efb..807a3ea520a 100755
--- a/mysys/mf_pagecache.c
+++ b/mysys/mf_pagecache.c
@@ -295,6 +295,7 @@ struct st_pagecache_block_link
enum pagecache_page_type type; /* type of the block */
uint hits_left; /* number of hits left until promotion */
ulonglong last_hit_time; /* timestamp of the last hit */
+ ulonglong rec_lsn; /* LSN when first became dirty */
KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */
};
@@ -1202,6 +1203,7 @@ static void link_to_file_list(PAGECACHE *pagecache,
if (block->status & BLOCK_CHANGED)
{
block->status&= ~BLOCK_CHANGED;
+ block->rec_lsn= 0;
pagecache->blocks_changed--;
pagecache->global_blocks_changed--;
}
@@ -2509,6 +2511,8 @@ void pagecache_unlock_page(PAGECACHE *pagecache,
DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK &&
pin == PAGECACHE_UNPIN);
/* TODO: insert LSN writing code */
+ DBUG_ASSERT(first_REDO_LSN_for_page > 0);
+ set_if_bigger(block->rec_lsn, first_REDO_LSN_for_page);
}
#ifndef DBUG_OFF
@@ -2671,6 +2675,8 @@ void pagecache_unlock(PAGECACHE *pagecache,
DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK &&
pin == PAGECACHE_UNPIN);
/* TODO: insert LSN writing code */
+ DBUG_ASSERT(first_REDO_LSN_for_page > 0);
+ set_if_bigger(block->rec_lsn, first_REDO_LSN_for_page);
}
#ifndef DBUG_OFF
@@ -3012,10 +3018,9 @@ restart:
pagecache->blocks_changed--;
pagecache->global_blocks_changed--;
/*
- free_block() will change the status of the block so no need to change
- it here.
+ free_block() will change the status and rec_lsn of the block so no
+ need to change them here.
*/
-
}
/* Cache is locked, so we can relese page before freeing it */
pagecache_make_lock_and_pin(pagecache, block,
@@ -3328,6 +3333,7 @@ static void free_block(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block)
#ifndef DBUG_OFF
block->type= PAGECACHE_EMPTY_PAGE;
#endif
+ block->rec_lsn= 0;
KEYCACHE_THREAD_TRACE("free block");
KEYCACHE_DBUG_PRINT("free_block",
("block is freed"));
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c
index 83312ce37b8..717b6202559 100644
--- a/storage/maria/ma_checkpoint.c
+++ b/storage/maria/ma_checkpoint.c
@@ -40,8 +40,7 @@
#include "share.h"
#include "log.h"
-/* could also be called LSN_ERROR */
-#define LSN_IMPOSSIBLE ((LSN)0)
+#define LSN_IMPOSSIBLE ((LSN)0) /* could also be called LSN_ERROR */
#define LSN_MAX ((LSN)ULONGLONG_MAX)
/*
@@ -57,9 +56,12 @@ st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,..
MEDIUM checkpoint.
*/
LSN max_rec_lsn_at_last_checkpoint= 0;
+/* last submitted checkpoint request; cleared only when executed */
CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE;
CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE;
+static inline ulonglong read_non_atomic(ulonglong volatile *x);
+
/*
Used by MySQL client threads requesting a checkpoint (like "ALTER MARIA
ENGINE DO CHECKPOINT"), and probably by maria_panic(), and at the end of the
@@ -67,6 +69,7 @@ CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE;
*/
my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
{
+ my_bool result;
DBUG_ENTER("execute_synchronous_checkpoint");
DBUG_ASSERT(level > NONE);
@@ -76,43 +79,52 @@ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
wait_on_checkpoint_done_cond();
synchronous_checkpoint_in_progress= level;
- execute_checkpoint(level);
+ result= execute_checkpoint(level);
safemutex_assert_owner(log_mutex);
synchronous_checkpoint_in_progress= NONE;
unlock(log_mutex);
broadcast(checkpoint_done_cond);
+ DBUG_RETURN(result);
}
-/* Picks a checkpoint request, if there is one, and executes it */
+/*
+ If no checkpoint is running, and there is a pending asynchronous checkpoint
+ request, executes it.
+ Is safe if multiple threads call it, though in first version only one will.
+ It's intended to be used by a thread which regularly calls this function;
+  this is why, if there is a request, it does not wait in a loop for
+ synchronous checkpoints to be finished, but just exits (because the thread
+ may want to do something useful meanwhile (flushing dirty pages for example)
+ instead of waiting).
+*/
my_bool execute_asynchronous_checkpoint_if_any()
{
+ my_bool result;
CHECKPOINT_LEVEL level;
DBUG_ENTER("execute_asynchronous_checkpoint");
lock(log_mutex);
- if (likely(next_asynchronous_checkpoint_to_do == NONE))
+ if (likely((next_asynchronous_checkpoint_to_do == NONE) ||
+ (synchronous_checkpoint_in_progress != NONE)))
{
unlock(log_mutex);
DBUG_RETURN(FALSE);
}
- while (synchronous_checkpoint_in_progress)
- wait_on_checkpoint_done_cond();
-
-do_checkpoint:
level= next_asynchronous_checkpoint_to_do;
DBUG_ASSERT(level > NONE);
- execute_checkpoint(level);
+ result= execute_checkpoint(level);
safemutex_assert_owner(log_mutex);
- if (next_asynchronous_checkpoint_to_do > level)
- goto do_checkpoint; /* one more request was posted */
- else
+ /* If only one thread calls this function, "<" can never happen below */
+ if (next_asynchronous_checkpoint_to_do <= level)
{
- DBUG_ASSERT(next_asynchronous_checkpoint_to_do == level);
- next_asynchronous_checkpoint_to_do= NONE; /* all work done */
+ /* it's our request or weaker/equal ones, all work is done */
+ next_asynchronous_checkpoint_to_do= NONE;
}
+ /* otherwise if it is a stronger request, we'll deal with it at next call */
unlock(log_mutex);
broadcast(checkpoint_done_cond);
+ DBUG_RETURN(result);
}
@@ -123,17 +135,14 @@ do_checkpoint:
*/
my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
{
- LSN candidate_max_rec_lsn_at_last_checkpoint;
- /* to avoid { lock + no-op + unlock } in the common (==indirect) case */
- my_bool need_log_mutex;
-
DBUG_ENTER("execute_checkpoint");
safemutex_assert_owner(log_mutex);
- copy_of_max_rec_lsn_at_last_checkpoint= max_rec_lsn_at_last_checkpoint;
- if (unlikely(need_log_mutex= (level > INDIRECT)))
+ if (unlikely(level > INDIRECT))
{
+ LSN copy_of_max_rec_lsn_at_last_checkpoint=
+ max_rec_lsn_at_last_checkpoint;
/* much I/O work to do, release log mutex */
unlock(log_mutex);
@@ -149,51 +158,29 @@ my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
flush all pages which were already dirty at last checkpoint:
ensures that recovery will never start from before the next-to-last
checkpoint (two-checkpoint rule).
- It is max, not min as the WL says (TODO update WL).
*/
flush_all_LRD_to_lsn(copy_of_max_rec_lsn_at_last_checkpoint);
/* this will go full speed (normal scheduling, no sleep) */
break;
}
+ lock(log_mutex);
}
- candidate_max_rec_lsn_at_last_checkpoint= checkpoint_indirect(need_log_mutex);
-
- lock(log_mutex);
- /*
- this portion cannot be done as a hook in write_log_record() for the
- LOGREC_CHECKPOINT type because:
- - at that moment we still have not written to the control file so cannot
- mark the request as done; this could be solved by writing to the control
- file in the hook but that would be an I/O under the log's mutex, bad.
- - it would not be nice organisation of code (I tried it :).
- */
- if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE)
- {
- /* checkpoint succeeded */
- maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
- written_since_last_checkpoint= (my_off_t)0;
- DBUG_RETURN(FALSE);
- }
/*
- keep mutex locked because callers will want to clear mutex-protected
- status variables
+ keep mutex locked upon exit because callers will want to clear
+ mutex-protected status variables
*/
- DBUG_RETURN(TRUE);
+ DBUG_RETURN(execute_checkpoint_indirect());
}
/*
Does an indirect checkpoint (collects data from data structures, writes into
a checkpoint log record).
- Returns the largest LSN of the LRD when the checkpoint happened (this is a
- fuzzy definition), or LSN_IMPOSSIBLE on error. That LSN is used for the
- "two-checkpoint rule" (MEDIUM checkpoints).
+ Starts and ends while having log's mutex (released in the middle).
*/
-LSN checkpoint_indirect(my_bool need_log_mutex)
+my_bool execute_checkpoint_indirect()
{
- DBUG_ENTER("checkpoint_indirect");
-
int error= 0;
/* checkpoint record data: */
LSN checkpoint_start_lsn;
@@ -202,163 +189,198 @@ LSN checkpoint_indirect(my_bool need_log_mutex)
char *ptr;
LSN checkpoint_lsn;
LSN candidate_max_rec_lsn_at_last_checkpoint= 0;
- list_element *el; /* to scan lists */
- ulong stored_LRD_size= 0;
-
+ DBUG_ENTER("execute_checkpoint_indirect");
DBUG_ASSERT(sizeof(byte *) <= 8);
DBUG_ASSERT(sizeof(LSN) <= 8);
- if (need_log_mutex)
- lock(log_mutex); /* maybe this will clash with log_read_end_lsn() */
+ safemutex_assert_owner(log_mutex);
checkpoint_start_lsn= log_read_end_lsn();
+ if (LSN_IMPOSSIBLE == checkpoint_start_lsn) /* error */
+ DBUG_RETURN(TRUE);
unlock(log_mutex);
DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn));
/* STEP 1: fetch information about dirty pages */
-
- /*
- We lock the entire cache but will be quick, just reading/writing a few MBs
- of memory at most.
- */
- pagecache_pthread_mutex_lock(&pagecache->cache_lock);
-
- /*
- This is an over-estimation, as in theory blocks_changed may contain
- non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the
- checkpoint record; the true number of page-LRD-info we'll store into the
- record is stored_LRD_size.
- */
- string1.length= 8+8+(8+8)*pagecache->blocks_changed;
- if (NULL == (string1.str= my_malloc(string1.length)))
- goto err;
- ptr= string1.str;
- int8store(ptr, checkpoint_start_lsn);
- ptr+= 8+8; /* don't store stored_LRD_size now, wait */
- if (pagecache->blocks_changed > 0)
+ /* note: this piece will move into mysys/mf_pagecache.c */
{
+ ulong stored_LRD_size= 0;
+ /*
+ We lock the entire cache but will be quick, just reading/writing a few MBs
+ of memory at most.
+ When we enter here, we must be sure that no "first_in_switch" situation
+      is happening or will happen (either we have to get rid of
+      first_in_switch in the code, or first_in_switch has to increment a
+      "danger" counter for Checkpoint to know it has to wait).
+ */
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+
+ /*
+ This is an over-estimation, as in theory blocks_changed may contain
+ non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the
+ checkpoint record; the true number of page-LRD-info we'll store into the
+ record is stored_LRD_size.
+ */
/*
- There are different ways to scan the dirty blocks;
- flush_all_key_blocks() uses a loop over pagecache->used_last->next_used,
- and for each element of the loop, loops into
- pagecache->changed_blocks[FILE_HASH(file of the element)].
- This has the drawback that used_last includes non-dirty blocks, and it's
- two loops over many elements. Here we try something simpler.
- If there are no blocks in changed_blocks[file_hash], we should hit
- zeroes and skip them.
+ TODO: Ingo says blocks_changed is not a reliable number (see his
+ document); ask him.
*/
- uint file_hash;
- for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
+ string1.length= 8+8+(8+8+8)*pagecache->blocks_changed;
+ if (NULL == (string1.str= my_malloc(string1.length)))
+ goto err;
+ ptr= string1.str;
+ int8store(ptr, checkpoint_start_lsn);
+ ptr+= 8+8; /* don't store stored_LRD_size now, wait */
+ if (pagecache->blocks_changed > 0)
{
- PAGECACHE_BLOCK_LINK *block;
- for (block= pagecache->changed_blocks[file_hash] ;
- block;
- block= block->next_changed)
+ uint file_hash;
+ for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
{
- DBUG_ASSERT(block->hash_link != NULL);
- DBUG_ASSERT(block->status & BLOCK_CHANGED);
- if (block->type != PAGECACHE_LSN_PAGE)
+ PAGECACHE_BLOCK_LINK *block;
+ for (block= pagecache->changed_blocks[file_hash] ;
+ block;
+ block= block->next_changed)
{
- /* no need to store it in the checkpoint record */
- continue;
+ DBUG_ASSERT(block->hash_link != NULL);
+ DBUG_ASSERT(block->status & BLOCK_CHANGED);
+ if (block->type != PAGECACHE_LSN_PAGE)
+ {
+ continue; /* no need to store it in the checkpoint record */
+ }
+ /*
+ In the current pagecache, rec_lsn is not set correctly:
+ 1) it is set on pagecache_unlock(), too late (a page is dirty
+ (BLOCK_CHANGED) since the first pagecache_write()). It may however
+ be not too late, because until unlock(), the page's update is not
+ committed, so it's ok that REDOs for it be skipped at Recovery
+ (which is what happens with an unset rec_lsn). Note that this
+ relies on the assumption that a transaction never commits while
+ holding locks on pages.
+ 2) sometimes the unlocking can be an implicit action of
+ pagecache_write(), without any call to pagecache_unlock(), then
+ rec_lsn is not set. That one is a critical problem.
+ TODO: fix this when Monty has explained how he writes BLOB pages.
+ */
+ if (0 == block->rec_lsn)
+ abort(); /* always fail in all builds, in case it's problem 2) */
+
+ int8store(ptr, block->hash_link->file.file);
+ ptr+= 8;
+ int8store(ptr, block->hash_link->pageno);
+ ptr+= 8;
+ int8store(ptr, block->rec_lsn);
+ ptr+= 8;
+ stored_LRD_size++;
+ DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed);
+ set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint,
+ block->rec_lsn);
}
- /* Q: two "block"s cannot have the same "hash_link", right? */
- int8store(ptr, block->hash_link->pageno);
- ptr+= 8;
- /* I assume rec_lsn will be member of "block", not of "hash_link" */
- int8store(ptr, block->rec_lsn);
- ptr+= 8;
- stored_LRD_size++;
- set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint,
- block->rec_lsn);
}
- }
- pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
- DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed);
- int8store(string1.str+8, stored_LRD_size);
- string1.length= 8+8+(8+8)*stored_LRD_size;
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ int8store(string1.str+8, stored_LRD_size);
+ string1.length= 8+8+(8+8+8)*stored_LRD_size;
+ }
/* STEP 2: fetch information about transactions */
-
+ /* note: this piece will move into trnman.c */
/*
- If trx are in more than one list (e.g. three:
- running transactions, committed transactions, purge queue), we can either
- take mutexes of all three together or do crabbing.
- But if an element can move from list 1 to list 3 without passing through
- list 2, crabbing is dangerous.
- Hopefully it's ok to take 3 mutexes together...
- Otherwise I'll have to make sure I miss no important trx and I handle dups.
+ Transactions are in the "active list" (protected by a mutex) and in a
+ lock-free hash of "committed" (insertion protected by the same mutex,
+ deletion lock-free).
*/
- lock(global_transactions_list_mutex); /* or 3 mutexes if there are 3 */
- string2.length= 8+(8+8)*trx_list->count;
- if (NULL == (string2.str= my_malloc(string2.length)))
- goto err;
- ptr= string2.str;
- int8store(ptr, trx_list->count);
- ptr+= 8;
- for (el= trx_list->first; el; el= el->next)
{
- /* possibly latch el.rwlock */
- *ptr= el->state;
- ptr++;
- int7store(ptr, el->long_trans_id);
- ptr+= 7;
- int2store(ptr, el->short_trans_id);
- ptr+= 2;
- int8store(ptr, el->undo_lsn);
- ptr+= 8;
- int8store(ptr, el->undo_purge_lsn);
+ TRN *trn;
+ ulong stored_trn_size= 0;
+ /* First, the active transactions */
+ pthread_mutex_lock(LOCK_trn_list);
+ string2.length= 8+(7+2+8+8+8)*trnman_active_transactions;
+ if (NULL == (string2.str= my_malloc(string2.length)))
+ goto err;
+ ptr= string2.str;
ptr+= 8;
+ for (trn= active_list_min.next; trn != &active_list_max; trn= trn->next)
+ {
+ /* we would latch trn.rwlock if it existed */
+ if (0 == trn->short_trid) /* trn is not inited, skip */
+ continue;
+ /* state is not needed for now (only when we have prepared trx) */
+ /* int7store does not exist but mi_int7store does */
+ int7store(ptr, trn->trid);
+ ptr+= 7;
+ int2store(ptr, trn->short_trid);
+ ptr+= 2;
+ int8store(ptr, trn->undo_lsn); /* is an LSN 7 or 8 bytes really? */
+ ptr+= 8;
+ int8store(ptr, trn->undo_purge_lsn);
+ ptr+= 8;
+ int8store(ptr, read_non_atomic(&trn->first_undo_lsn));
+ ptr+= 8;
+ /* possibly unlatch el.rwlock */
+ stored_trn_size++;
+ }
+ pthread_mutex_unlock(LOCK_trn_list);
/*
- if no latch, use double variable of type ULONGLONG_CONSISTENT in
- st_transaction, or even no need if Intel >=486
+ Now the committed ones.
+ We need a function which scans the hash's list of elements in a
+ lock-free manner (a bit like lfind(), starting from bucket 0), and for
+ each node (committed transaction) stores the transaction's
+ information (trid, undo_purge_lsn, first_undo_lsn) into a buffer.
+ This big buffer is malloc'ed at the start, so the number of elements (or
+ an upper bound of it) found in the hash needs to be known in advance
+ (one solution is to keep LOCK_trn_list locked, ensuring that nodes are
+ only deleted).
*/
- int8store(ptr, el->first_undo_lsn);
- ptr+= 8;
- /* possibly unlatch el.rwlock */
+ /*
+ TODO: if we see there exists no transaction (active and committed) we can
+ tell the lock-free structures to do some freeing (my_free()).
+ */
+    int8store(string2.str, stored_trn_size);
+ string2.length= 8+(7+2+8+8+8)*stored_trn_size;
}
- unlock(global_transactions_list_mutex);
/* STEP 3: fetch information about table files */
- /* This global mutex is in fact THR_LOCK_maria (see ma_open()) */
- lock(global_share_list_mutex);
- string3.length= 8+(8+8)*share_list->count;
- if (NULL == (string3.str= my_malloc(string3.length)))
- goto err;
- ptr= string3.str;
- /* possibly latch each MARIA_SHARE, one by one, like this: */
- pthread_mutex_lock(&share->intern_lock);
- /*
- We'll copy the file id (a bit like share->kfile), the file name
- (like share->unique_file_name[_length]).
- */
- make_copy_of_global_share_list_to_array;
- pthread_mutex_unlock(&share->intern_lock);
- unlock(global_share_list_mutex);
-
- /* work on copy */
- int8store(ptr, elements_in_array);
- ptr+= 8;
- for (el in array)
{
- int8store(ptr, array[...].file_id);
- ptr+= 8;
- memcpy(ptr, array[...].file_name, ...);
- ptr+= ...;
+ /* This global mutex is in fact THR_LOCK_maria (see ma_open()) */
+ lock(global_share_list_mutex);
+ string3.length= 8+(8+8)*share_list->count;
+ if (NULL == (string3.str= my_malloc(string3.length)))
+ goto err;
+ ptr= string3.str;
+ /* possibly latch each MARIA_SHARE, one by one, like this: */
+ pthread_mutex_lock(&share->intern_lock);
/*
- these two are long ops (involving disk I/O) that's why we copied the
- list, to not keep the list locked for long:
+ We'll copy the file id (a bit like share->kfile), the file name
+ (like share->unique_file_name[_length]).
*/
- flush_bitmap_pages(el);
- /* TODO: and also autoinc counter, logical file end, free page list */
+ make_copy_of_global_share_list_to_array;
+ pthread_mutex_unlock(&share->intern_lock);
+ unlock(global_share_list_mutex);
- /*
- fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per
- second, so if you have touched 1000 files it's 7 seconds).
- */
- force_file(el);
+ /* work on copy */
+ int8store(ptr, elements_in_array);
+ ptr+= 8;
+ for (el in array)
+ {
+ int8store(ptr, array[...].file_id);
+ ptr+= 8;
+ memcpy(ptr, array[...].file_name, ...);
+ ptr+= ...;
+ /*
+ these two are long ops (involving disk I/O) that's why we copied the
+ list, to not keep the list locked for long:
+ */
+ /* TODO: what if the table pointer is gone/reused now? */
+ flush_bitmap_pages(el);
+ /* TODO: and also autoinc counter, logical file end, free page list */
+
+ /*
+ fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per
+ second, so if you have touched 1000 files it's 7 seconds).
+ */
+ force_file(el);
+ }
}
/* LAST STEP: now write the checkpoint log record */
@@ -389,11 +411,38 @@ err:
candidate_max_rec_lsn_at_last_checkpoint= LSN_IMPOSSIBLE;
end:
+
my_free(buffer1.str, MYF(MY_ALLOW_ZERO_PTR));
my_free(buffer2.str, MYF(MY_ALLOW_ZERO_PTR));
my_free(buffer3.str, MYF(MY_ALLOW_ZERO_PTR));
- DBUG_RETURN(candidate_max_rec_lsn_at_last_checkpoint);
+ /*
+ this portion cannot be done as a hook in write_log_record() for the
+ LOGREC_CHECKPOINT type because:
+ - at that moment we still have not written to the control file so cannot
+ mark the request as done; this could be solved by writing to the control
+ file in the hook but that would be an I/O under the log's mutex, bad.
+ - it would not be nice organisation of code (I tried it :).
+ */
+ if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE)
+ {
+ /* checkpoint succeeded */
+ /*
+ TODO: compute log's low water mark (how to do that with our fuzzy
+ ARIES-like reads of data structures? TODO think about it :).
+ */
+ lock(log_mutex);
+ /* That LSN is used for the "two-checkpoint rule" (MEDIUM checkpoints) */
+    max_rec_lsn_at_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
+ written_since_last_checkpoint= (my_off_t)0;
+ DBUG_RETURN(FALSE);
+ }
+ lock(log_mutex);
+ DBUG_RETURN(TRUE);
+ /*
+ keep mutex locked upon exit because callers will want to clear
+ mutex-protected status variables
+ */
}
@@ -433,7 +482,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
safemutex_assert_owner(log_mutex);
DBUG_ASSERT(level > NONE);
- if (checkpoint_request < level)
+ if (next_asynchronous_checkpoint_to_do < level)
{
/* no equal or stronger running or to run, we post request */
/*
@@ -445,7 +494,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
(see least_recently_dirtied.c) will notice our request in max a few
seconds.
*/
- checkpoint_request= level; /* post request */
+ next_asynchronous_checkpoint_to_do= level; /* post request */
}
/*
@@ -457,3 +506,37 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
the end user.
*/
}
+
+
+/*
+ If a 64-bit variable transitions from both halves being zero to both halves
+ being non-zero, and never changes after that (like the transaction's
+ first_undo_lsn), this function can be used to do a read of it (without
+ mutex, without atomic load) which always produces a correct (though maybe
+ slightly old) value (even on 32-bit CPUs).
+*/
+static inline ulonglong read_non_atomic(ulonglong volatile *x)
+{
+#if ( SIZEOF_CHARP >= 8 )
+ /* 64-bit CPU (right?), 64-bit reads are atomic */
+ return *x;
+#else
+ /*
+    32-bit CPU, 64-bit reads may give a mix of old half and new half (old
+ low bits and new high bits, or the contrary).
+ As the variable we read transitions from both halves being zero to both
+ halves being non-zero, and never changes then, we can detect atomicity
+ problems:
+ */
+ ulonglong y;
+ for (;;) /* loop until no atomicity problems */
+ {
+ y= *x;
+    if (likely((0 == y) ||
+               ((0 != (y >> 32)) && (0 != (y << 32)))))
+ return y;
+ /* Worth seeing it! */
+ DBUG_PRINT("info",("atomicity problem"));
+ }
+#endif
+}
diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c
index 5b940eaf4c3..73764cf444a 100644
--- a/storage/maria/ma_close.c
+++ b/storage/maria/ma_close.c
@@ -57,6 +57,12 @@ int maria_close(register MARIA_HA *info)
info->opt_flag&= ~(READ_CACHE_USED | WRITE_CACHE_USED);
}
flag= !--share->reopen;
+ /*
+ RECOVERYTODO:
+ Below we are going to make the table unknown to future checkpoints, so it
+ needs to have fsync'ed itself entirely (bitmap, pages, etc) at this
+ point.
+ */
maria_open_list=list_delete(maria_open_list,&info->open_list);
pthread_mutex_unlock(&share->intern_lock);
diff --git a/storage/maria/ma_least_recently_dirtied.c b/storage/maria/ma_least_recently_dirtied.c
index b0b7fb1ef10..809442b4e97 100644
--- a/storage/maria/ma_least_recently_dirtied.c
+++ b/storage/maria/ma_least_recently_dirtied.c
@@ -94,7 +94,10 @@ pthread_handler_decl background_flush_and_checkpoint_thread()
while (this_thread_not_killed)
{
    if (((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1)) == 0)
- execute_asynchronous_checkpoint_if_any();
+ {
+ /* note that we don't care of the checkpoint's success */
+ (void)execute_asynchronous_checkpoint_if_any();
+ }
lock(global_LRD_mutex);
flush_one_group_from_LRD();
safemutex_assert_not_owner(global_LRD_mutex);