diff options
author | unknown <guilhem@gbichot3.local> | 2006-12-18 17:24:02 +0100 |
---|---|---|
committer | unknown <guilhem@gbichot3.local> | 2006-12-18 17:24:02 +0100 |
commit | 7199c905590391f64802913369aab7d288eff4c8 (patch) | |
tree | d38393d634c6dc8b3886863dbdab2d348526eb45 | |
parent | 71b404973c1f6343e9e63d3179c65f3642aade9a (diff) | |
download | mariadb-git-7199c905590391f64802913369aab7d288eff4c8.tar.gz |
WL#3071 Maria checkpoint
- cleanups, simplifications
- moving the construction of the "dirty pages table" into the
pagecache where it belongs (because it's the pagecache which knows
dirty pages). TODO: do the same soon for the "transactions table".
- fix for a small bug in the pagecache (decrementing "changed_blocks")
include/pagecache.h:
prototype
mysys/mf_pagecache.c:
m_string.h moves up for LEX_STRING to be known for pagecache.h.
In pagecache_delete_page(), we must decrement "blocks_changed" even
if we just delete the page without flushing it.
A new function pagecache_collect_changed_blocks_with_LSN()
(used by the Checkpoint module), which stores information about the
changed blocks (a.k.a. "the dirty pages table") into a LEX_STRING.
This function is not tested now, it will be when there is a Checkpoint.
storage/maria/ma_checkpoint.c:
refining the checkpoint code: factoring functions, moving the
construction of the "dirty pages table" into mf_pagecache.c
(I'll do the same with the construction of the "transactions table"
once Serg tells me what's the best way to do it).
storage/maria/ma_least_recently_dirtied.c:
Simplifying the thread which does background flushing of
least-recently-dirtied pages:
- in the first version that thread will not flush, just do checkpoints
- in the 2nd version, flushing should re-use existing page cache functions
like flush_pagecache_blocks().
unittest/mysys/test_file.h:
m_string.h moves up for LEX_STRING to be known in pagecache.h
-rw-r--r-- | include/pagecache.h | 3 | ||||
-rwxr-xr-x | mysys/mf_pagecache.c | 180 | ||||
-rw-r--r-- | storage/maria/ma_checkpoint.c | 179 | ||||
-rw-r--r-- | storage/maria/ma_least_recently_dirtied.c | 182 | ||||
-rw-r--r-- | unittest/mysys/test_file.h | 2 |
5 files changed, 231 insertions, 315 deletions
diff --git a/include/pagecache.h b/include/pagecache.h index 4d64070ad62..9f215325ae5 100644 --- a/include/pagecache.h +++ b/include/pagecache.h @@ -221,6 +221,9 @@ extern my_bool pagecache_delete_page(PAGECACHE *pagecache, enum pagecache_page_lock lock, my_bool flush); extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup); +extern my_bool pagecache_collect_changed_blocks_with_LSN(PAGECACHE *pagecache, + LEX_STRING *str, + LSN *max_lsn); C_MODE_END #endif /* _keycache_h */ diff --git a/mysys/mf_pagecache.c b/mysys/mf_pagecache.c index 807a3ea520a..96c855fda0a 100755 --- a/mysys/mf_pagecache.c +++ b/mysys/mf_pagecache.c @@ -40,9 +40,9 @@ */ #include "mysys_priv.h" +#include <m_string.h> #include <pagecache.h> #include "my_static.h" -#include <m_string.h> #include <my_bit.h> #include <errno.h> #include <stdarg.h> @@ -295,7 +295,7 @@ struct st_pagecache_block_link enum pagecache_page_type type; /* type of the block */ uint hits_left; /* number of hits left until promotion */ ulonglong last_hit_time; /* timestamp of the last hit */ - ulonglong rec_lsn; /* LSN when first became dirty */ + LSN rec_lsn; /* LSN when first became dirty */ KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */ }; @@ -2988,33 +2988,35 @@ restart: goto restart; } - if (block->status & BLOCK_CHANGED && flush) + if (block->status & BLOCK_CHANGED) { - /* The block contains a dirty page - push it out of the cache */ - - KEYCACHE_DBUG_PRINT("find_key_block", ("block is dirty")); - - pagecache_pthread_mutex_unlock(&pagecache->cache_lock); - /* - The call is thread safe because only the current - thread might change the block->hash_link value - */ - DBUG_ASSERT(block->pins == 1); - error= pagecache_fwrite(pagecache, - &block->hash_link->file, - block->buffer, - block->hash_link->pageno, - block->type, - MYF(MY_NABP | MY_WAIT_IF_FULL)); - pagecache_pthread_mutex_lock(&pagecache->cache_lock); - pagecache->global_cache_write++; - - if (error) + if (flush) { - 
block->status|= BLOCK_ERROR; - goto err; + /* The block contains a dirty page - push it out of the cache */ + + KEYCACHE_DBUG_PRINT("find_key_block", ("block is dirty")); + + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + /* + The call is thread safe because only the current + thread might change the block->hash_link value + */ + DBUG_ASSERT(block->pins == 1); + error= pagecache_fwrite(pagecache, + &block->hash_link->file, + block->buffer, + block->hash_link->pageno, + block->type, + MYF(MY_NABP | MY_WAIT_IF_FULL)); + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + pagecache->global_cache_write++; + + if (error) + { + block->status|= BLOCK_ERROR; + goto err; + } } - pagecache->blocks_changed--; pagecache->global_blocks_changed--; /* @@ -3793,6 +3795,132 @@ int reset_key_cache_counters(const char *name, PAGECACHE *key_cache) } +/* + Allocates a buffer and stores in it some information about all dirty pages + of type PAGECACHE_LSN_PAGE. + + SYNOPSIS + pagecache_collect_changed_blocks_with_LSN() + pagecache pointer to the page cache + str (OUT) pointer to a LEX_STRING where the allocated buffer, and + its size, will be put + max_lsn (OUT) pointer to a LSN where the maximum rec_lsn of all + relevant dirty pages will be put + + DESCRIPTION + Does the allocation because the caller cannot know the size itself. + Memory freeing is done by the caller. + Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they + are not interesting for a checkpoint record. + The caller has the intention of doing checkpoints. + + RETURN + 0 on success + 1 on error +*/ +my_bool pagecache_collect_changed_blocks_with_LSN(PAGECACHE *pagecache, + LEX_STRING *str, + LSN *max_lsn) +{ + my_bool error; + ulong stored_LRD_size= 0; + uint file_hash; + char *ptr; + DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN"); + + *max_lsn= 0; + /* + We lock the entire cache but will be quick, just reading/writing a few MBs + of memory at most. 
+ When we enter here, we must be sure that no "first_in_switch" situation + is happening or will happen (either we have to get rid of + first_in_switch in the code or, first_in_switch has to increment a + "danger" counter for this function to know it has to wait). TODO. + */ + pagecache_pthread_mutex_lock(&pagecache->cache_lock); + + /* Count how many dirty pages are interesting */ + for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[file_hash] ; + block; + block= block->next_changed) + { + /* + Q: is there somthing subtle with block->hash_link: can it be NULL? + does it have to be == hash_link->block... ? + */ + DBUG_ASSERT(block->hash_link != NULL); + DBUG_ASSERT(block->status & BLOCK_CHANGED); + if (block->type != PAGECACHE_LSN_PAGE) + continue; /* no need to store it */ + /* + In the current pagecache, rec_lsn is not set correctly: + 1) it is set on pagecache_unlock(), too late (a page is dirty + (BLOCK_CHANGED) since the first pagecache_write()). So in this + scenario: + thread1: thread2: + write_REDO + pagecache_write() checkpoint : reclsn not known + pagecache_unlock(sets rec_lsn) + commit + crash, + at recovery we will wrongly skip the REDO. It also affects the + low-water mark's computation. + 2) sometimes the unlocking can be an implicit action of + pagecache_write(), without any call to pagecache_unlock(), then + rec_lsn is not set. + 1) and 2) are critical problems. + TODO: fix this when Monty has explained how he writes BLOB pages. 
+ */ + if (0 == block->rec_lsn) + { + DBUG_ASSERT(0); + goto err; + } + stored_LRD_size++; + } + } + + str->length= 8+(4+4+8)*stored_LRD_size; + if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME)))) + goto err; + ptr= str->str; + int8store(ptr, stored_LRD_size); + ptr+= 8; + if (0 == stored_LRD_size) + goto end; + for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++) + { + PAGECACHE_BLOCK_LINK *block; + for (block= pagecache->changed_blocks[file_hash] ; + block; + block= block->next_changed) + { + if (block->type != PAGECACHE_LSN_PAGE) + continue; /* no need to store it in the checkpoint record */ + DBUG_ASSERT((4 == sizeof(block->hash_link->file.file)) && + (4 == sizeof(block->hash_link->pageno))); + int4store(ptr, block->hash_link->file.file); + ptr+= 4; + int4store(ptr, block->hash_link->pageno); + ptr+= 4; + int8store(ptr, (ulonglong)block->rec_lsn); + ptr+= 8; + set_if_bigger(*max_lsn, block->rec_lsn); + } + } + error= 0; + goto end; +err: + error= 1; +end: + pagecache_pthread_mutex_unlock(&pagecache->cache_lock); + DBUG_RETURN(error); +} + + #ifndef DBUG_OFF /* Test if disk-cache is ok diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c index 608a6fb9fcd..a1d094d7da1 100644 --- a/storage/maria/ma_checkpoint.c +++ b/storage/maria/ma_checkpoint.c @@ -56,9 +56,9 @@ st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,.. MEDIUM checkpoint. 
*/ LSN max_rec_lsn_at_last_checkpoint= 0; -/* last submitted checkpoint request; cleared only when executed */ +/* last submitted checkpoint request; cleared when starts */ CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE; -CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE; +CHECKPOINT_LEVEL checkpoint_in_progress= NONE; static inline ulonglong read_non_atomic(ulonglong volatile *x); @@ -74,16 +74,10 @@ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level) DBUG_ASSERT(level > NONE); lock(log_mutex); - while ((synchronous_checkpoint_in_progress != NONE) || - (next_asynchronous_checkpoint_to_do != NONE)) + while (checkpoint_in_progress != NONE) wait_on_checkpoint_done_cond(); - synchronous_checkpoint_in_progress= level; result= execute_checkpoint(level); - safemutex_assert_owner(log_mutex); - synchronous_checkpoint_in_progress= NONE; - unlock(log_mutex); - broadcast(checkpoint_done_cond); DBUG_RETURN(result); } @@ -92,7 +86,7 @@ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level) request, executes it. Is safe if multiple threads call it, though in first version only one will. It's intended to be used by a thread which regularly calls this function; - this is why, if there is a request,it does not wait in a loop for + this is why, if there is a request, it does not wait in a loop for synchronous checkpoints to be finished, but just exits (because the thread may want to do something useful meanwhile (flushing dirty pages for example) instead of waiting). 
@@ -103,27 +97,20 @@ my_bool execute_asynchronous_checkpoint_if_any() CHECKPOINT_LEVEL level; DBUG_ENTER("execute_asynchronous_checkpoint"); + /* first check without mutex, ok to see old data */ + if (likely((next_asynchronous_checkpoint_to_do == NONE) || + (checkpoint_in_progress != NONE))) + DBUG_RETURN(FALSE); + lock(log_mutex); if (likely((next_asynchronous_checkpoint_to_do == NONE) || - (synchronous_checkpoint_in_progress != NONE))) + (checkpoint_in_progress != NONE))) { unlock(log_mutex); DBUG_RETURN(FALSE); } - level= next_asynchronous_checkpoint_to_do; - DBUG_ASSERT(level > NONE); - result= execute_checkpoint(level); - safemutex_assert_owner(log_mutex); - /* If only one thread calls this function, "<" can never happen below */ - if (next_asynchronous_checkpoint_to_do <= level) - { - /* it's our request or weaker/equal ones, all work is done */ - next_asynchronous_checkpoint_to_do= NONE; - } - /* otherwise if it is a stronger request, we'll deal with it at next call */ - unlock(log_mutex); - broadcast(checkpoint_done_cond); + result= execute_checkpoint(next_asynchronous_checkpoint_to_do); DBUG_RETURN(result); } @@ -135,9 +122,13 @@ my_bool execute_asynchronous_checkpoint_if_any() */ my_bool execute_checkpoint(CHECKPOINT_LEVEL level) { + my_bool result; DBUG_ENTER("execute_checkpoint"); safemutex_assert_owner(log_mutex); + if (next_asynchronous_checkpoint_to_do <= level) + next_asynchronous_checkpoint_to_do= NONE; + checkpoint_in_progress= level; if (unlikely(level > INDIRECT)) { @@ -166,11 +157,11 @@ my_bool execute_checkpoint(CHECKPOINT_LEVEL level) lock(log_mutex); } - /* - keep mutex locked upon exit because callers will want to clear - mutex-protected status variables - */ - DBUG_RETURN(execute_checkpoint_indirect()); + result= execute_checkpoint_indirect(); + checkpoint_in_progress= NONE; + unlock(log_mutex); + broadcast(checkpoint_done_cond); + DBUG_RETURN(result); } @@ -181,114 +172,37 @@ my_bool execute_checkpoint(CHECKPOINT_LEVEL level) */ my_bool 
execute_checkpoint_indirect() { - int error= 0; + int error= 0, i; /* checkpoint record data: */ LSN checkpoint_start_lsn; - LEX_STRING string1={0,0}, string2={0,0}, string3={0,0}; - LEX_STRING *string_array[4]; + char checkpoint_start_lsn_char[8]; + LEX_STRING strings[5]={ {&checkpoint_start_lsn_str, 8}, {0,0}, {0,0}, {0,0}, {0,0} }; char *ptr; LSN checkpoint_lsn; - LSN candidate_max_rec_lsn_at_last_checkpoint= 0; + LSN candidate_max_rec_lsn_at_last_checkpoint; DBUG_ENTER("execute_checkpoint_indirect"); DBUG_ASSERT(sizeof(byte *) <= 8); DBUG_ASSERT(sizeof(LSN) <= 8); safemutex_assert_owner(log_mutex); + + /* STEP 1: record current end-of-log LSN */ checkpoint_start_lsn= log_read_end_lsn(); if (LSN_IMPOSSIBLE == checkpoint_start_lsn) /* error */ DBUG_RETURN(TRUE); unlock(log_mutex); DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn)); + int8store(strings[0].str, checkpoint_start_lsn); - /* STEP 1: fetch information about dirty pages */ - /* note: this piece will move into mysys/mf_pagecache.c */ - { - ulong stored_LRD_size= 0; - /* - We lock the entire cache but will be quick, just reading/writing a few MBs - of memory at most. - When we enter here, we must be sure that no "first_in_switch" situation - is happening or will happen (either we have to get rid of - first_in_switch in the code or, first_in_switch has to increment a - "danger" counter for Checkpoint to know it has to wait. TODO. - */ - pagecache_pthread_mutex_lock(&pagecache->cache_lock); + /* STEP 2: fetch information about dirty pages */ - /* - This is an over-estimation, as in theory blocks_changed may contain - non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the - checkpoint record; the true number of page-LRD-info we'll store into the - record is stored_LRD_size. - */ - /* - TODO: Ingo says blocks_changed is not a reliable number (see his - document); ask him. 
- */ - string1.length= 8+8+(8+8+8)*pagecache->blocks_changed; - if (NULL == (string1.str= my_malloc(string1.length))) - goto err; - ptr= string1.str; - int8store(ptr, checkpoint_start_lsn); - ptr+= 8+8; /* don't store stored_LRD_size now, wait */ - if (pagecache->blocks_changed > 0) - { - uint file_hash; - for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++) - { - PAGECACHE_BLOCK_LINK *block; - for (block= pagecache->changed_blocks[file_hash] ; - block; - block= block->next_changed) - { - DBUG_ASSERT(block->hash_link != NULL); - DBUG_ASSERT(block->status & BLOCK_CHANGED); - if (block->type != PAGECACHE_LSN_PAGE) - { - continue; /* no need to store it in the checkpoint record */ - } - /* - In the current pagecache, rec_lsn is not set correctly: - 1) it is set on pagecache_unlock(), too late (a page is dirty - (BLOCK_CHANGED) since the first pagecache_write()). So in this - scenario: - thread1: thread2: - write_REDO - pagecache_write() - checkpoint : reclsn not known - pagecache_unlock(sets rec_lsn) - commit - crash, - at recovery we will wrongly skip the REDO. It also affects the - low-water mark's computation. - 2) sometimes the unlocking can be an implicit action of - pagecache_write(), without any call to pagecache_unlock(), then - rec_lsn is not set. - 1) and 2) are critical problems. - TODO: fix this when Monty has explained how he writes BLOB pages. 
- */ - if (0 == block->rec_lsn) - abort(); /* always fail in all builds */ - - int8store(ptr, block->hash_link->file.file); - ptr+= 8; - int8store(ptr, block->hash_link->pageno); - ptr+= 8; - int8store(ptr, block->rec_lsn); - ptr+= 8; - stored_LRD_size++; - DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed); - set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint, - block->rec_lsn); - } - } - pagecache_pthread_mutex_unlock(&pagecache->cache_lock); - int8store(string1.str+8, stored_LRD_size); - string1.length= 8+8+(8+8+8)*stored_LRD_size; - } + if (pagecache_collect_changed_blocks_with_LSN(pagecache, &strings[1], + &candidate_max_rec_lsn_at_last_checkpoint)) + goto err; - /* STEP 2: fetch information about transactions */ + /* STEP 3: fetch information about transactions */ /* note: this piece will move into trnman.c */ /* Transactions are in the "active list" (protected by a mutex) and in a @@ -345,7 +259,7 @@ my_bool execute_checkpoint_indirect() string2.length= 8+(7+2+8+8+8)*stored_trn_size; } - /* STEP 3: fetch information about table files */ + /* STEP 4: fetch information about table files */ { /* This global mutex is in fact THR_LOCK_maria (see ma_open()) */ @@ -391,13 +305,8 @@ my_bool execute_checkpoint_indirect() /* LAST STEP: now write the checkpoint log record */ - string_array[0]= string1; - string_array[1]= string2; - string_array[2]= string3; - string_array[3]= NULL; - checkpoint_lsn= log_write_record(LOGREC_CHECKPOINT, - &system_trans, string_array); + &system_trans, strings); /* Do nothing between the log write and the control file write, for the @@ -418,9 +327,8 @@ err: end: - my_free(buffer1.str, MYF(MY_ALLOW_ZERO_PTR)); - my_free(buffer2.str, MYF(MY_ALLOW_ZERO_PTR)); - my_free(buffer3.str, MYF(MY_ALLOW_ZERO_PTR)); + for (i= 1; i<5; i++) + my_free(strings[i], MYF(MY_ALLOW_ZERO_PTR)); /* this portion cannot be done as a hook in write_log_record() for the @@ -440,7 +348,6 @@ end: lock(log_mutex); /* That LSN is used for the "two-checkpoint 
rule" (MEDIUM checkpoints) */ maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint; - written_since_last_checkpoint= (my_off_t)0; DBUG_RETURN(FALSE); } lock(log_mutex); @@ -471,6 +378,8 @@ log_write_record(...) thread" WL#3261) to do a checkpoint */ request_asynchronous_checkpoint(INDIRECT); + /* prevent similar redundant requests */ + written_since_last_checkpoint= (my_off_t)0; } ...; unlock(log_mutex); @@ -488,16 +397,13 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level); safemutex_assert_owner(log_mutex); DBUG_ASSERT(level > NONE); - if (next_asynchronous_checkpoint_to_do < level) + if ((next_asynchronous_checkpoint_to_do < level) && + (checkpoint_in_progress < level)) { /* no equal or stronger running or to run, we post request */ /* - note that thousands of requests for checkpoints are going to come all - at the same time (when the log bound - MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS is passed), so it may not be a - good idea for each of them to broadcast a cond to wake up the background - checkpoint thread. We just don't broacast a cond, the checkpoint thread - (see least_recently_dirtied.c) will notice our request in max a few + We just don't broacast a cond, the checkpoint thread + (see ma_least_recently_dirtied.c) will notice our request in max a few seconds. */ next_asynchronous_checkpoint_to_do= level; /* post request */ @@ -520,6 +426,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level); first_undo_lsn), this function can be used to do a read of it (without mutex, without atomic load) which always produces a correct (though maybe slightly old) value (even on 32-bit CPUs). + The prototype will change with Sanja's new LSN type. 
*/ static inline ulonglong read_non_atomic(ulonglong volatile *x) { diff --git a/storage/maria/ma_least_recently_dirtied.c b/storage/maria/ma_least_recently_dirtied.c index 809442b4e97..170e59a601a 100644 --- a/storage/maria/ma_least_recently_dirtied.c +++ b/storage/maria/ma_least_recently_dirtied.c @@ -36,162 +36,57 @@ #include "least_recently_dirtied.h" /* - MikaelR suggested removing this global_LRD_mutex (I have a paper note of - comments), however at least for the first version we'll start with this - mutex (which will be a LOCK-based atomic_rwlock). -*/ -pthread_mutex_t global_LRD_mutex; - -/* - When we flush a page, we should pin page. - This "pin" is to protect against that: - I make copy, - you modify in memory and flush to disk and remove from LRD and from cache, - I write copy to disk, - checkpoint happens. - result: old page is on disk, page is absent from LRD, your REDO will be - wrongly ignored. - - Pin: there can be multiple pins, flushing imposes that there are zero pins. - For example, pin could be a uint counter protected by the page's latch. - - Maybe it's ok if when there is a page replacement, the replacer does not - remove page from the LRD (it would save global mutex); for that, background - flusher should be prepared to see pages in the LRD which are not in the page - cache (then just ignore them). However checkpoint will contain superfluous - entries and so do more work. -*/ - -#define PAGE_SIZE (16*1024) /* just as an example */ -/* - Optimization: - LRD flusher should not flush pages one by one: to be fast, it flushes a - group of pages in sequential disk order if possible; a group of pages is just - FLUSH_GROUP_SIZE pages. - Key cache has groupping already somehow Monty said (investigate that). -*/ -#define FLUSH_GROUP_SIZE 512 /* 8 MB */ -/* - We don't want to probe for checkpoint requests all the time (it takes - the log mutex). 
- If FLUSH_GROUP_SIZE is 8MB, assuming a local disk which can write 30MB/s - (1.8GB/min), probing every 16th call to flush_one_group_from_LRD() is every - 16*8=128MB which is every 128/30=4.2second. - Using a power of 2 gives a fast modulo operation. -*/ -#define CHECKPOINT_PROBING_PERIOD_LOG2 4 - -/* - This thread does background flush of pieces of the LRD, and all checkpoints. + This thread does background flush of pieces of the LRD, and serves + requests for asynchronous checkpoints. Just launch it when engine starts. MikaelR questioned why the same thread does two different jobs, the risk could be that while a checkpoint happens no LRD flushing happens. + For now, we only do checkpoints - no LRD flushing (to be done when the + second version of the page cache is ready WL#3077). + Reasons to delay: + - Recovery will work (just slower) + - new page cache may be different, why do then re-do + - current pagecache probably has issues with flushing when somebody is + writing to the table being flushed - better avoid that. */ pthread_handler_decl background_flush_and_checkpoint_thread() { - char *flush_group_buffer= my_malloc(PAGE_SIZE*FLUSH_GROUP_SIZE); - uint flush_calls= 0; while (this_thread_not_killed) { - if ((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1) == 0) - { - /* note that we don't care of the checkpoint's success */ - (void)execute_asynchronous_checkpoint_if_any(); - } - lock(global_LRD_mutex); - flush_one_group_from_LRD(); - safemutex_assert_not_owner(global_LRD_mutex); + /* note that we don't care of the checkpoint's success */ + (void)execute_asynchronous_checkpoint_if_any(); + sleep(5); /* - We are a background thread, leave time for client threads or we would - monopolize the disk: + in the final version, we will not sleep but call flush_pages_from_LRD() + repeatedly. If there are no dirty pages, we'll make sure to not have a + tight loop probing for checkpoint requests. 
*/ - pthread_yield(); } - my_free(flush_group_buffer); } +/* The rest of this file will not serve in first version */ + /* - flushes only the first FLUSH_GROUP_SIZE pages of the LRD. + flushes only the first pages of the LRD. + max_this_number could be FLUSH_CACHE (of mf_pagecache.c) for example. */ -flush_one_group_from_LRD() +flush_pages_from_LRD(uint max_this_number, LSN max_this_lsn) { - char *ptr; - safe_mutex_assert_owner(global_LRD_mutex); - - for (page= 0; page<FLUSH_GROUP_SIZE; page++) - { - copy_element_to_array; - } /* One rule to better observe is "page must be flushed to disk before it is removed from LRD" (otherwise checkpoint is incomplete info, corruption). */ - unlock(global_LRD_mutex); - /* page id is concatenation of "file id" and "number of page in file" */ - qsort(array, sizeof(*element), FLUSH_GROUP_SIZE, by_page_id); - for (scan_array) - { - if (page_cache_latch(page_id, READ) == PAGE_ABSENT) - { - /* - page disappeared since we made the copy (it was flushed to be - replaced), remove from array (memcpy tail of array over it)... - */ - continue; - } - memcpy(flush_group_buffer+..., page->data, PAGE_SIZE); - pin_page; - page_cache_unlatch(page_id, KEEP_PINNED); /* but keep pinned */ - } - for (scan_the_array) - { - /* - As an optimization, we try to identify contiguous-in-the-file segments (to - issue one big write()). - In non-optimized version, contiguous segment is always only one page. - */ - if ((next_page.page_id - this_page.page_id) == 1) - { - /* - this page and next page are in same file and are contiguous in the - file: add page to contiguous segment... - */ - continue; /* defer write() to next pages */ - } - /* contiguous segment ends */ - my_pwrite(file, contiguous_segment_start_offset, contiguous_segment_size); - /* - note that if we had doublewrite, doublewrite buffer may prevent us from - doing this write() grouping (if doublewrite space is shorter). - */ - } /* - Now remove pages from LRD. 
As we have pinned them, all pages that we - managed to pin are still in the LRD, in the same order, we can just cut - the LRD at the last element of "array". This is more efficient that - removing element by element (which would take LRD mutex many times) in the - loop above. + Build a list of pages to flush: + changed_blocks[i] is roughly sorted by descending rec_lsn, + so we could do a merge sort of changed_blocks[] lists, stopping after we + have the max_this_number first elements or after we have found a page with + rec_lsn > max_this_lsn. + Then do like pagecache_flush_blocks_int() does (beware! this time we are + not alone on the file! there may be dangers! TODO: sort this out). */ - lock(global_LRD_mutex); - /* cut LRD by bending LRD->first, free cut portion... */ - unlock(global_LRD_mutex); - for (scan_array) - { - /* - if the page has a property "modified since last flush" (i.e. which is - redundant with the presence of the page in the LRD, this property can - just be a pointer to the LRD element) we should reset it - (note that then the property would live slightly longer than - the presence in LRD). - */ - page_cache_unpin(page_id); - /* - order between unpin and removal from LRD is not clear, depends on what - pin actually is. - */ - } - free(array); + /* MikaelR noted that he observed that Linux's file cache may never fsync to disk until this cache is full, at which point it decides to empty the @@ -201,28 +96,11 @@ flush_one_group_from_LRD() } /* - Flushes all page from LRD up to approximately rec_lsn>=max_lsn. 
- This is approximate because we flush groups, and because the LRD list may + Note that when we flush all page from LRD up to rec_lsn>=max_lsn, + this is approximate because the LRD list may not be exactly sorted by rec_lsn (because for a big row, all pages of the row are inserted into the LRD with rec_lsn being the LSN of the REDO for the first page, so if there are concurrent insertions, the last page of the big row may have a smaller rec_lsn than the previous pages inserted by concurrent inserters). */ -int flush_all_LRD_to_lsn(LSN max_lsn) -{ - lock(global_LRD_mutex); - if (max_lsn == MAX_LSN) /* don't want to flush forever, so make it fixed: */ - max_lsn= LRD->first->prev->rec_lsn; - while (LRD->first->rec_lsn < max_lsn) - { - if (flush_one_group_from_LRD()) /* will unlock LRD mutex */ - return 1; - /* - The scheduler may preempt us here as we released the mutex; this is good. - */ - lock(global_LRD_mutex); - } - unlock(global_LRD_mutex); - return 0; -} diff --git a/unittest/mysys/test_file.h b/unittest/mysys/test_file.h index ea787c123ed..bfc660b13d0 100644 --- a/unittest/mysys/test_file.h +++ b/unittest/mysys/test_file.h @@ -1,4 +1,4 @@ - +#include <m_string.h> #include <pagecache.h> /* |