summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorunknown <guilhem@gbichot3.local>2006-12-18 17:24:02 +0100
committerunknown <guilhem@gbichot3.local>2006-12-18 17:24:02 +0100
commit7199c905590391f64802913369aab7d288eff4c8 (patch)
treed38393d634c6dc8b3886863dbdab2d348526eb45
parent71b404973c1f6343e9e63d3179c65f3642aade9a (diff)
downloadmariadb-git-7199c905590391f64802913369aab7d288eff4c8.tar.gz
WL#3071 Maria checkpoint
- cleanups, simplifications - moving the construction of the "dirty pages table" into the pagecache where it belongs (because it's the pagecache which knows dirty pages). TODO: do the same soon for the "transactions table". - fix for a small bug in the pagecache (decrementation of "changed_blocks") include/pagecache.h: prototype mysys/mf_pagecache.c: m_string.h moves up for LEX_STRING to be known for pagecache.h. In pagecache_delete_page(), we must decrement "blocks_changed" even if we just delete the page without flushing it. A new function pagecache_collect_changed_blocks_with_LSN() (used by the Checkpoint module), which stores information about the changed blocks (a.k.a. "the dirty pages table") into a LEX_STRING. This function is not tested now, it will be when there is a Checkpoint. storage/maria/ma_checkpoint.c: refining the checkpoint code: factoring functions, moving the construction of the "dirty pages table" into mf_pagecache.c (I'll do the same with the construction of the "transactions table" once Serg tells me what's the best way to do it). storage/maria/ma_least_recently_dirtied.c: Simplifying the thread which does background flushing of least-recently-dirtied pages: - in first version that thread will not flush, just do checkpoints - in 2nd version, flushing should re-use existing page cache functions like flush_pagecache_blocks(). unittest/mysys/test_file.h: m_string.h moves up for LEX_STRING to be known in pagecache.h
-rw-r--r--include/pagecache.h3
-rwxr-xr-xmysys/mf_pagecache.c180
-rw-r--r--storage/maria/ma_checkpoint.c179
-rw-r--r--storage/maria/ma_least_recently_dirtied.c182
-rw-r--r--unittest/mysys/test_file.h2
5 files changed, 231 insertions, 315 deletions
diff --git a/include/pagecache.h b/include/pagecache.h
index 4d64070ad62..9f215325ae5 100644
--- a/include/pagecache.h
+++ b/include/pagecache.h
@@ -221,6 +221,9 @@ extern my_bool pagecache_delete_page(PAGECACHE *pagecache,
enum pagecache_page_lock lock,
my_bool flush);
extern void end_pagecache(PAGECACHE *keycache, my_bool cleanup);
+extern my_bool pagecache_collect_changed_blocks_with_LSN(PAGECACHE *pagecache,
+ LEX_STRING *str,
+ LSN *max_lsn);
C_MODE_END
#endif /* _keycache_h */
diff --git a/mysys/mf_pagecache.c b/mysys/mf_pagecache.c
index 807a3ea520a..96c855fda0a 100755
--- a/mysys/mf_pagecache.c
+++ b/mysys/mf_pagecache.c
@@ -40,9 +40,9 @@
*/
#include "mysys_priv.h"
+#include <m_string.h>
#include <pagecache.h>
#include "my_static.h"
-#include <m_string.h>
#include <my_bit.h>
#include <errno.h>
#include <stdarg.h>
@@ -295,7 +295,7 @@ struct st_pagecache_block_link
enum pagecache_page_type type; /* type of the block */
uint hits_left; /* number of hits left until promotion */
ulonglong last_hit_time; /* timestamp of the last hit */
- ulonglong rec_lsn; /* LSN when first became dirty */
+ LSN rec_lsn; /* LSN when first became dirty */
KEYCACHE_CONDVAR *condvar; /* condition variable for 'no readers' event */
};
@@ -2988,33 +2988,35 @@ restart:
goto restart;
}
- if (block->status & BLOCK_CHANGED && flush)
+ if (block->status & BLOCK_CHANGED)
{
- /* The block contains a dirty page - push it out of the cache */
-
- KEYCACHE_DBUG_PRINT("find_key_block", ("block is dirty"));
-
- pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
- /*
- The call is thread safe because only the current
- thread might change the block->hash_link value
- */
- DBUG_ASSERT(block->pins == 1);
- error= pagecache_fwrite(pagecache,
- &block->hash_link->file,
- block->buffer,
- block->hash_link->pageno,
- block->type,
- MYF(MY_NABP | MY_WAIT_IF_FULL));
- pagecache_pthread_mutex_lock(&pagecache->cache_lock);
- pagecache->global_cache_write++;
-
- if (error)
+ if (flush)
{
- block->status|= BLOCK_ERROR;
- goto err;
+ /* The block contains a dirty page - push it out of the cache */
+
+ KEYCACHE_DBUG_PRINT("find_key_block", ("block is dirty"));
+
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ /*
+ The call is thread safe because only the current
+ thread might change the block->hash_link value
+ */
+ DBUG_ASSERT(block->pins == 1);
+ error= pagecache_fwrite(pagecache,
+ &block->hash_link->file,
+ block->buffer,
+ block->hash_link->pageno,
+ block->type,
+ MYF(MY_NABP | MY_WAIT_IF_FULL));
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ pagecache->global_cache_write++;
+
+ if (error)
+ {
+ block->status|= BLOCK_ERROR;
+ goto err;
+ }
}
-
pagecache->blocks_changed--;
pagecache->global_blocks_changed--;
/*
@@ -3793,6 +3795,132 @@ int reset_key_cache_counters(const char *name, PAGECACHE *key_cache)
}
+/*
+ Allocates a buffer and stores in it some information about all dirty pages
+ of type PAGECACHE_LSN_PAGE.
+
+ SYNOPSIS
+ pagecache_collect_changed_blocks_with_LSN()
+ pagecache pointer to the page cache
+ str (OUT) pointer to a LEX_STRING where the allocated buffer, and
+ its size, will be put
+ max_lsn (OUT) pointer to a LSN where the maximum rec_lsn of all
+ relevant dirty pages will be put
+
+ DESCRIPTION
+ Does the allocation because the caller cannot know the size itself.
+ Memory freeing is done by the caller.
+ Ignores all pages of another type than PAGECACHE_LSN_PAGE, because they
+ are not interesting for a checkpoint record.
+ The caller has the intention of doing checkpoints.
+
+ RETURN
+ 0 on success
+ 1 on error
+*/
+my_bool pagecache_collect_changed_blocks_with_LSN(PAGECACHE *pagecache,
+ LEX_STRING *str,
+ LSN *max_lsn)
+{
+ my_bool error;
+ ulong stored_LRD_size= 0;
+ uint file_hash;
+ char *ptr;
+ DBUG_ENTER("pagecache_collect_changed_blocks_with_LSN");
+
+ *max_lsn= 0;
+ /*
+ We lock the entire cache but will be quick, just reading/writing a few MBs
+ of memory at most.
+ When we enter here, we must be sure that no "first_in_switch" situation
+ is happening or will happen (either we have to get rid of
+ first_in_switch in the code or, first_in_switch has to increment a
+ "danger" counter for this function to know it has to wait). TODO.
+ */
+ pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+
+ /* Count how many dirty pages are interesting */
+ for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
+ {
+ PAGECACHE_BLOCK_LINK *block;
+ for (block= pagecache->changed_blocks[file_hash] ;
+ block;
+ block= block->next_changed)
+ {
+ /*
+      Q: is there something subtle with block->hash_link: can it be NULL?
+ does it have to be == hash_link->block... ?
+ */
+ DBUG_ASSERT(block->hash_link != NULL);
+ DBUG_ASSERT(block->status & BLOCK_CHANGED);
+ if (block->type != PAGECACHE_LSN_PAGE)
+ continue; /* no need to store it */
+ /*
+ In the current pagecache, rec_lsn is not set correctly:
+ 1) it is set on pagecache_unlock(), too late (a page is dirty
+ (BLOCK_CHANGED) since the first pagecache_write()). So in this
+ scenario:
+ thread1: thread2:
+ write_REDO
+ pagecache_write() checkpoint : reclsn not known
+ pagecache_unlock(sets rec_lsn)
+ commit
+ crash,
+ at recovery we will wrongly skip the REDO. It also affects the
+ low-water mark's computation.
+ 2) sometimes the unlocking can be an implicit action of
+ pagecache_write(), without any call to pagecache_unlock(), then
+ rec_lsn is not set.
+ 1) and 2) are critical problems.
+ TODO: fix this when Monty has explained how he writes BLOB pages.
+ */
+ if (0 == block->rec_lsn)
+ {
+ DBUG_ASSERT(0);
+ goto err;
+ }
+ stored_LRD_size++;
+ }
+ }
+
+ str->length= 8+(4+4+8)*stored_LRD_size;
+ if (NULL == (str->str= my_malloc(str->length, MYF(MY_WME))))
+ goto err;
+ ptr= str->str;
+ int8store(ptr, stored_LRD_size);
+ ptr+= 8;
+ if (0 == stored_LRD_size)
+ goto end;
+ for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
+ {
+ PAGECACHE_BLOCK_LINK *block;
+ for (block= pagecache->changed_blocks[file_hash] ;
+ block;
+ block= block->next_changed)
+ {
+ if (block->type != PAGECACHE_LSN_PAGE)
+ continue; /* no need to store it in the checkpoint record */
+ DBUG_ASSERT((4 == sizeof(block->hash_link->file.file)) &&
+ (4 == sizeof(block->hash_link->pageno)));
+ int4store(ptr, block->hash_link->file.file);
+ ptr+= 4;
+ int4store(ptr, block->hash_link->pageno);
+ ptr+= 4;
+ int8store(ptr, (ulonglong)block->rec_lsn);
+ ptr+= 8;
+ set_if_bigger(*max_lsn, block->rec_lsn);
+ }
+ }
+ error= 0;
+ goto end;
+err:
+ error= 1;
+end:
+ pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
+ DBUG_RETURN(error);
+}
+
+
#ifndef DBUG_OFF
/*
Test if disk-cache is ok
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c
index 608a6fb9fcd..a1d094d7da1 100644
--- a/storage/maria/ma_checkpoint.c
+++ b/storage/maria/ma_checkpoint.c
@@ -56,9 +56,9 @@ st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,..
MEDIUM checkpoint.
*/
LSN max_rec_lsn_at_last_checkpoint= 0;
-/* last submitted checkpoint request; cleared only when executed */
+/* last submitted checkpoint request; cleared when starts */
CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE;
-CHECKPOINT_LEVEL synchronous_checkpoint_in_progress= NONE;
+CHECKPOINT_LEVEL checkpoint_in_progress= NONE;
static inline ulonglong read_non_atomic(ulonglong volatile *x);
@@ -74,16 +74,10 @@ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
DBUG_ASSERT(level > NONE);
lock(log_mutex);
- while ((synchronous_checkpoint_in_progress != NONE) ||
- (next_asynchronous_checkpoint_to_do != NONE))
+ while (checkpoint_in_progress != NONE)
wait_on_checkpoint_done_cond();
- synchronous_checkpoint_in_progress= level;
result= execute_checkpoint(level);
- safemutex_assert_owner(log_mutex);
- synchronous_checkpoint_in_progress= NONE;
- unlock(log_mutex);
- broadcast(checkpoint_done_cond);
DBUG_RETURN(result);
}
@@ -92,7 +86,7 @@ my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
request, executes it.
Is safe if multiple threads call it, though in first version only one will.
It's intended to be used by a thread which regularly calls this function;
- this is why, if there is a request,it does not wait in a loop for
+ this is why, if there is a request, it does not wait in a loop for
synchronous checkpoints to be finished, but just exits (because the thread
may want to do something useful meanwhile (flushing dirty pages for example)
instead of waiting).
@@ -103,27 +97,20 @@ my_bool execute_asynchronous_checkpoint_if_any()
CHECKPOINT_LEVEL level;
DBUG_ENTER("execute_asynchronous_checkpoint");
+ /* first check without mutex, ok to see old data */
+ if (likely((next_asynchronous_checkpoint_to_do == NONE) ||
+ (checkpoint_in_progress != NONE)))
+ DBUG_RETURN(FALSE);
+
lock(log_mutex);
if (likely((next_asynchronous_checkpoint_to_do == NONE) ||
- (synchronous_checkpoint_in_progress != NONE)))
+ (checkpoint_in_progress != NONE)))
{
unlock(log_mutex);
DBUG_RETURN(FALSE);
}
- level= next_asynchronous_checkpoint_to_do;
- DBUG_ASSERT(level > NONE);
- result= execute_checkpoint(level);
- safemutex_assert_owner(log_mutex);
- /* If only one thread calls this function, "<" can never happen below */
- if (next_asynchronous_checkpoint_to_do <= level)
- {
- /* it's our request or weaker/equal ones, all work is done */
- next_asynchronous_checkpoint_to_do= NONE;
- }
- /* otherwise if it is a stronger request, we'll deal with it at next call */
- unlock(log_mutex);
- broadcast(checkpoint_done_cond);
+ result= execute_checkpoint(next_asynchronous_checkpoint_to_do);
DBUG_RETURN(result);
}
@@ -135,9 +122,13 @@ my_bool execute_asynchronous_checkpoint_if_any()
*/
my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
{
+ my_bool result;
DBUG_ENTER("execute_checkpoint");
safemutex_assert_owner(log_mutex);
+ if (next_asynchronous_checkpoint_to_do <= level)
+ next_asynchronous_checkpoint_to_do= NONE;
+ checkpoint_in_progress= level;
if (unlikely(level > INDIRECT))
{
@@ -166,11 +157,11 @@ my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
lock(log_mutex);
}
- /*
- keep mutex locked upon exit because callers will want to clear
- mutex-protected status variables
- */
- DBUG_RETURN(execute_checkpoint_indirect());
+ result= execute_checkpoint_indirect();
+ checkpoint_in_progress= NONE;
+ unlock(log_mutex);
+ broadcast(checkpoint_done_cond);
+ DBUG_RETURN(result);
}
@@ -181,114 +172,37 @@ my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
*/
my_bool execute_checkpoint_indirect()
{
- int error= 0;
+ int error= 0, i;
/* checkpoint record data: */
LSN checkpoint_start_lsn;
- LEX_STRING string1={0,0}, string2={0,0}, string3={0,0};
- LEX_STRING *string_array[4];
+ char checkpoint_start_lsn_char[8];
+ LEX_STRING strings[5]={ {&checkpoint_start_lsn_str, 8}, {0,0}, {0,0}, {0,0}, {0,0} };
char *ptr;
LSN checkpoint_lsn;
- LSN candidate_max_rec_lsn_at_last_checkpoint= 0;
+ LSN candidate_max_rec_lsn_at_last_checkpoint;
DBUG_ENTER("execute_checkpoint_indirect");
DBUG_ASSERT(sizeof(byte *) <= 8);
DBUG_ASSERT(sizeof(LSN) <= 8);
safemutex_assert_owner(log_mutex);
+
+ /* STEP 1: record current end-of-log LSN */
checkpoint_start_lsn= log_read_end_lsn();
if (LSN_IMPOSSIBLE == checkpoint_start_lsn) /* error */
DBUG_RETURN(TRUE);
unlock(log_mutex);
DBUG_PRINT("info",("checkpoint_start_lsn %lu", checkpoint_start_lsn));
+ int8store(strings[0].str, checkpoint_start_lsn);
- /* STEP 1: fetch information about dirty pages */
- /* note: this piece will move into mysys/mf_pagecache.c */
- {
- ulong stored_LRD_size= 0;
- /*
- We lock the entire cache but will be quick, just reading/writing a few MBs
- of memory at most.
- When we enter here, we must be sure that no "first_in_switch" situation
- is happening or will happen (either we have to get rid of
- first_in_switch in the code or, first_in_switch has to increment a
- "danger" counter for Checkpoint to know it has to wait. TODO.
- */
- pagecache_pthread_mutex_lock(&pagecache->cache_lock);
+ /* STEP 2: fetch information about dirty pages */
- /*
- This is an over-estimation, as in theory blocks_changed may contain
- non-PAGECACHE_LSN_PAGE pages, which we don't want to store into the
- checkpoint record; the true number of page-LRD-info we'll store into the
- record is stored_LRD_size.
- */
- /*
- TODO: Ingo says blocks_changed is not a reliable number (see his
- document); ask him.
- */
- string1.length= 8+8+(8+8+8)*pagecache->blocks_changed;
- if (NULL == (string1.str= my_malloc(string1.length)))
- goto err;
- ptr= string1.str;
- int8store(ptr, checkpoint_start_lsn);
- ptr+= 8+8; /* don't store stored_LRD_size now, wait */
- if (pagecache->blocks_changed > 0)
- {
- uint file_hash;
- for (file_hash= 0; file_hash < PAGECACHE_CHANGED_BLOCKS_HASH; file_hash++)
- {
- PAGECACHE_BLOCK_LINK *block;
- for (block= pagecache->changed_blocks[file_hash] ;
- block;
- block= block->next_changed)
- {
- DBUG_ASSERT(block->hash_link != NULL);
- DBUG_ASSERT(block->status & BLOCK_CHANGED);
- if (block->type != PAGECACHE_LSN_PAGE)
- {
- continue; /* no need to store it in the checkpoint record */
- }
- /*
- In the current pagecache, rec_lsn is not set correctly:
- 1) it is set on pagecache_unlock(), too late (a page is dirty
- (BLOCK_CHANGED) since the first pagecache_write()). So in this
- scenario:
- thread1: thread2:
- write_REDO
- pagecache_write()
- checkpoint : reclsn not known
- pagecache_unlock(sets rec_lsn)
- commit
- crash,
- at recovery we will wrongly skip the REDO. It also affects the
- low-water mark's computation.
- 2) sometimes the unlocking can be an implicit action of
- pagecache_write(), without any call to pagecache_unlock(), then
- rec_lsn is not set.
- 1) and 2) are critical problems.
- TODO: fix this when Monty has explained how he writes BLOB pages.
- */
- if (0 == block->rec_lsn)
- abort(); /* always fail in all builds */
-
- int8store(ptr, block->hash_link->file.file);
- ptr+= 8;
- int8store(ptr, block->hash_link->pageno);
- ptr+= 8;
- int8store(ptr, block->rec_lsn);
- ptr+= 8;
- stored_LRD_size++;
- DBUG_ASSERT(stored_LRD_size <= pagecache->blocks_changed);
- set_if_bigger(candidate_max_rec_lsn_at_last_checkpoint,
- block->rec_lsn);
- }
- }
- pagecache_pthread_mutex_unlock(&pagecache->cache_lock);
- int8store(string1.str+8, stored_LRD_size);
- string1.length= 8+8+(8+8+8)*stored_LRD_size;
- }
+ if (pagecache_collect_changed_blocks_with_LSN(pagecache, &strings[1],
+ &candidate_max_rec_lsn_at_last_checkpoint))
+ goto err;
- /* STEP 2: fetch information about transactions */
+ /* STEP 3: fetch information about transactions */
/* note: this piece will move into trnman.c */
/*
Transactions are in the "active list" (protected by a mutex) and in a
@@ -345,7 +259,7 @@ my_bool execute_checkpoint_indirect()
string2.length= 8+(7+2+8+8+8)*stored_trn_size;
}
- /* STEP 3: fetch information about table files */
+ /* STEP 4: fetch information about table files */
{
/* This global mutex is in fact THR_LOCK_maria (see ma_open()) */
@@ -391,13 +305,8 @@ my_bool execute_checkpoint_indirect()
/* LAST STEP: now write the checkpoint log record */
- string_array[0]= string1;
- string_array[1]= string2;
- string_array[2]= string3;
- string_array[3]= NULL;
-
checkpoint_lsn= log_write_record(LOGREC_CHECKPOINT,
- &system_trans, string_array);
+ &system_trans, strings);
/*
Do nothing between the log write and the control file write, for the
@@ -418,9 +327,8 @@ err:
end:
- my_free(buffer1.str, MYF(MY_ALLOW_ZERO_PTR));
- my_free(buffer2.str, MYF(MY_ALLOW_ZERO_PTR));
- my_free(buffer3.str, MYF(MY_ALLOW_ZERO_PTR));
+ for (i= 1; i<5; i++)
+ my_free(strings[i], MYF(MY_ALLOW_ZERO_PTR));
/*
this portion cannot be done as a hook in write_log_record() for the
@@ -440,7 +348,6 @@ end:
lock(log_mutex);
/* That LSN is used for the "two-checkpoint rule" (MEDIUM checkpoints) */
maximum_rec_lsn_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
- written_since_last_checkpoint= (my_off_t)0;
DBUG_RETURN(FALSE);
}
lock(log_mutex);
@@ -471,6 +378,8 @@ log_write_record(...)
thread" WL#3261) to do a checkpoint
*/
request_asynchronous_checkpoint(INDIRECT);
+ /* prevent similar redundant requests */
+ written_since_last_checkpoint= (my_off_t)0;
}
...;
unlock(log_mutex);
@@ -488,16 +397,13 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
safemutex_assert_owner(log_mutex);
DBUG_ASSERT(level > NONE);
- if (next_asynchronous_checkpoint_to_do < level)
+ if ((next_asynchronous_checkpoint_to_do < level) &&
+ (checkpoint_in_progress < level))
{
/* no equal or stronger running or to run, we post request */
/*
- note that thousands of requests for checkpoints are going to come all
- at the same time (when the log bound
- MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS is passed), so it may not be a
- good idea for each of them to broadcast a cond to wake up the background
- checkpoint thread. We just don't broacast a cond, the checkpoint thread
- (see least_recently_dirtied.c) will notice our request in max a few
+      We just don't broadcast a cond, the checkpoint thread
+ (see ma_least_recently_dirtied.c) will notice our request in max a few
seconds.
*/
next_asynchronous_checkpoint_to_do= level; /* post request */
@@ -520,6 +426,7 @@ void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level);
first_undo_lsn), this function can be used to do a read of it (without
mutex, without atomic load) which always produces a correct (though maybe
slightly old) value (even on 32-bit CPUs).
+ The prototype will change with Sanja's new LSN type.
*/
static inline ulonglong read_non_atomic(ulonglong volatile *x)
{
diff --git a/storage/maria/ma_least_recently_dirtied.c b/storage/maria/ma_least_recently_dirtied.c
index 809442b4e97..170e59a601a 100644
--- a/storage/maria/ma_least_recently_dirtied.c
+++ b/storage/maria/ma_least_recently_dirtied.c
@@ -36,162 +36,57 @@
#include "least_recently_dirtied.h"
/*
- MikaelR suggested removing this global_LRD_mutex (I have a paper note of
- comments), however at least for the first version we'll start with this
- mutex (which will be a LOCK-based atomic_rwlock).
-*/
-pthread_mutex_t global_LRD_mutex;
-
-/*
- When we flush a page, we should pin page.
- This "pin" is to protect against that:
- I make copy,
- you modify in memory and flush to disk and remove from LRD and from cache,
- I write copy to disk,
- checkpoint happens.
- result: old page is on disk, page is absent from LRD, your REDO will be
- wrongly ignored.
-
- Pin: there can be multiple pins, flushing imposes that there are zero pins.
- For example, pin could be a uint counter protected by the page's latch.
-
- Maybe it's ok if when there is a page replacement, the replacer does not
- remove page from the LRD (it would save global mutex); for that, background
- flusher should be prepared to see pages in the LRD which are not in the page
- cache (then just ignore them). However checkpoint will contain superfluous
- entries and so do more work.
-*/
-
-#define PAGE_SIZE (16*1024) /* just as an example */
-/*
- Optimization:
- LRD flusher should not flush pages one by one: to be fast, it flushes a
- group of pages in sequential disk order if possible; a group of pages is just
- FLUSH_GROUP_SIZE pages.
- Key cache has groupping already somehow Monty said (investigate that).
-*/
-#define FLUSH_GROUP_SIZE 512 /* 8 MB */
-/*
- We don't want to probe for checkpoint requests all the time (it takes
- the log mutex).
- If FLUSH_GROUP_SIZE is 8MB, assuming a local disk which can write 30MB/s
- (1.8GB/min), probing every 16th call to flush_one_group_from_LRD() is every
- 16*8=128MB which is every 128/30=4.2second.
- Using a power of 2 gives a fast modulo operation.
-*/
-#define CHECKPOINT_PROBING_PERIOD_LOG2 4
-
-/*
- This thread does background flush of pieces of the LRD, and all checkpoints.
+ This thread does background flush of pieces of the LRD, and serves
+ requests for asynchronous checkpoints.
Just launch it when engine starts.
MikaelR questioned why the same thread does two different jobs, the risk
could be that while a checkpoint happens no LRD flushing happens.
+ For now, we only do checkpoints - no LRD flushing (to be done when the
+ second version of the page cache is ready WL#3077).
+ Reasons to delay:
+ - Recovery will work (just slower)
+ - new page cache may be different, why do then re-do
+ - current pagecache probably has issues with flushing when somebody is
+ writing to the table being flushed - better avoid that.
*/
pthread_handler_decl background_flush_and_checkpoint_thread()
{
- char *flush_group_buffer= my_malloc(PAGE_SIZE*FLUSH_GROUP_SIZE);
- uint flush_calls= 0;
while (this_thread_not_killed)
{
- if ((flush_calls++) & ((2<<CHECKPOINT_PROBING_PERIOD_LOG2)-1) == 0)
- {
- /* note that we don't care of the checkpoint's success */
- (void)execute_asynchronous_checkpoint_if_any();
- }
- lock(global_LRD_mutex);
- flush_one_group_from_LRD();
- safemutex_assert_not_owner(global_LRD_mutex);
+ /* note that we don't care of the checkpoint's success */
+ (void)execute_asynchronous_checkpoint_if_any();
+ sleep(5);
/*
- We are a background thread, leave time for client threads or we would
- monopolize the disk:
+ in the final version, we will not sleep but call flush_pages_from_LRD()
+ repeatedly. If there are no dirty pages, we'll make sure to not have a
+ tight loop probing for checkpoint requests.
*/
- pthread_yield();
}
- my_free(flush_group_buffer);
}
+/* The rest of this file will not serve in first version */
+
/*
- flushes only the first FLUSH_GROUP_SIZE pages of the LRD.
+ flushes only the first pages of the LRD.
+ max_this_number could be FLUSH_CACHE (of mf_pagecache.c) for example.
*/
-flush_one_group_from_LRD()
+flush_pages_from_LRD(uint max_this_number, LSN max_this_lsn)
{
- char *ptr;
- safe_mutex_assert_owner(global_LRD_mutex);
-
- for (page= 0; page<FLUSH_GROUP_SIZE; page++)
- {
- copy_element_to_array;
- }
/*
One rule to better observe is "page must be flushed to disk before it is
removed from LRD" (otherwise checkpoint is incomplete info, corruption).
*/
- unlock(global_LRD_mutex);
- /* page id is concatenation of "file id" and "number of page in file" */
- qsort(array, sizeof(*element), FLUSH_GROUP_SIZE, by_page_id);
- for (scan_array)
- {
- if (page_cache_latch(page_id, READ) == PAGE_ABSENT)
- {
- /*
- page disappeared since we made the copy (it was flushed to be
- replaced), remove from array (memcpy tail of array over it)...
- */
- continue;
- }
- memcpy(flush_group_buffer+..., page->data, PAGE_SIZE);
- pin_page;
- page_cache_unlatch(page_id, KEEP_PINNED); /* but keep pinned */
- }
- for (scan_the_array)
- {
- /*
- As an optimization, we try to identify contiguous-in-the-file segments (to
- issue one big write()).
- In non-optimized version, contiguous segment is always only one page.
- */
- if ((next_page.page_id - this_page.page_id) == 1)
- {
- /*
- this page and next page are in same file and are contiguous in the
- file: add page to contiguous segment...
- */
- continue; /* defer write() to next pages */
- }
- /* contiguous segment ends */
- my_pwrite(file, contiguous_segment_start_offset, contiguous_segment_size);
- /*
- note that if we had doublewrite, doublewrite buffer may prevent us from
- doing this write() grouping (if doublewrite space is shorter).
- */
- }
/*
- Now remove pages from LRD. As we have pinned them, all pages that we
- managed to pin are still in the LRD, in the same order, we can just cut
- the LRD at the last element of "array". This is more efficient that
- removing element by element (which would take LRD mutex many times) in the
- loop above.
+ Build a list of pages to flush:
+ changed_blocks[i] is roughly sorted by descending rec_lsn,
+ so we could do a merge sort of changed_blocks[] lists, stopping after we
+ have the max_this_number first elements or after we have found a page with
+ rec_lsn > max_this_lsn.
+ Then do like pagecache_flush_blocks_int() does (beware! this time we are
+ not alone on the file! there may be dangers! TODO: sort this out).
*/
- lock(global_LRD_mutex);
- /* cut LRD by bending LRD->first, free cut portion... */
- unlock(global_LRD_mutex);
- for (scan_array)
- {
- /*
- if the page has a property "modified since last flush" (i.e. which is
- redundant with the presence of the page in the LRD, this property can
- just be a pointer to the LRD element) we should reset it
- (note that then the property would live slightly longer than
- the presence in LRD).
- */
- page_cache_unpin(page_id);
- /*
- order between unpin and removal from LRD is not clear, depends on what
- pin actually is.
- */
- }
- free(array);
+
/*
MikaelR noted that he observed that Linux's file cache may never fsync to
disk until this cache is full, at which point it decides to empty the
@@ -201,28 +96,11 @@ flush_one_group_from_LRD()
}
/*
- Flushes all page from LRD up to approximately rec_lsn>=max_lsn.
- This is approximate because we flush groups, and because the LRD list may
+ Note that when we flush all page from LRD up to rec_lsn>=max_lsn,
+ this is approximate because the LRD list may
not be exactly sorted by rec_lsn (because for a big row, all pages of the
row are inserted into the LRD with rec_lsn being the LSN of the REDO for the
first page, so if there are concurrent insertions, the last page of the big
row may have a smaller rec_lsn than the previous pages inserted by
concurrent inserters).
*/
-int flush_all_LRD_to_lsn(LSN max_lsn)
-{
- lock(global_LRD_mutex);
- if (max_lsn == MAX_LSN) /* don't want to flush forever, so make it fixed: */
- max_lsn= LRD->first->prev->rec_lsn;
- while (LRD->first->rec_lsn < max_lsn)
- {
- if (flush_one_group_from_LRD()) /* will unlock LRD mutex */
- return 1;
- /*
- The scheduler may preempt us here as we released the mutex; this is good.
- */
- lock(global_LRD_mutex);
- }
- unlock(global_LRD_mutex);
- return 0;
-}
diff --git a/unittest/mysys/test_file.h b/unittest/mysys/test_file.h
index ea787c123ed..bfc660b13d0 100644
--- a/unittest/mysys/test_file.h
+++ b/unittest/mysys/test_file.h
@@ -1,4 +1,4 @@
-
+#include <m_string.h>
#include <pagecache.h>
/*