Merge gbichot3.local:/home/mysql_src/mysql-5.1-for-maria

into gbichot3.local:/home/mysql_src/mysql-maria BitKeeper/etc/ignore: auto-union BUILD/SETUP.sh: Auto merged client/mysqldump.c: Auto merged config/ac-macros/plugins.m4: Auto merged configure.in: Auto merged include/Makefile.am: Auto merged include/atomic/nolock.h: Auto merged include/atomic/rwlock.h: Auto merged include/atomic/x86-gcc.h: Auto merged include/atomic/x86-msvc.h: Auto merged include/ft_global.h: Auto merged include/keycache.h: Auto merged include/m_string.h: Auto merged include/my_atomic.h: Auto merged include/my_base.h: Auto merged include/my_dbug.h: Auto merged include/my_global.h: Auto merged include/my_handler.h: Auto merged include/my_sys.h: Auto merged include/myisam.h: Auto merged libmysql/CMakeLists.txt: Auto merged libmysqld/Makefile.am: Auto merged mysql-test/mysql-test-run.pl: Auto merged mysql-test/r/events_logs_tests.result: Auto merged mysql-test/t/events_logs_tests.test: Auto merged mysys/Makefile.am: Auto merged mysys/array.c: Auto merged mysys/mf_keycache.c: Auto merged mysys/mf_keycaches.c: Auto merged mysys/my_atomic.c: Auto merged mysys/my_bit.c: Auto merged mysys/my_bitmap.c: Auto merged mysys/my_create.c: Auto merged mysys/my_delete.c: Auto merged mysys/my_getsystime.c: Auto merged mysys/my_handler.c: Auto merged mysys/my_init.c: Auto merged mysys/my_open.c: Auto merged mysys/my_pread.c: Auto merged mysys/my_rename.c: Auto merged mysys/my_symlink.c: Auto merged mysys/my_sync.c: Auto merged plugin/daemon_example/daemon_example.cc: Auto merged sql/Makefile.am: Auto merged sql/filesort.cc: Auto merged sql/gen_lex_hash.cc: Auto merged sql/ha_ndbcluster.cc: Auto merged sql/handler.h: Auto merged sql/item_func.cc: Auto merged sql/item_func.h: Auto merged sql/log.cc: Auto merged sql/mysql_priv.h: Auto merged sql/set_var.h: Auto merged sql/sql_class.h: Auto merged sql/sql_parse.cc: Auto merged sql/sql_select.cc: Auto merged sql/sql_sort.h: Auto merged sql/sql_test.cc: Auto merged sql/uniques.cc: Auto merged sql/unireg.cc: Auto merged 
storage/Makefile.am: Auto merged storage/csv/ha_tina.cc: Auto merged storage/myisam/Makefile.am: Auto merged storage/myisam/ft_boolean_search.c: Auto merged storage/myisam/ft_nlq_search.c: Auto merged storage/myisam/ft_parser.c: Auto merged storage/myisam/ft_static.c: Auto merged storage/myisam/ft_stopwords.c: Auto merged storage/myisam/ft_update.c: Auto merged storage/myisam/fulltext.h: Auto merged storage/myisam/ha_myisam.h: Auto merged storage/myisam/mi_check.c: Auto merged storage/myisam/mi_create.c: Auto merged storage/myisam/mi_delete.c: Auto merged storage/myisam/mi_delete_all.c: Auto merged storage/myisam/mi_dynrec.c: Auto merged storage/myisam/mi_key.c: Auto merged storage/myisam/mi_log.c: Auto merged storage/myisam/mi_open.c: Auto merged storage/myisam/mi_packrec.c: Auto merged storage/myisam/mi_range.c: Auto merged storage/myisam/mi_rsamepos.c: Auto merged storage/myisam/mi_search.c: Auto merged storage/myisam/mi_test1.c: Auto merged storage/myisam/mi_test2.c: Auto merged storage/myisam/mi_unique.c: Auto merged storage/myisam/mi_update.c: Auto merged storage/myisam/mi_write.c: Auto merged storage/myisam/myisamchk.c: Auto merged storage/myisam/myisamlog.c: Auto merged storage/myisam/myisampack.c: Auto merged storage/myisam/rt_index.c: Auto merged storage/myisam/sort.c: Auto merged storage/myisammrg/ha_myisammrg.h: Auto merged unittest/mytap/tap.c: Auto merged mysql-test/r/view.result: manual merge mysql-test/t/view.test: manual merge Makefile.am: manual merge mysql-test/t/disabled.def: manual merge sql/mysqld.cc: manual merge sql/set_var.cc: manual merge sql/udf_example.c: manual merge storage/myisam/ha_myisam.cc: manual merge storage/myisam/myisamdef.h: manual merge storage/ndb/src/mgmapi/mgmapi.cpp: manual merge unittest/Makefile.am: manual merge unittest/mysys/Makefile.am: manual merge unittest/mysys/my_atomic-t.c: manual merge
author: unknown <guilhem@gbichot3.local> 2007-02-28 17:50:51 +0100
committer: unknown <guilhem@gbichot3.local> 2007-02-28 17:50:51 +0100
commit: ea57b3d4a066a5507a7e322b53e3acab24a2855e (patch)
tree: a0703039f00da454a2a91c8b14835d45a6146209 /storage/maria/ma_checkpoint.c
parent: ae72e394502e13b854b6e9bb00889fa6b69a7ed9 (diff)
parent: fdf847fb62a0fcdf0edf25d6c8654b19eaa9a9ad (diff)
download: mariadb-git-ea57b3d4a066a5507a7e322b53e3acab24a2855e.tar.gz
1 files changed, 429 insertions, 0 deletions
diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c
new file mode 100644
index 00000000000..02d887f758a
--- /dev/null
+++ b/storage/maria/ma_checkpoint.c
@@ -0,0 +1,429 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA */
+
+/*
+  WL#3071 Maria checkpoint
+  First version written by Guilhem Bichot on 2006-04-27.
+  Does not compile yet.
+*/
+
+/* Here is the implementation of this module */
+
+/*
+  Summary:
+  - there are asynchronous checkpoints (a writer to the log notices that it's
+  been a long time since we last checkpoint-ed, so posts a request for a
+  background thread to do a checkpoint; does not care about the success of the
+  checkpoint). Then the checkpoint is done by the checkpoint thread, at an
+  unspecified moment ("later") (==soon, of course).
+  - there are synchronous checkpoints: a thread requests a checkpoint to
+  happen now and wants to know when it finishes and if it succeeded; then the
+  checkpoint is done by that same thread.
+*/
+
+#include "page_cache.h"
+#include "least_recently_dirtied.h"
+#include "transaction.h"
+#include "share.h"
+#include "log.h"
+
+/*
+  Sentinel LSN values used in this file.
+  LSN_IMPOSSIBLE is compared against the values returned by
+  log_read_end_lsn() and log_write_record() to detect an error, and marks a
+  failed checkpoint inside execute_checkpoint_indirect().
+  LSN_MAX is passed to flush_all_LRD_to_lsn() by a FULL checkpoint.
+*/
+#define LSN_IMPOSSIBLE ((LSN)0) /* could also be called LSN_ERROR */
+#define LSN_MAX ((LSN)ULONGLONG_MAX)
+
+/*
+  This transaction is used for any system work (purge, checkpoint writing
+  etc), that is, by background threads. It will not be declared/initialized
+  here in the final version.
+*/
+st_transaction system_trans= {0 /* long trans id */, 0 /* short trans id */,0,...};
+
+/* The three variables below are protected by the log's mutex. */
+/*
+  The maximum rec_lsn in the LRD when the last checkpoint was run; serves for
+  the MEDIUM checkpoint.
+*/
+LSN max_rec_lsn_at_last_checkpoint= 0;
+/*
+  Last submitted asynchronous checkpoint request; reset to NONE when a
+  checkpoint of at least that level starts.
+*/
+CHECKPOINT_LEVEL next_asynchronous_checkpoint_to_do= NONE;
+/* Level of the checkpoint currently running, NONE if no checkpoint runs. */
+CHECKPOINT_LEVEL checkpoint_in_progress= NONE;
+
+/* Forward declaration; the definition is at the end of this file. */
+static inline ulonglong read_non_atomic(ulonglong volatile *x);
+
+/*
+  Runs a checkpoint of the requested level in the calling thread and reports
+  its outcome.
+
+  Intended callers: MySQL client threads asking for a checkpoint (like "ALTER
+  MARIA ENGINE DO CHECKPOINT"), probably maria_panic(), and the end of the
+  UNDO recovery phase.
+
+  Takes the log's mutex, waits until no other checkpoint is in progress, then
+  hands over to execute_checkpoint(), which releases the mutex.
+
+  RETURN
+    value returned by execute_checkpoint()
+*/
+my_bool execute_synchronous_checkpoint(CHECKPOINT_LEVEL level)
+{
+  my_bool checkpoint_failed;
+  DBUG_ENTER("execute_synchronous_checkpoint");
+  DBUG_ASSERT(level > NONE);
+
+  lock(log_mutex);
+  /* let any checkpoint which is already running finish first */
+  for (;;)
+  {
+    if (NONE == checkpoint_in_progress)
+      break;
+    wait_on_checkpoint_done_cond();
+  }
+
+  checkpoint_failed= execute_checkpoint(level);
+  DBUG_RETURN(checkpoint_failed);
+}
+
+/*
+  If no checkpoint is running, and there is a pending asynchronous checkpoint
+  request, executes it.
+  Is safe if multiple threads call it, though in first version only one will.
+  It's intended to be used by a thread which regularly calls this function;
+  this is why, if there is a request, it does not wait in a loop for
+  synchronous checkpoints to be finished, but just exits (because the thread
+  may want to do something useful meanwhile (flushing dirty pages for example)
+  instead of waiting).
+
+  The log's mutex is not held on return: it is either never taken (first,
+  unlocked test), released here (second test), or released by
+  execute_checkpoint().
+
+  RETURN
+    FALSE  nothing to do, or a checkpoint is already in progress
+    other  value returned by execute_checkpoint()
+*/
+my_bool execute_asynchronous_checkpoint_if_any()
+{
+  my_bool result;
+  /* the tag must match the function name for the debug trace to be usable */
+  DBUG_ENTER("execute_asynchronous_checkpoint_if_any");
+
+  /* first check without mutex, ok to see old data */
+  if (likely((next_asynchronous_checkpoint_to_do == NONE) ||
+             (checkpoint_in_progress != NONE)))
+    DBUG_RETURN(FALSE);
+
+  /* re-check under the mutex, the unlocked test above may have raced */
+  lock(log_mutex);
+  if (likely((next_asynchronous_checkpoint_to_do == NONE) ||
+             (checkpoint_in_progress != NONE)))
+  {
+    unlock(log_mutex);
+    DBUG_RETURN(FALSE);
+  }
+
+  /* the requested level is passed by value, read while holding the mutex */
+  result= execute_checkpoint(next_asynchronous_checkpoint_to_do);
+  DBUG_RETURN(result);
+}
+
+
+/*
+  Does the actual checkpointing. Called by
+  execute_synchronous_checkpoint() and
+  execute_asynchronous_checkpoint_if_any().
+
+  Must be entered with the log's mutex held; the mutex is released before
+  returning and checkpoint_done_cond is broadcast.
+
+  RETURN
+    value returned by execute_checkpoint_indirect()
+*/
+my_bool execute_checkpoint(CHECKPOINT_LEVEL level)
+{
+  my_bool checkpoint_failed;
+  DBUG_ENTER("execute_checkpoint");
+
+  safemutex_assert_owner(log_mutex);
+  /* a pending request of the same or a weaker level is served by this run */
+  if (next_asynchronous_checkpoint_to_do <= level)
+    next_asynchronous_checkpoint_to_do= NONE;
+  checkpoint_in_progress= level;
+
+  if (unlikely(level > INDIRECT))
+  {
+    /* read under the mutex, used after the mutex is released */
+    LSN medium_flush_limit= max_rec_lsn_at_last_checkpoint;
+
+    /* much I/O work to do, release log mutex */
+    unlock(log_mutex);
+
+    /* both flushes go full speed (normal scheduling, no sleep) */
+    if (FULL == level)
+    {
+      /* flush all pages up to the current end of the LRD */
+      flush_all_LRD_to_lsn(LSN_MAX);
+    }
+    else if (MEDIUM == level)
+    {
+      /*
+        flush all pages which were already dirty at last checkpoint:
+        ensures that recovery will never start from before the next-to-last
+        checkpoint (two-checkpoint rule).
+      */
+      flush_all_LRD_to_lsn(medium_flush_limit);
+    }
+
+    lock(log_mutex);
+  }
+
+  checkpoint_failed= execute_checkpoint_indirect();
+  checkpoint_in_progress= NONE;
+  unlock(log_mutex);
+  /* wake up threads waiting in execute_synchronous_checkpoint() */
+  broadcast(checkpoint_done_cond);
+  DBUG_RETURN(checkpoint_failed);
+}
+
+
+/*
+  Does an indirect checkpoint (collects data from data structures, writes into
+  a checkpoint log record).
+
+  Must be entered with the log's mutex held. The mutex is released once the
+  end-of-log LSN has been read and is taken again before returning, so it is
+  held on exit, on success and on failure alike: callers will want to clear
+  mutex-protected status variables.
+
+  RETURN
+    FALSE  checkpoint record and control file were written
+    TRUE   error
+*/
+my_bool execute_checkpoint_indirect()
+{
+  int i;
+  /* checkpoint record data: */
+  LSN checkpoint_start_lsn;
+  char checkpoint_start_lsn_char[8];
+  /* strings[0] is the start LSN; strings[1..5] are allocated while collecting */
+  LEX_STRING strings[6]=
+    { {checkpoint_start_lsn_char, 8}, {0,0}, {0,0}, {0,0}, {0,0}, {0,0} };
+  char *ptr;
+  LSN checkpoint_lsn;
+  LSN candidate_max_rec_lsn_at_last_checkpoint;
+  DBUG_ENTER("execute_checkpoint_indirect");
+
+  DBUG_ASSERT(sizeof(byte *) <= 8);
+  DBUG_ASSERT(sizeof(LSN) <= 8);
+
+  safemutex_assert_owner(log_mutex);
+
+  /* STEP 1: record current end-of-log LSN */
+  checkpoint_start_lsn= log_read_end_lsn();
+  if (LSN_IMPOSSIBLE == checkpoint_start_lsn) /* error */
+    DBUG_RETURN(TRUE); /* mutex was never released, so it is still held */
+  unlock(log_mutex);
+
+  /* the LSN is stored in 8 bytes, so print it as a 64-bit value */
+  DBUG_PRINT("info",("checkpoint_start_lsn %llu",
+                     (ulonglong) checkpoint_start_lsn));
+  int8store(strings[0].str, checkpoint_start_lsn);
+
+  /* STEP 2: fetch information about dirty pages */
+
+  if (pagecache_collect_changed_blocks_with_LSN(pagecache, &strings[1],
+                                                &candidate_max_rec_lsn_at_last_checkpoint))
+    goto err;
+
+  /* STEP 3: fetch information about transactions */
+  if (trnman_collect_transactions(&strings[2], &strings[3]))
+    goto err;
+
+  /* STEP 4: fetch information about table files */
+
+  {
+    /* This global mutex is in fact THR_LOCK_maria (see ma_open()) */
+    lock(global_share_list_mutex);
+    strings[4].length= 8+(8+8)*share_list->count;
+    if (NULL == (strings[4].str= my_malloc(strings[4].length)))
+    {
+      /* do not jump to the error exit with the share list still locked */
+      unlock(global_share_list_mutex);
+      goto err;
+    }
+    ptr= strings[4].str;
+    /*
+      Note that maria_open_list is a list of MARIA_HA*, while we would prefer
+      a list of MARIA_SHARE* here (we are interested in the short id,
+      unique file name, members of MARIA_SHARE*, and in file descriptors,
+      which will in the end be in MARIA_SHARE*).
+    */
+    for (iterate on the maria_open_list)
+    {
+      /* latch each MARIA_SHARE, one by one, like this: */
+      pthread_mutex_lock(&share->intern_lock);
+      /*
+        TODO:
+        we need to prevent the share from going away while we later flush and
+        force it without holding THR_LOCK_maria. For example if the share is
+        free()d by maria_close() we'll have a problem. Or if the share's file
+        descriptor is closed by maria_close() we will not be able to my_sync()
+        it.
+      */
+      pthread_mutex_unlock(&share->intern_lock);
+      store the share pointer into a private array;
+    }
+    unlock(global_share_list_mutex);
+
+    /* work on copy */
+    int8store(ptr, elements_in_array);
+    ptr+= 8;
+    for (el in array)
+    {
+      int8store(ptr, array[...].short_id);
+      ptr+= 8;
+      memcpy(ptr, array[...].unique_file_name[_length], ...);
+      ptr+= ...;
+      /* maybe we need to lock share->intern_lock here */
+      /*
+        these two are long ops (involving disk I/O) that's why we copied the
+        list, to not keep the list locked for long:
+      */
+      flush_bitmap_pages(el);
+      /* TODO: and also autoinc counter, logical file end, free page list */
+
+      /*
+        fsyncs the fd, that's the loooong operation (e.g. max 150 fsync per
+        second, so if you have touched 1000 files it's 7 seconds).
+      */
+      force_file(el);
+    }
+  }
+
+  /* LAST STEP: now write the checkpoint log record */
+
+  checkpoint_lsn= log_write_record(LOGREC_CHECKPOINT,
+                                   &system_trans, strings);
+
+  /*
+    Do nothing between the log write and the control file write, for the
+    "repair control file" tool to be possible one day.
+  */
+
+  if (LSN_IMPOSSIBLE == checkpoint_lsn)
+    goto err;
+
+  if (0 != control_file_write_and_force(checkpoint_lsn, NULL))
+    goto err;
+
+  /*
+    Note that we should not alter memory structures until we have successfully
+    written the checkpoint record and control file.
+    Btw, a log write failure is serious:
+    - if we know how many bytes we managed to write, we should try to write
+    more, keeping the log's mutex (MY_FULL_IO)
+    - if we don't know, this log record is corrupted and we have no way to
+    "de-corrupt" it, so it will stay corrupted, and as the log is sequential,
+    any log record written after it will not be reachable (for example if we
+    would write UNDOs and crash, we would not be able to read the log and so
+    not be able to rollback), so we should stop the engine now (holding the
+    log's mutex) and do a recovery.
+  */
+  goto end;
+
+err:
+  print_error_to_error_log(the_error_message);
+  /* marks the checkpoint as failed for the test after the cleanup below */
+  candidate_max_rec_lsn_at_last_checkpoint= LSN_IMPOSSIBLE;
+
+end:
+
+  /* strings[0] points to a stack buffer, only the others were allocated */
+  for (i= 1; i<6; i++)
+    my_free(strings[i].str, MYF(MY_ALLOW_ZERO_PTR));
+
+  /*
+    this portion cannot be done as a hook in write_log_record() for the
+    LOGREC_CHECKPOINT type because:
+    - at that moment we still have not written to the control file so cannot
+    mark the request as done; this could be solved by writing to the control
+    file in the hook but that would be an I/O under the log's mutex, bad.
+    - it would not be nice organisation of code (I tried it :).
+  */
+  if (candidate_max_rec_lsn_at_last_checkpoint != LSN_IMPOSSIBLE)
+  {
+    /* checkpoint succeeded */
+    /*
+      TODO: compute log's low water mark (how to do that with our fuzzy
+      ARIES-like reads of data structures? TODO think about it :).
+    */
+    lock(log_mutex);
+    /* That LSN is used for the "two-checkpoint rule" (MEDIUM checkpoints) */
+    max_rec_lsn_at_last_checkpoint= candidate_max_rec_lsn_at_last_checkpoint;
+    DBUG_RETURN(FALSE);
+  }
+  lock(log_mutex);
+  DBUG_RETURN(TRUE);
+  /*
+    keep mutex locked upon exit because callers will want to clear
+    mutex-protected status variables
+  */
+}
+
+
+
+/*
+  Here's what should be put in log_write_record() in the log handler.
+  This is an outline only ("..." stands for code not shown): while holding
+  the log's mutex, it counts the bytes written since the last checkpoint
+  request and, once the count exceeds
+  MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS, posts an asynchronous INDIRECT
+  checkpoint request and resets the count.
+*/
+log_write_record(...)
+{
+  ...;
+  lock(log_mutex);
+  ...;
+  write_to_log(length);
+  written_since_last_checkpoint+= length;
+  if (written_since_last_checkpoint >
+      MAX_LOG_BYTES_WRITTEN_BETWEEN_CHECKPOINTS)
+  {
+    /*
+      Ask one system thread (the "LRD background flusher and checkpointer
+      thread" WL#3261) to do a checkpoint. The request is posted under the
+      log's mutex, which request_asynchronous_checkpoint() asserts.
+    */
+    request_asynchronous_checkpoint(INDIRECT);
+    /* restart counting, to prevent similar redundant requests */
+    written_since_last_checkpoint= (my_off_t)0;
+  }
+  ...;
+  unlock(log_mutex);
+  ...;
+}
+
+/*
+  Requests a checkpoint from the background thread, *asynchronously*
+  (requestor does not wait for completion, and does not even later check the
+  result).
+  In real life it will be called by log_write_record().
+
+  The caller must hold the log's mutex. The request is only recorded if no
+  checkpoint of an equal or stronger level is already pending or running.
+*/
+void request_asynchronous_checkpoint(CHECKPOINT_LEVEL level)
+{
+  safemutex_assert_owner(log_mutex);
+
+  DBUG_ASSERT(level > NONE);
+  if ((next_asynchronous_checkpoint_to_do < level) &&
+      (checkpoint_in_progress < level))
+  {
+    /* no equal or stronger running or to run, we post request */
+    /*
+      We just don't broadcast a cond, the checkpoint thread
+      (see ma_least_recently_dirtied.c) will notice our request in max a few
+      seconds.
+    */
+    next_asynchronous_checkpoint_to_do= level; /* post request */
+  }
+
+  /*
+    If there was an error, only an error
+    message to the error log will say it; normal, for a checkpoint triggered
+    by a log write, we probably don't want the client's log write to throw an
+    error, as the log write succeeded and a checkpoint failure is not
+    critical: the failure in this case is more for the DBA to know than for
+    the end user.
+  */
+}
+
+
+/*
+  If a 64-bit variable transitions from both halves being zero to both halves
+  being non-zero, and never changes after that (like the transaction's
+  first_undo_lsn), this function can be used to do a read of it (without
+  mutex, without atomic load) which always produces a correct (though maybe
+  slightly old) value (even on 32-bit CPUs).
+  The prototype will change with Sanja's new LSN type.
+
+  x  address of the 64-bit variable to read
+
+  RETURN
+    0, or a value whose upper and lower 32-bit halves are both non-zero
+    (on the 32-bit code path; the 64-bit path returns *x as read)
+*/
+static inline ulonglong read_non_atomic(ulonglong volatile *x)
+{
+#if ( SIZEOF_CHARP >= 8 )
+  /* 64-bit CPU (right?), 64-bit reads are atomic */
+  return *x;
+#else
+  /*
+    32-bit CPU, 64-bit reads may give a mixed of old half and new half (old
+    low bits and new high bits, or the contrary).
+    As the variable we read transitions from both halves being zero to both
+    halves being non-zero, and never changes then, we can detect atomicity
+    problems:
+  */
+  ulonglong y;
+  for (;;) /* loop until no atomicity problems */
+  {
+    y= *x;
+    /*
+      Accept the value if it is entirely zero, or if neither half is zero:
+      y >> 32 isolates the upper half, y << 32 is non-zero only if the lower
+      half is.
+    */
+    if (likely((0 == y) ||
+               ((0 != (y >> 32)) && (0 != (y << 32)))))
+      return y;
+    /* Worth seeing it! */
+    DBUG_PRINT("info",("atomicity problem"));
+  }
+#endif
+}
author	unknown <guilhem@gbichot3.local>	2007-02-28 17:50:51 +0100
committer	unknown <guilhem@gbichot3.local>	2007-02-28 17:50:51 +0100
commit	ea57b3d4a066a5507a7e322b53e3acab24a2855e (patch)
tree	a0703039f00da454a2a91c8b14835d45a6146209 /storage/maria/ma_checkpoint.c
parent	ae72e394502e13b854b6e9bb00889fa6b69a7ed9 (diff)
parent	fdf847fb62a0fcdf0edf25d6c8654b19eaa9a9ad (diff)
download	mariadb-git-ea57b3d4a066a5507a7e322b53e3acab24a2855e.tar.gz