Diffstat (limited to 'storage/maria/ma_recovery.c')
-rw-r--r-- | storage/maria/ma_recovery.c | 267 |
1 files changed, 267 insertions, 0 deletions
diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c
new file mode 100644
index 00000000000..a42fbdf0458
--- /dev/null
+++ b/storage/maria/ma_recovery.c
@@ -0,0 +1,267 @@
/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */

/*
  WL#3072 Maria recovery
  First version written by Guilhem Bichot on 2006-04-27.
  Does not compile yet.
*/

/* Here is the implementation of this module */

#include "page_cache.h"
#include "least_recently_dirtied.h"
#include "transaction.h"
#include "share.h"
#include "log.h"

typedef struct st_record_type_properties {
  /* used for debug error messages or the "maria_read_log" command-line tool: */
  char *name;
  my_bool record_ends_group;
  /* a function to execute when we see the record during the REDO phase */
  int (*record_execute_in_redo_phase)(RECORD *); /* param will be record header instead later */
  /* a function to execute when we see the record during the UNDO phase */
  int (*record_execute_in_undo_phase)(RECORD *); /* param will be record header instead later */
} RECORD_TYPE_PROPERTIES;

int no_op(RECORD *record) { return 0; }

RECORD_TYPE_PROPERTIES all_record_type_properties[]=
{
  /* listed here in the order of the "log records type" enumeration */
  {"REDO_INSERT_HEAD", FALSE, redo_insert_head_execute_in_redo_phase, no_op},
  ...,
  {"UNDO_INSERT", TRUE, undo_insert_execute_in_redo_phase, undo_insert_execute_in_undo_phase},
  {"COMMIT",      TRUE, commit_execute_in_redo_phase, no_op},
  ...
};

int redo_insert_head_execute_in_redo_phase(RECORD *record)
{
  /* write the data to the proper page */
}

int undo_insert_execute_in_redo_phase(RECORD *record)
{
  trans_table[record->short_trans_id].undo_lsn= record->lsn;
  /* don't restore the old version of the row */
}

int undo_insert_execute_in_undo_phase(RECORD *record)
{
  /* restore the old version of the row */
  trans_table[record->short_trans_id].undo_lsn= record->prev_undo_lsn;
}

int commit_execute_in_redo_phase(RECORD *record)
{
  trans_table[record->short_trans_id].state= COMMITTED;
  /*
    and that's all: the delete/update handler should not be woken up! as there
    may be REDOs for purge further in the log.
  */
}

#define record_ends_group(R) \
  all_record_type_properties[(R)->type].record_ends_group

#define execute_log_record_in_redo_phase(R) \
  all_record_type_properties[(R)->type].record_execute_in_redo_phase(R)
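
/*
  A minimal, self-contained sketch of the dispatch-table idea above, with a
  hypothetical record set (the EX_* names are invented for illustration, they
  are not Maria's real record types). The point is the invariant stressed in
  the comment above: the properties array must stay in the order of the record
  type enumeration, so the two macros can resolve any record to its properties
  by a plain array lookup.
*/
enum example_record_type { EX_REDO_INSERT= 0, EX_UNDO_INSERT, EX_COMMIT };

typedef struct st_example_record
{
  enum example_record_type type;
  unsigned short short_trans_id;
} EXAMPLE_RECORD;

static int ex_no_op(EXAMPLE_RECORD *rec)       { (void) rec; return 0; }
static int ex_redo_insert(EXAMPLE_RECORD *rec) { (void) rec; return 0; } /* would write the page */
static int ex_note_undo(EXAMPLE_RECORD *rec)   { (void) rec; return 0; } /* would set undo_lsn */
static int ex_commit(EXAMPLE_RECORD *rec)      { (void) rec; return 0; } /* would mark COMMITTED */

static struct st_example_properties
{
  const char *name;
  int ends_group;
  int (*execute_in_redo_phase)(EXAMPLE_RECORD *);
  int (*execute_in_undo_phase)(EXAMPLE_RECORD *);
} example_properties[]=
{
  /* must stay in the same order as enum example_record_type */
  {"REDO_INSERT", 0, ex_redo_insert, ex_no_op},
  {"UNDO_INSERT", 1, ex_note_undo,   ex_no_op},
  {"COMMIT",      1, ex_commit,      ex_no_op}
};

/* what the two macros above expand to, in spirit */
static int example_ends_group(EXAMPLE_RECORD *rec)
{ return example_properties[rec->type].ends_group; }
static int example_execute_in_redo_phase(EXAMPLE_RECORD *rec)
{ return example_properties[rec->type].execute_in_redo_phase(rec); }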

int recovery()
{
  control_file_create_or_open();
  /*
    init the log handler: tell it that we are going to do large reads of the
    log, sequential and backward. The log handler could decide to allocate a
    big read-only IO_CACHE for this, or use its usual page cache.
  */

  /* read the checkpoint log record from the log handler */
  RECORD *checkpoint_record= log_read_record(last_checkpoint_lsn_at_start);

  /* parse this record, build structs (dirty_pages, transactions table, file_map) */
  /*
    read log records (note: sometimes only the header is needed, for example
    during the REDO phase only the header of an UNDO is needed, not the 4G
    blob in the variable-length part, so I could use that; however for PREPARE
    (which is a variable-length record) I'll need to read the full record in
    the REDO phase):
  */

  /**** REDO PHASE *****/

  record= log_read_record(min(rec_lsn, ...)); /* later, read only the header */

  /*
    if the log handler knows the end LSN of the log, we could print here how
    many MB of log we have to read (to give an idea of the time), and print
    progress notes.
  */

  while (record != NULL)
  {
    /*
      A complete group is a set of log records with an "end mark" record
      (e.g. a set of REDOs for an operation, terminated by an UNDO for this
      operation); if there is no "end mark" record the group is incomplete
      and won't be executed.
    */
    if (record_ends_group(record))
    {
      if (trans_table[record->short_trans_id].group_start_lsn != 0)
      {
        /*
          There is a complete group for this transaction, containing more than
          this event.
          We are going to re-read recently read log records: for this
          log_read_record() to be efficient (not touch the disk), the log
          handler could cache recently read pages (it can just use an IO_CACHE
          of 10 MB to read the log, or the normal log handler page cache).
          Without that, only the OS file cache will help.
        */
        record2=
          log_read_record(trans_table[record->short_trans_id].group_start_lsn);

        do
        {
          if (record2->short_trans_id == record->short_trans_id)
            execute_log_record_in_redo_phase(record2); /* it's in our group */
          record2= log_read_next_record();
        }
        while (record2->lsn < record->lsn);
        trans_table[record->short_trans_id].group_start_lsn= 0; /* group finished */
      }
      execute_log_record_in_redo_phase(record);
    }
    else /* record does not end a group */
    {
      /* just record the fact; we can't know yet if we can execute it */
      if (trans_table[record->short_trans_id].group_start_lsn == 0) /* group not yet started */
        trans_table[record->short_trans_id].group_start_lsn= record->lsn;
    }

    /*
      Later we can optimize: instead of "execute_log_record(record2)", do
      copy_record_into_exec_buffer(record2): this will just copy the record
      into a multi-record (10 MB?) memory buffer, and when the buffer is full,
      sort the REDOs per page id and execute them.
      This sorting will enable us to do more sequential reads of the
      data/index pages.
      Note that updating bitmap pages (when we have executed a REDO for a page
      we update its bitmap page) may break the sequential read of pages, so
      maybe we should read and cache bitmap pages in the beginning. Or ok, the
      sequence will be broken, but quickly all bitmap pages will be in memory
      and so the sequence will not be broken anymore.
      Sorting could even determine, based on the physical device of the files
      ("st_dev" in stat()), that some files should be taken by different
      threads, if we want to do parallelism.
      (A small standalone sketch of this sorting follows this function.)
    */
    /*
      Here's how to read a complete variable-length record if needed:
      <sanja> read the header, allocate a buffer of the record's length, read
      the whole record.
    */
    record= log_read_next_record();
  }

  /*
    Earlier or here, create true transactions in the TM.
    If done earlier, note that the TM should not wake up the delete/update
    handler when it receives a commit info, as existing REDOs for purge may
    exist in the log, and so the delete/update handler may do changes which
    conflict with these REDOs.
    Even if done here, better not to wake it up now as we're going to free the
    page cache.

    MikaelR suggests: support checkpoints during the REDO phase too: do a
    checkpoint after a certain number of log records have been executed. This
    helps against repeated crashes. Those checkpoints could not be
    user-requested (as the engine is not communicating during the REDO phase),
    so they would be automatic: this changes the original assumption that we
    don't write to the log while in the REDO phase, but why not. How often
    should we checkpoint?
  */

  /*
    We want to have two steps:
    engine->recover_with_max_memory();
    next_engine->recover_with_max_memory();
    engine->init_with_normal_memory();
    next_engine->init_with_normal_memory();
    So: in recover_with_max_memory() allocate a giant page cache, do the REDO
    phase, then the whole page cache is flushed, emptied and freed (only
    retain small structures like the TM): take a full checkpoint, which is
    useful if the next engine crashes in its recovery the next second.
    Destroy all shares (maria_close()), then at init_with_normal_memory() we
    do this:
  */

  /**** UNDO PHASE *****/

  print_information_to_error_log(nb of trans to roll back, nb of prepared trans);

  /*
    Launch one or more threads to do the background rollback. Don't wait for
    them to complete their rollback (background rollback; for debugging, we
    can have an option which waits). Set a counter (total_of_rollback_threads)
    to the number of threads to launch.

    Note that InnoDB's rollback-in-background works as long as InnoDB is the
    last engine to recover, otherwise MySQL will refuse new connections until
    the last engine has recovered, so it's not "background" from the user's
    point of view. InnoDB is near the top of sys_table_types so all others
    (e.g. BDB) recover after it... So it's really "online rollback" only if
    InnoDB is the only engine.
  */

  /* wake up the delete/update handler */
  /* tell the TM that it can now accept new transactions */

  /*
    mark that checkpoint requests are now allowed.
  */
}
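
/*
  A small standalone sketch of the exec-buffer optimization described in the
  REDO phase above (structure and names are invented for illustration; this is
  not the real buffer): copy REDOs into a memory buffer and, when it is full,
  sort them by page id before applying them, so the data/index pages are read
  roughly sequentially.
*/
#include <stdlib.h>                       /* qsort() */

typedef struct st_example_buffered_redo
{
  unsigned long page_id;                  /* page this REDO applies to */
  size_t offset_in_buffer;                /* where the copied record lives */
} EXAMPLE_BUFFERED_REDO;

static int example_cmp_page_id(const void *a, const void *b)
{
  const EXAMPLE_BUFFERED_REDO *ra= (const EXAMPLE_BUFFERED_REDO *) a;
  const EXAMPLE_BUFFERED_REDO *rb= (const EXAMPLE_BUFFERED_REDO *) b;
  return (ra->page_id > rb->page_id) - (ra->page_id < rb->page_id);
}

static void example_flush_exec_buffer(EXAMPLE_BUFFERED_REDO *redos, size_t count)
{
  size_t i;
  qsort(redos, count, sizeof(*redos), example_cmp_page_id);
  for (i= 0; i < count; i++)
  {
    /* apply redos[i] to its page; consecutive entries now touch nearby pages */
  }
}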

pthread_handler_decl rollback_background_thread()
{
  /*
    execute the normal runtime-rollback code for a bunch of transactions.
  */
  while (trans in list_of_trans_to_rollback_by_this_thread)
  {
    while (trans->undo_lsn != 0)
    {
      /* this is the normal runtime-rollback code: */
      record= log_read_record(trans->undo_lsn);
      execute_log_record_in_undo_phase(record);
      trans->undo_lsn= record->prev_undo_lsn;
    }
    /* remove trans from the list */
  }
  lock_mutex(rollback_threads); /* or an atomic counter */
  if (--total_of_rollback_threads == 0)
  {
    /*
      All rollback threads are done. Print "rollback finished" to the error
      log and take a full checkpoint.
    */
  }
  unlock_mutex(rollback_threads);
  pthread_exit();
}
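
/*
  A minimal standalone sketch, with plain pthreads and invented names, of the
  counter protocol used by rollback_background_thread() above: each rollback
  thread decrements a shared counter under a mutex, and the thread that brings
  it to zero is the one that reports "rollback finished" and takes the full
  checkpoint.
*/
#include <pthread.h>

static pthread_mutex_t example_rollback_mutex= PTHREAD_MUTEX_INITIALIZER;
static unsigned int example_rollback_threads_left;

static void example_rollback_thread_done(void)
{
  pthread_mutex_lock(&example_rollback_mutex);
  if (--example_rollback_threads_left == 0)
  {
    /* last thread: print "rollback finished" to the error log, take a full checkpoint */
  }
  pthread_mutex_unlock(&example_rollback_mutex);
}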