summaryrefslogtreecommitdiff
path: root/storage/maria/ma_recovery.c
diff options
context:
space:
mode:
Diffstat (limited to 'storage/maria/ma_recovery.c')
-rw-r--r--storage/maria/ma_recovery.c267
1 files changed, 267 insertions, 0 deletions
diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c
new file mode 100644
index 00000000000..a42fbdf0458
--- /dev/null
+++ b/storage/maria/ma_recovery.c
@@ -0,0 +1,267 @@
+/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ WL#3072 Maria recovery
+ First version written by Guilhem Bichot on 2006-04-27.
+ Does not compile yet.
+*/
+
+/* Here is the implementation of this module */
+
+#include "page_cache.h"
+#include "least_recently_dirtied.h"
+#include "transaction.h"
+#include "share.h"
+#include "log.h"
+
+/*
+  Static description of one type of log record: its printable name, whether
+  it terminates a group of records, and the handlers to run when a record of
+  this type is met in the REDO and UNDO phases of recovery.
+*/
+typedef struct st_record_type_properties {
+  /* used for debug error messages or "maria_read_log" command-line tool: */
+  char *name;
+  /* TRUE if a record of this type is the "end mark" of a group of records */
+  my_bool record_ends_group;
+  /* a function to execute when we see the record during the REDO phase */
+  int (*record_execute_in_redo_phase)(RECORD *); /* param will be record header instead later */
+  /* a function to execute when we see the record during the UNDO phase */
+  int (*record_execute_in_undo_phase)(RECORD *); /* param will be record header instead later */
+} RECORD_TYPE_PROPERTIES;
+
+/*
+  Handler for record types which require no action in a given phase.
+  Ignores its argument and always returns 0.
+*/
+int no_op(RECORD *record)
+{
+  (void) record; /* deliberately unused */
+  return 0;
+}
+
+/* the handlers are defined further down; declare them before the table */
+int redo_insert_head_execute_in_redo_phase(RECORD *record);
+int undo_insert_execute_in_redo_phase(RECORD *record);
+int undo_insert_execute_in_undo_phase(RECORD *record);
+int commit_execute_in_redo_phase(RECORD *record);
+
+/*
+  Properties of every log record type. Each entry is
+  {name, record_ends_group, REDO-phase handler, UNDO-phase handler};
+  the table is indexed by the record's type.
+*/
+RECORD_TYPE_PROPERTIES all_record_type_properties[]=
+{
+  /* listed here in the order of the "log records type" enumeration */
+  {"REDO_INSERT_HEAD", FALSE, redo_insert_head_execute_in_redo_phase, no_op},
+  /* ... (entries for the other record types go here) ... */
+  {"UNDO_INSERT", TRUE, undo_insert_execute_in_redo_phase, undo_insert_execute_in_undo_phase},
+  {"COMMIT", TRUE, commit_execute_in_redo_phase, no_op},
+  /* ... (entries for the other record types go here) ... */
+};
+
+/*
+  REDO-phase handler for REDO_INSERT_HEAD records.
+  Not implemented yet: currently does nothing and returns 0.
+*/
+int redo_insert_head_execute_in_redo_phase(RECORD *record)
+{
+  /* TODO: write the data to the proper page */
+  (void) record; /* unused until the above is implemented */
+  return 0;
+}
+
+/*
+  REDO-phase handler for UNDO_INSERT records.
+  Only remembers this record's LSN as the undo LSN of the record's
+  transaction; the row itself is left untouched. Returns 0.
+*/
+int undo_insert_execute_in_redo_phase(RECORD *record)
+{
+  trans_table[record->short_trans_id].undo_lsn= record->lsn;
+  /* don't restore the old version of the row */
+  return 0;
+}
+
+/*
+  UNDO-phase handler for UNDO_INSERT records.
+  After the (not yet implemented) restore of the old row version, moves the
+  transaction's undo LSN back to the previous undo record. Returns 0.
+*/
+int undo_insert_execute_in_undo_phase(RECORD *record)
+{
+  /* TODO: restore the old version of the row */
+  trans_table[record->short_trans_id].undo_lsn= record->prev_undo_lsn;
+  return 0;
+}
+
+/*
+  REDO-phase handler for COMMIT records.
+  Marks the record's transaction as COMMITTED in trans_table. Returns 0.
+*/
+int commit_execute_in_redo_phase(RECORD *record)
+{
+  trans_table[record->short_trans_id].state= COMMITTED;
+  /*
+    and that's all: the delete/update handler should not be woken up! as there
+    may be REDO for purge further in the log.
+  */
+  return 0;
+}
+
+/*
+  Helpers to look up all_record_type_properties[] by record type.
+  R is a RECORD * and is evaluated more than once by the execute_* macros:
+  do not pass an expression with side effects.
+*/
+
+/* non-zero if the type of record R is the "end mark" of a group */
+#define record_ends_group(R) \
+  (all_record_type_properties[(R)->type].record_ends_group)
+
+/* run the REDO-phase handler registered for the type of record R */
+#define execute_log_record_in_redo_phase(R) \
+  (all_record_type_properties[(R)->type].record_execute_in_redo_phase(R))
+
+/* run the UNDO-phase handler registered for the type of record R */
+#define execute_log_record_in_undo_phase(R) \
+  (all_record_type_properties[(R)->type].record_execute_in_undo_phase(R))
+
+
+/*
+  Sketch of crash recovery.
+
+  Opens the control file, reads the last checkpoint record, then runs the
+  REDO phase: log records are read sequentially and executed group by group,
+  a group being executed only once its "end mark" record has been seen.
+  The UNDO phase (background rollback) is only described in comments for now.
+
+  Returns 0; no error is detected or reported yet.
+  Parts marked "..." or written in plain words are still pseudo-code.
+*/
+int recovery()
+{
+  RECORD *checkpoint_record; /* not parsed yet, see below */
+  RECORD *record;            /* record currently scanned by the REDO phase */
+  RECORD *record2;           /* used to re-read the records of a group */
+
+  control_file_create_or_open();
+  /*
+    init log handler: tell it that we are going to do large reads of the
+    log, sequential and backward. Log handler could decide to alloc a big
+    read-only IO_CACHE for this, or use its usual page cache.
+  */
+
+  /* read checkpoint log record from log handler */
+  checkpoint_record= log_read_record(last_checkpoint_lsn_at_start);
+
+  /* parse this record, build structs (dirty_pages, transactions table, file_map) */
+  /*
+    read log records (note: sometimes only the header is needed, for ex during
+    REDO phase only the header of UNDO is needed, not the 4G blob in the
+    variable-length part, so I could use that; however for PREPARE (which is a
+    variable-length record) I'll need to read the full record in the REDO
+    phase):
+  */
+
+  /**** REDO PHASE *****/
+
+  record= log_read_record(min(rec_lsn, ...)); /* later, read only header */
+
+  /*
+    if log handler knows the end LSN of the log, we could print here how many
+    MB of log we have to read (to give an idea of the time), and print
+    progress notes.
+  */
+
+  while (record != NULL)
+  {
+    /*
+      A complete group is a set of log records with an "end mark" record
+      (e.g. a set of REDOs for an operation, terminated by an UNDO for this
+      operation); if there is no "end mark" record the group is incomplete
+      and won't be executed.
+    */
+    if (record_ends_group(record))
+    {
+      if (trans_table[record->short_trans_id].group_start_lsn != 0)
+      {
+        /*
+          There is a complete group for this transaction, containing more than
+          this event.
+          We're going to read recently read log records:
+          for this log_read_record() to be efficient (not touch the disk),
+          log handler could cache recently read pages
+          (can just use an IO_CACHE of 10 MB to read the log, or the normal
+          log handler page cache).
+          Without it only OS file cache will help.
+        */
+        record2=
+          log_read_record(trans_table[record->short_trans_id].group_start_lsn);
+
+        do
+        {
+          if (record2->short_trans_id == record->short_trans_id)
+            execute_log_record_in_redo_phase(record2); /* it's in our group */
+          record2= log_read_next_record();
+        }
+        while (record2->lsn < record->lsn);
+        trans_table[record->short_trans_id].group_start_lsn= 0; /* group finished */
+      }
+      execute_log_record_in_redo_phase(record);
+    }
+    else /* record does not end group */
+    {
+      /* just record the fact, can't know if can execute yet */
+      if (trans_table[record->short_trans_id].group_start_lsn == 0) /* group not yet started */
+        trans_table[record->short_trans_id].group_start_lsn= record->lsn;
+    }
+
+    /*
+      Later we can optimize: instead of "execute_log_record(record2)", do
+      copy_record_into_exec_buffer(record2):
+      this will just copy record into a multi-record (10 MB?) memory buffer,
+      and when buffer is full, will do sorting of REDOs per
+      page id and execute them.
+      This sorting will enable us to do more sequential reads of the
+      data/index pages.
+      Note that updating bitmap pages (when we have executed a REDO for a page
+      we update its bitmap page) may break the sequential read of pages,
+      so maybe we should read and cache bitmap pages in the beginning.
+      Or ok the sequence will be broken, but quickly all bitmap pages will be
+      in memory and so the sequence will not be broken anymore.
+      Sorting could even determine, based on physical device of files
+      ("st_dev" in stat()), that some files should be taken by
+      different threads, if we want to do parallelism.
+    */
+    /*
+      Here's how to read a complete variable-length record if needed:
+      <sanja> read the header, allocate buffer of record length, read whole
+      record.
+    */
+    record= log_read_next_record();
+  }
+
+  /*
+    Earlier or here, create true transactions in TM.
+    If done earlier, note that TM should not wake up the delete/update handler
+    when it receives a commit info, as existing REDO for purge may exist in
+    the log, and so the delete/update handler may do changes which conflict
+    with these REDOs.
+    Even if done here, better to not wake it up now as we're going to free the
+    page cache.
+
+    MikaelR suggests: support checkpoints during REDO phase too: do checkpoint
+    after a certain amount of log records have been executed. This helps
+    against repeated crashes. Those checkpoints could not be user-requested
+    (as engine is not communicating during the REDO phase), so they would be
+    automatic: this changes the original assumption that we don't write to the
+    log while in the REDO phase, but why not. How often should we checkpoint?
+  */
+
+  /*
+    We want to have two steps:
+    engine->recover_with_max_memory();
+    next_engine->recover_with_max_memory();
+    engine->init_with_normal_memory();
+    next_engine->init_with_normal_memory();
+    So: in recover_with_max_memory() allocate a giant page cache, do REDO
+    phase, then all page cache is flushed and emptied and freed (only retain
+    small structures like TM): take full checkpoint, which is useful if
+    next engine crashes in its recovery the next second.
+    Destroy all shares (maria_close()), then at init_with_normal_memory() we
+    do this:
+  */
+
+  /**** UNDO PHASE *****/
+
+  print_information_to_error_log(nb of trans to roll back, nb of prepared trans);
+
+  /*
+    Launch one or more threads to do the background rollback. Don't wait for
+    them to complete their rollback (background rollback; for debugging, we
+    can have an option which waits). Set a counter (total_of_rollback_threads)
+    to the number of threads to launch.
+
+    Note that InnoDB's rollback-in-background works as long as InnoDB is the
+    last engine to recover, otherwise MySQL will refuse new connections until
+    the last engine has recovered so it's not "background" from the user's
+    point of view. InnoDB is near top of sys_table_types so all others
+    (e.g. BDB) recover after it... So it's really "online rollback" only if
+    InnoDB is the only engine.
+  */
+
+  /* wake up delete/update handler */
+  /* tell the TM that it can now accept new transactions */
+
+  /*
+    mark that checkpoint requests are now allowed.
+  */
+  return 0;
+}
+
+/*
+  Body of a background rollback thread.
+
+  Rolls back every transaction assigned to this thread by following its
+  chain of undo records (undo_lsn, then prev_undo_lsn, until 0). The last
+  thread to finish is the one which should log "rollback finished" and take
+  a full checkpoint (not implemented yet).
+
+  NOTE(review): the function header and the "trans in list" loop condition
+  are still pseudo-code, kept as in the first version.
+*/
+pthread_handler_decl rollback_background_thread()
+{
+  RECORD *record;
+
+  /*
+    execute the normal runtime-rollback code for a bunch of transactions.
+  */
+  while (trans in list_of_trans_to_rollback_by_this_thread)
+  {
+    while (trans->undo_lsn != 0)
+    {
+      /* this is the normal runtime-rollback code: */
+      record= log_read_record(trans->undo_lsn);
+      execute_log_record_in_undo_phase(record);
+      trans->undo_lsn= record->prev_undo_lsn;
+    }
+    /* remove trans from list */
+  }
+  lock_mutex(rollback_threads); /* or atomic counter */
+  if (--total_of_rollback_threads == 0)
+  {
+    /*
+      All rollback threads are done. Print "rollback finished" to the error
+      log and take a full checkpoint.
+    */
+  }
+  unlock_mutex(rollback_threads);
+  pthread_exit(NULL); /* pthread_exit() requires an exit-value argument */
+}