MDEV-5262: Missing retry after temp error in parallel replication

Start implementing support for retrying an event group in parallel replication if it fails with a temporary error (such as a deadlock). This patch is very incomplete; only some very basic retry works. Things still missing (not a complete list):
- Handle moving to the next relay log file if the event group to be retried spans multiple relay log files.
- Handle refcounting of relay log files, to ensure that we do not purge a relay log file and then later attempt to re-execute events out of it.
- Handle description_event_for_exec: we need to save this somehow for the possible retry, and use the correct one in case it differs between relay logs.
- Do another retry attempt in case the first retry also fails.
- Limit the max number of retries.
- Lots of testing will be needed for the various edge cases.
author: unknown <knielsen@knielsen-hq.org> 2014-05-08 14:20:18 +0200
committer: Kristian Nielsen <knielsen@knielsen-hq.org> 2014-05-08 14:20:18 +0200
commit: b0b60f249807b6c2d423313350d9ad66693c2d1e (patch)
tree: bb8fadff8a4425a04c03af264872991ee36f7fd0 /sql/rpl_rli.h
parent: 2b4b857d51469c5fd974186ba8219e367c2019ec (diff)
download: mariadb-git-b0b60f249807b6c2d423313350d9ad66693c2d1e.tar.gz
1 files changed, 36 insertions, 0 deletions
diff --git a/sql/rpl_rli.h b/sql/rpl_rli.h
index 00d16f52488..c2cdbcdc573 100644
--- a/sql/rpl_rli.h
+++ b/sql/rpl_rli.h
@@ -61,6 +61,7 @@ enum {
 *****************************************************************************/
 
 struct rpl_group_info;
+struct inuse_relaylog;
 
 class Relay_log_info : public Slave_reporting_capability
 {
@@ -164,6 +165,13 @@ public:
   Master_info *mi;
 
   /*
+    List of active relay log files.
+    (This can be more than one in case of parallel replication).
+  */
+  inuse_relaylog *inuse_relaylog_list;
+  inuse_relaylog *last_inuse_relaylog;
+
+  /*
     Needed to deal properly with cur_log getting closed and re-opened with
     a different log under our feet
   */
@@ -398,6 +406,7 @@ public:
   void stmt_done(my_off_t event_log_pos,
                  time_t event_creation_time, THD *thd,
                  rpl_group_info *rgi);
+  int alloc_inuse_relaylog(const char *name);
 
   /**
      Is the replication inside a group?
@@ -464,6 +473,25 @@ private:
 
 
 /*
+  In parallel replication, if we need to re-try a transaction due to a
+  deadlock or other temporary error, we may need to go back and re-read events
+  out of an earlier relay log.
+
+  This structure keeps track of the relaylogs that are potentially in use.
+  Each rpl_group_info has a pointer to one of those, corresponding to the
+  first GTID event.
+
+  A reference count keeps track of how long a relay log is potentially in use.
+*/
+struct inuse_relaylog {
+  inuse_relaylog *next;
+  uint64 queued_count;
+  uint64 dequeued_count;
+  char name[FN_REFLEN];
+};
+
+
+/*
   This is data for various state needed to be kept for the processing of
   one event group (transaction) during replication.
 
@@ -596,6 +624,14 @@ struct rpl_group_info
   /* Needs room for "Gtid D-S-N\x00". */
   char gtid_info_buf[5+10+1+10+1+20+1];
 
+  /*
+    Information to be able to re-try an event group in case of a deadlock or
+    other temporary error.
+  */
+  inuse_relaylog *relay_log;
+  uint64 retry_start_offset;
+  uint64 retry_event_count;
+
   rpl_group_info(Relay_log_info *rli_);
   ~rpl_group_info();
   void reinit(Relay_log_info *rli);
author	unknown <knielsen@knielsen-hq.org>	2014-05-08 14:20:18 +0200
committer	Kristian Nielsen <knielsen@knielsen-hq.org>	2014-05-08 14:20:18 +0200
commit	b0b60f249807b6c2d423313350d9ad66693c2d1e (patch)
tree	bb8fadff8a4425a04c03af264872991ee36f7fd0 /sql/rpl_rli.h
parent	2b4b857d51469c5fd974186ba8219e367c2019ec (diff)
download	mariadb-git-b0b60f249807b6c2d423313350d9ad66693c2d1e.tar.gz