MWL#116: Efficient group commit for binary log

Preliminary commit for testing
author: unknown <knielsen@knielsen-hq.org> 2010-09-30 15:20:15 +0200
committer: unknown <knielsen@knielsen-hq.org> 2010-09-30 15:20:15 +0200
commit: 0394cf203042eb6e408b9c88802c93444f226af9 (patch)
tree: 7a5fefa05950b55557111426e9e83e4e48ce6c17 /sql
parent: e432151e9cf6a7a5ccf84fc137975ccf38fd0798 (diff)
download: mariadb-git-0394cf203042eb6e408b9c88802c93444f226af9.tar.gz
11 files changed, 1469 insertions, 346 deletions
diff --git a/sql/handler.cc b/sql/handler.cc
index b817673ed23..0bce67596fa 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -76,6 +76,8 @@ TYPELIB tx_isolation_typelib= {array_elements(tx_isolation_names)-1,"",
 static TYPELIB known_extensions= {0,"known_exts", NULL, NULL};
 uint known_extensions_id= 0;
 
+static int commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans,
+                              bool is_real_trans);
 
 
 static plugin_ref ha_default_plugin(THD *thd)
@@ -1070,7 +1072,7 @@ ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
 */
 int ha_commit_trans(THD *thd, bool all)
 {
-  int error= 0, cookie= 0;
+  int error= 0, cookie;
   /*
     'all' means that this is either an explicit commit issued by
     user, or an implicit commit issued by a DDL.
@@ -1085,7 +1087,8 @@ int ha_commit_trans(THD *thd, bool all)
   */
   bool is_real_trans= all || thd->transaction.all.ha_list == 0;
   Ha_trx_info *ha_info= trans->ha_list;
-  my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
+  bool need_prepare_ordered, need_commit_ordered;
+  my_xid xid;
   DBUG_ENTER("ha_commit_trans");
 
   /*
@@ -1118,85 +1121,112 @@ int ha_commit_trans(THD *thd, bool all)
     DBUG_RETURN(2);
   }
 #ifdef USING_TRANSACTIONS
-  if (ha_info)
+  if (!ha_info)
   {
-    uint rw_ha_count;
-    bool rw_trans;
+    /* Free resources and perform other cleanup even for 'empty' transactions. */
+    if (is_real_trans)
+      thd->transaction.cleanup();
+    DBUG_RETURN(0);
+  }
 
-    DBUG_EXECUTE_IF("crash_commit_before", abort(););
+  DBUG_EXECUTE_IF("crash_commit_before", abort(););
 
-    /* Close all cursors that can not survive COMMIT */
-    if (is_real_trans)                          /* not a statement commit */
-      thd->stmt_map.close_transient_cursors();
+  /* Close all cursors that can not survive COMMIT */
+  if (is_real_trans)                          /* not a statement commit */
+    thd->stmt_map.close_transient_cursors();
 
-    rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
-    /* rw_trans is TRUE when we in a transaction changing data */
-    rw_trans= is_real_trans && (rw_ha_count > 0);
+  uint rw_ha_count= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
+  /* rw_trans is TRUE when we in a transaction changing data */
+  bool rw_trans= is_real_trans && (rw_ha_count > 0);
 
-    if (rw_trans &&
-        wait_if_global_read_lock(thd, 0, 0))
-    {
-      ha_rollback_trans(thd, all);
-      DBUG_RETURN(1);
-    }
+  if (rw_trans &&
+      wait_if_global_read_lock(thd, 0, 0))
+  {
+    ha_rollback_trans(thd, all);
+    DBUG_RETURN(1);
+  }
 
-    if (rw_trans &&
-        opt_readonly &&
-        !(thd->security_ctx->master_access & SUPER_ACL) &&
-        !thd->slave_thread)
-    {
-      my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
-      ha_rollback_trans(thd, all);
-      error= 1;
-      goto end;
-    }
+  if (rw_trans &&
+      opt_readonly &&
+      !(thd->security_ctx->master_access & SUPER_ACL) &&
+      !thd->slave_thread)
+  {
+    my_error(ER_OPTION_PREVENTS_STATEMENT, MYF(0), "--read-only");
+    goto err;
+  }
 
-    if (!trans->no_2pc && (rw_ha_count > 1))
-    {
-      for (; ha_info && !error; ha_info= ha_info->next())
-      {
-        int err;
-        handlerton *ht= ha_info->ht();
-        /*
-          Do not call two-phase commit if this particular
-          transaction is read-only. This allows for simpler
-          implementation in engines that are always read-only.
-        */
-        if (! ha_info->is_trx_read_write())
-          continue;
-        /*
-          Sic: we know that prepare() is not NULL since otherwise
-          trans->no_2pc would have been set.
-        */
-        if ((err= ht->prepare(ht, thd, all)))
-        {
-          my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
-          error= 1;
-        }
-        status_var_increment(thd->status_var.ha_prepare_count);
-      }
-      DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
-      if (error || (is_real_trans && xid &&
-                    (error= !(cookie= tc_log->log_xid(thd, xid)))))
-      {
-        ha_rollback_trans(thd, all);
-        error= 1;
-        goto end;
-      }
-      DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
-    }
-    error=ha_commit_one_phase(thd, all) ? (cookie ? 2 : 1) : 0;
-    DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
-    if (cookie)
-      tc_log->unlog(cookie, xid);
+  if (trans->no_2pc || (rw_ha_count <= 1))
+  {
+    error= ha_commit_one_phase(thd, all);
     DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
-end:
-    if (rw_trans)
-      start_waiting_global_read_lock(thd);
+    goto end;
   }
-  /* Free resources and perform other cleanup even for 'empty' transactions. */
-  else if (is_real_trans)
-    thd->transaction.cleanup();
+
+  need_prepare_ordered= FALSE;
+  need_commit_ordered= FALSE;
+  xid= thd->transaction.xid_state.xid.get_my_xid();
+
+  for (Ha_trx_info *hi= ha_info; hi; hi= hi->next())
+  {
+    int err;
+    handlerton *ht= hi->ht();
+    /*
+      Do not call two-phase commit if this particular
+      transaction is read-only. This allows for simpler
+      implementation in engines that are always read-only.
+    */
+    if (! hi->is_trx_read_write())
+      continue;
+    /*
+      Sic: we know that prepare() is not NULL since otherwise
+      trans->no_2pc would have been set.
+    */
+    if ((err= ht->prepare(ht, thd, all)))
+      my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
+    status_var_increment(thd->status_var.ha_prepare_count);
+
+    if (err)
+      goto err;
+
+    if (ht->prepare_ordered)
+      need_prepare_ordered= TRUE;
+    if (ht->commit_ordered)
+      need_commit_ordered= TRUE;
+  }
+  DBUG_EXECUTE_IF("crash_commit_after_prepare", DBUG_ABORT(););
+
+  if (!is_real_trans)
+  {
+    error= commit_one_phase_2(thd, all, trans, is_real_trans);
+    DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
+    goto end;
+  }
+
+  cookie= tc_log->log_and_order(thd, xid, all, need_prepare_ordered,
+                                need_commit_ordered);
+  if (!cookie)
+    goto err;
+
+  DBUG_EXECUTE_IF("crash_commit_after_log", DBUG_ABORT(););
+
+  error= commit_one_phase_2(thd, all, trans, is_real_trans) ? 2 : 0;
+  DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
+
+  DBUG_EXECUTE_IF("crash_commit_before_unlog", DBUG_ABORT(););
+  tc_log->unlog(cookie, xid);
+
+  DBUG_EXECUTE_IF("crash_commit_after", DBUG_ABORT(););
+  goto end;
+
+  /* Come here if error and we need to rollback. */
+err:
+  if (!error)
+    error= 1;
+  ha_rollback_trans(thd, all);
+
+end:
+  if (rw_trans)
+    start_waiting_global_read_lock(thd);
 #endif /* USING_TRANSACTIONS */
   DBUG_RETURN(error);
 }
@@ -1207,7 +1237,6 @@ end:
 */
 int ha_commit_one_phase(THD *thd, bool all)
 {
-  int error=0;
   THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
   /*
     "real" is a nick name for a transaction for which a commit will
@@ -1217,11 +1246,44 @@ int ha_commit_one_phase(THD *thd, bool all)
     enclosing 'all' transaction is rolled back.
   */
   bool is_real_trans=all || thd->transaction.all.ha_list == 0;
-  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
+  Ha_trx_info *ha_info= trans->ha_list;
   DBUG_ENTER("ha_commit_one_phase");
 #ifdef USING_TRANSACTIONS
   if (ha_info)
   {
+    if (is_real_trans)
+    {
+      bool locked= false;
+      for (; ha_info; ha_info= ha_info->next())
+      {
+        handlerton *ht= ha_info->ht();
+        if (ht->commit_ordered)
+        {
+          if (ha_info->is_trx_read_write() && !locked)
+          {
+            pthread_mutex_lock(&LOCK_commit_ordered);
+            locked= 1;
+          }
+          ht->commit_ordered(ht, thd, all);
+        }
+      }
+      if (locked)
+        pthread_mutex_unlock(&LOCK_commit_ordered);
+    }
+  }
+#endif /* USING_TRANSACTIONS */
+  DBUG_RETURN(commit_one_phase_2(thd, all, trans, is_real_trans));
+}
+
+static int
+commit_one_phase_2(THD *thd, bool all, THD_TRANS *trans, bool is_real_trans)
+{
+  int error= 0;
+  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
+  DBUG_ENTER("commit_one_phase_2");
+#ifdef USING_TRANSACTIONS
+  if (ha_info)
+  {
     for (; ha_info; ha_info= ha_info_next)
     {
       int err;
diff --git a/sql/handler.h b/sql/handler.h
index d03264a23db..17dcc294099 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -656,9 +656,96 @@ struct handlerton
      NOTE 'all' is also false in auto-commit mode where 'end of statement'
      and 'real commit' mean the same event.
    */
-   int  (*commit)(handlerton *hton, THD *thd, bool all);
+   int (*commit)(handlerton *hton, THD *thd, bool all);
+   /*
+     The commit_ordered() method is called prior to the commit() method, after
+     the transaction manager has decided to commit (not rollback) the
+     transaction. Unlike commit(), commit_ordered() is called only when the
+     full transaction is committed, not for each commit of statement
+     transaction in a multi-statement transaction.
+
+     The calls to commit_ordered() in multiple parallel transactions is
+     guaranteed to happen in the same order in every participating
+     handler. This can be used to ensure the same commit order among multiple
+     handlers (eg. in table handler and binlog). So if transaction T1 calls
+     into commit_ordered() of handler A before T2, then T1 will also call
+     commit_ordered() of handler B before T2.
+
+     Engines that implement this method should during this call make the
+     transaction visible to other transactions, thereby making the order of
+     transaction commits be defined by the order of commit_ordered() calls.
+
+     The intension is that commit_ordered() should do the minimal amount of
+     work that needs to happen in consistent commit order among handlers. To
+     preserve ordering, calls need to be serialised on a global mutex, so
+     doing any time-consuming or blocking operations in commit_ordered() will
+     limit scalability.
+
+     Handlers can rely on commit_ordered() calls for transactions that updated
+     data to be serialised (no two calls can run in parallel, so no extra
+     locking on the handler part is required to ensure this). However, calls
+     for SELECT-only transactions are not serialised, so can occur in parallel
+     with each other and with at most one write-transaction.
+
+     Note that commit_ordered() can be called from a different thread than the
+     one handling the transaction! So it can not do anything that depends on
+     thread local storage, in particular it can not call my_error() and
+     friends (instead it can store the error code and delay the call of
+     my_error() to the commit() method).
+
+     Similarly, since commit_ordered() returns void, any return error code
+     must be saved and returned from the commit() method instead.
+
+     The commit_ordered method is optional, and can be left unset if not
+     needed in a particular handler.
+   */
+   void (*commit_ordered)(handlerton *hton, THD *thd, bool all);
    int  (*rollback)(handlerton *hton, THD *thd, bool all);
    int  (*prepare)(handlerton *hton, THD *thd, bool all);
+   /*
+     The prepare_ordered method is optional. If set, it will be called after
+     successful prepare() in all handlers participating in 2-phase
+     commit. Like commit_ordered(), it is called only when the full
+     transaction is committed, not for each commit of statement transaction.
+
+     The calls to prepare_ordered() among multiple parallel transactions are
+     ordered consistently with calls to commit_ordered(). This means that
+     calls to prepare_ordered() effectively define the commit order, and that
+     each handler will see the same sequence of transactions calling into
+     prepare_ordered() and commit_ordered().
+
+     Thus, prepare_ordered() can be used to define commit order for handlers
+     that need to do this in the prepare step (like binlog). It can also be
+     used to release transaction's locks early in an order consistent with the
+     order transactions will be eventually committed.
+
+     Like commit_ordered(), prepare_ordered() calls are serialised to maintain
+     ordering, so the intension is that they should execute fast, with only
+     the minimal amount of work needed to define commit order. Handlers can
+     rely on this serialisation, and do not need to do any extra locking to
+     avoid two prepare_ordered() calls running in parallel.
+
+     Like commit_ordered(), prepare_ordered() is not guaranteed to be called
+     in the context of the thread handling the rest of the transaction. So it
+     cannot invoke code that relies on thread local storage, in particular it
+     cannot call my_error().
+
+     When prepare_ordered() is called, the transaction coordinator has already
+     decided to commit (not rollback) the transaction. So prepare_ordered()
+     cannot cause a rollback by returning an error, all possible errors must
+     be handled in prepare() (the prepare_ordered() method returns void). In
+     case of some fatal error, a record of the error must be made internally
+     by the engine and returned from commit() later.
+
+     Note that for user-level XA SQL commands, no consistent ordering among
+     prepare_ordered() and commit_ordered() is guaranteed (as that would
+     require blocking all other commits for an indefinite time).
+
+     When 2-phase commit is not used (eg. only one engine (and no binlog) in
+     transaction), prepare() is not called and in such cases prepare_ordered()
+     also is not called.
+   */
+   void (*prepare_ordered)(handlerton *hton, THD *thd, bool all);
    int  (*recover)(handlerton *hton, XID *xid_list, uint len);
    int  (*commit_by_xid)(handlerton *hton, XID *xid);
    int  (*rollback_by_xid)(handlerton *hton, XID *xid);
diff --git a/sql/log.cc b/sql/log.cc
index f52e68dd1b9..8440a835158 100644
--- a/sql/log.cc
+++ b/sql/log.cc
@@ -38,6 +38,7 @@
 #endif
 
 #include <mysql/plugin.h>
+#include "debug_sync.h"
 
 /* max size of the log message */
 #define MAX_LOG_BUFFER_SIZE 1024
@@ -154,9 +155,12 @@ class binlog_trx_data {
 public:
   binlog_trx_data()
     : at_least_one_stmt_committed(0), incident(FALSE), m_pending(0),
-    before_stmt_pos(MY_OFF_T_UNDEF)
+    before_stmt_pos(MY_OFF_T_UNDEF), using_xa(0)
   {
     trans_log.end_of_file= max_binlog_cache_size;
+    (void) my_pthread_mutex_init(&LOCK_group_commit, MY_MUTEX_INIT_SLOW,
+                                 "LOCK_group_commit", MYF(0));
+    (void) pthread_cond_init(&COND_group_commit, 0);
   }
 
   ~binlog_trx_data()
@@ -208,11 +212,12 @@ public:
     completely.
    */
   void reset() {
-    if (!empty())
+    if (trans_log.type != WRITE_CACHE || !empty())
       truncate(0);
     before_stmt_pos= MY_OFF_T_UNDEF;
     incident= FALSE;
     trans_log.end_of_file= max_binlog_cache_size;
+    using_xa= FALSE;
     DBUG_ASSERT(empty());
   }
 
@@ -257,6 +262,41 @@ public:
     Binlog position before the start of the current statement.
   */
   my_off_t before_stmt_pos;
+
+  /* 0 or error when writing to binlog; set during group commit. */
+  int error;
+  /* If error != 0, value of errno (for my_error() reporting). */
+  int commit_errno;
+  /* Link for queueing transactions up for group commit to binlog. */
+  binlog_trx_data *next;
+  /*
+    Flag set true when group commit for this transaction is finished; used
+    with pthread_cond_wait() to wait until commit is done.
+    This flag is protected by LOCK_group_commit.
+  */
+  bool done;
+  /*
+    Flag set if this transaction is the group commit leader that will handle
+    the actual writing to the binlog.
+    This flag is protected by LOCK_group_commit.
+  */
+  bool group_commit_leader;
+  /*
+    Flag set true if this transaction is committed with log_xid() as part of
+    XA, false if not.
+  */
+  bool using_xa;
+  /*
+    Extra events (BEGIN, COMMIT/ROLLBACK/XID, and possibly INCIDENT) to be
+    written during group commit. The incident_event is only valid if
+    has_incident() is true.
+  */
+  Log_event *begin_event;
+  Log_event *end_event;
+  Log_event *incident_event;
+  /* Mutex and condition for wakeup after group commit. */
+  pthread_mutex_t LOCK_group_commit;
+  pthread_cond_t COND_group_commit;
 };
 
 handlerton *binlog_hton;
@@ -1391,117 +1431,188 @@ static int binlog_close_connection(handlerton *hton, THD *thd)
   return 0;
 }
 
+/* Helper functions for binlog_flush_trx_cache(). */
+static int
+binlog_flush_trx_cache_prepare(THD *thd)
+{
+  if (thd->binlog_flush_pending_rows_event(TRUE))
+    return 1;
+  return 0;
+}
+
+static void
+binlog_flush_trx_cache_finish(THD *thd, binlog_trx_data *trx_data)
+{
+  IO_CACHE *trans_log= &trx_data->trans_log;
+
+  trx_data->reset();
+
+  statistic_increment(binlog_cache_use, &LOCK_status);
+  if (trans_log->disk_writes != 0)
+  {
+    statistic_increment(binlog_cache_disk_use, &LOCK_status);
+    trans_log->disk_writes= 0;
+  }
+}
+
+/*
+  End a transaction, writing events to the binary log.
+
+  SYNOPSIS
+    binlog_flush_trx_cache()
+
+    thd      The thread whose transaction should be ended
+    trx_data Pointer to the transaction data to use
+    end_ev   The end event to use (COMMIT, ROLLBACK, or commit XID)
+
+  DESCRIPTION
+
+    End the currently open transaction. The transaction can be either
+    a real transaction or a statement transaction.
+
+    This can be to commit a transaction, with a COMMIT query event or an XA
+    commit XID event. But it can also be to rollback a transaction with a
+    ROLLBACK query event, used for rolling back transactions which also
+    contain updates to non-transactional tables.
+ */
+static int
+binlog_flush_trx_cache(THD *thd, binlog_trx_data *trx_data,
+                       Log_event *end_ev)
+{
+  DBUG_ENTER("binlog_flush_trx_cache");
+  DBUG_PRINT("info", ("thd->options={ %s%s}",
+                      FLAGSTR(thd->options, OPTION_NOT_AUTOCOMMIT),
+                      FLAGSTR(thd->options, OPTION_BEGIN)));
+
+  if (binlog_flush_trx_cache_prepare(thd))
+    DBUG_RETURN(1);
+
+  /*
+    Doing a commit or a rollback including non-transactional tables,
+    i.e., ending a transaction where we might write the transaction
+    cache to the binary log.
+
+    We can always end the statement when ending a transaction since
+    transactions are not allowed inside stored functions.  If they
+    were, we would have to ensure that we're not ending a statement
+    inside a stored function.
+   */
+  int error= mysql_bin_log.write_transaction_to_binlog(thd, trx_data, end_ev);
+
+  binlog_flush_trx_cache_finish(thd, trx_data);
+
+  DBUG_ASSERT(thd->binlog_get_pending_rows_event() == NULL);
+  DBUG_RETURN(error);
+}
+
 /*
-  End a transaction.
+  Discard a transaction, ie. ROLLBACK with only transactional table updates.
 
   SYNOPSIS
-    binlog_end_trans()
+    binlog_truncate_trx_cache()
 
     thd      The thread whose transaction should be ended
     trx_data Pointer to the transaction data to use
-    end_ev   The end event to use, or NULL
     all      True if the entire transaction should be ended, false if
              only the statement transaction should be ended.
 
   DESCRIPTION
 
-    End the currently open transaction. The transaction can be either
-    a real transaction (if 'all' is true) or a statement transaction
-    (if 'all' is false).
+    Rollback (and end) a transaction that only modifies transactional
+    tables. The transaction can be either a real transaction (if 'all' is
+    true) or a statement transaction (if 'all' is false).
 
-    If 'end_ev' is NULL, the transaction is a rollback of only
-    transactional tables, so the transaction cache will be truncated
-    to either just before the last opened statement transaction (if
-    'all' is false), or reset completely (if 'all' is true).
+    The transaction cache will be truncated to either just before the last
+    opened statement transaction (if 'all' is false), or reset completely (if
+    'all' is true).
  */
 static int
-binlog_end_trans(THD *thd, binlog_trx_data *trx_data,
-                 Log_event *end_ev, bool all)
+binlog_truncate_trx_cache(THD *thd, binlog_trx_data *trx_data, bool all)
 {
-  DBUG_ENTER("binlog_end_trans");
-  int error=0;
-  IO_CACHE *trans_log= &trx_data->trans_log;
-  DBUG_PRINT("enter", ("transaction: %s  end_ev: 0x%lx",
-                       all ? "all" : "stmt", (long) end_ev));
+  DBUG_ENTER("binlog_truncate_trx_cache");
+  int error= 0;
+  DBUG_PRINT("enter", ("transaction: %s", all ? "all" : "stmt"));
   DBUG_PRINT("info", ("thd->options={ %s%s}",
                       FLAGSTR(thd->options, OPTION_NOT_AUTOCOMMIT),
                       FLAGSTR(thd->options, OPTION_BEGIN)));
 
   /*
-    NULL denotes ROLLBACK with nothing to replicate: i.e., rollback of
-    only transactional tables.  If the transaction contain changes to
-    any non-transactiona tables, we need write the transaction and log
-    a ROLLBACK last.
+    ROLLBACK with nothing to replicate: i.e., rollback of only transactional
+    tables.
   */
-  if (end_ev != NULL)
-  {
-    if (thd->binlog_flush_pending_rows_event(TRUE))
-      DBUG_RETURN(1);
-    /*
-      Doing a commit or a rollback including non-transactional tables,
-      i.e., ending a transaction where we might write the transaction
-      cache to the binary log.
-
-      We can always end the statement when ending a transaction since
-      transactions are not allowed inside stored functions.  If they
-      were, we would have to ensure that we're not ending a statement
-      inside a stored function.
-     */
-    error= mysql_bin_log.write(thd, &trx_data->trans_log, end_ev,
-                               trx_data->has_incident());
-    trx_data->reset();
 
-    /*
-      We need to step the table map version after writing the
-      transaction cache to disk.
-    */
-    mysql_bin_log.update_table_map_version();
-    statistic_increment(binlog_cache_use, &LOCK_status);
-    if (trans_log->disk_writes != 0)
-    {
-      statistic_increment(binlog_cache_disk_use, &LOCK_status);
-      trans_log->disk_writes= 0;
-    }
-  }
-  else
-  {
-    /*
-      If rolling back an entire transaction or a single statement not
-      inside a transaction, we reset the transaction cache.
-
-      If rolling back a statement in a transaction, we truncate the
-      transaction cache to remove the statement.
-     */
-    thd->binlog_remove_pending_rows_event(TRUE);
-    if (all || !(thd->options & (OPTION_BEGIN | OPTION_NOT_AUTOCOMMIT)))
-    {
-      if (trx_data->has_incident())
-        error= mysql_bin_log.write_incident(thd, TRUE);
-      trx_data->reset();
-    }
-    else                                        // ...statement
-      trx_data->truncate(trx_data->before_stmt_pos);
+  /*
+    If rolling back an entire transaction or a single statement not
+    inside a transaction, we reset the transaction cache.
 
-    /*
-      We need to step the table map version on a rollback to ensure
-      that a new table map event is generated instead of the one that
-      was written to the thrown-away transaction cache.
-    */
-    mysql_bin_log.update_table_map_version();
+    If rolling back a statement in a transaction, we truncate the
+    transaction cache to remove the statement.
+   */
+  thd->binlog_remove_pending_rows_event(TRUE);
+  if (all || !(thd->options & (OPTION_BEGIN | OPTION_NOT_AUTOCOMMIT)))
+  {
+    if (trx_data->has_incident())
+      error= mysql_bin_log.write_incident(thd);
+    trx_data->reset();
   }
+  else                                        // ...statement
+    trx_data->truncate(trx_data->before_stmt_pos);
 
   DBUG_ASSERT(thd->binlog_get_pending_rows_event() == NULL);
   DBUG_RETURN(error);
 }
 
+static LEX_STRING const write_error_msg=
+    { C_STRING_WITH_LEN("error writing to the binary log") };
+
 static int binlog_prepare(handlerton *hton, THD *thd, bool all)
 {
   /*
-    do nothing.
-    just pretend we can do 2pc, so that MySQL won't
-    switch to 1pc.
-    real work will be done in MYSQL_BIN_LOG::log_xid()
+    If this prepare is for a single statement in the middle of a transactions,
+    not the actual transaction commit, then we do nothing. The real work is
+    only done later, in the prepare for making persistent changes.
   */
+  if (!all && (thd->options & (OPTION_BEGIN | OPTION_NOT_AUTOCOMMIT)))
+    return 0;
+
+  binlog_trx_data *trx_data=
+    (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton);
+
+  trx_data->using_xa= TRUE;
+
+  if (binlog_flush_trx_cache_prepare(thd))
+    return 1;
+
+  my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
+  if (!xid)
+  {
+    /* Skip logging this transaction, marked by setting end_event to NULL. */
+    trx_data->end_event= NULL;
+    return 0;
+  }
+
+  /*
+    Allocate the extra events that will be logged to the binlog in binlog group
+    commit. Use placement new to allocate them on the THD memroot, as they need
+    to remain live until log_xid() returns.
+   */
+  size_t needed_size= sizeof(Query_log_event) + sizeof(Xid_log_event);
+  if (trx_data->has_incident())
+    needed_size+= sizeof(Incident_log_event);
+  uchar *mem= (uchar *)thd->alloc(needed_size);
+  if (!mem)
+    return 1;
+
+  trx_data->begin_event= new ((void *)mem)
+    Query_log_event(thd, STRING_WITH_LEN("BEGIN"), TRUE, TRUE, 0);
+  mem+= sizeof(Query_log_event);
+
+  trx_data->end_event= new ((void *)mem) Xid_log_event(thd, xid);
+
+  if (trx_data->has_incident())
+    trx_data->incident_event= new ((void *)(mem + sizeof(Xid_log_event)))
+      Incident_log_event(thd, INCIDENT_LOST_EVENTS, write_error_msg);
+
   return 0;
 }
 
@@ -1525,11 +1636,11 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all)
   binlog_trx_data *const trx_data=
     (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton);
 
-  if (trx_data->empty())
+  if (trx_data->using_xa)
   {
     // we're here because trans_log was flushed in MYSQL_BIN_LOG::log_xid()
-    trx_data->reset();
-    DBUG_RETURN(0);
+    binlog_flush_trx_cache_finish(thd, trx_data);
+    DBUG_RETURN(error);
   }
 
   /*
@@ -1556,8 +1667,8 @@ static int binlog_commit(handlerton *hton, THD *thd, bool all)
        !stmt_has_updated_trans_table(thd) &&
        thd->transaction.stmt.modified_non_trans_table))
   {
-    Query_log_event qev(thd, STRING_WITH_LEN("COMMIT"), TRUE, TRUE, 0);
-    error= binlog_end_trans(thd, trx_data, &qev, all);
+    Query_log_event end_ev(thd, STRING_WITH_LEN("COMMIT"), TRUE, TRUE, 0);
+    error= binlog_flush_trx_cache(thd, trx_data, &end_ev);
   }
 
   trx_data->at_least_one_stmt_committed = my_b_tell(&trx_data->trans_log) > 0;
@@ -1621,7 +1732,7 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all)
         (thd->options & OPTION_KEEP_LOG)) &&
         mysql_bin_log.check_write_error(thd))
       trx_data->set_incident();
-    error= binlog_end_trans(thd, trx_data, 0, all);
+    error= binlog_truncate_trx_cache(thd, trx_data, all);
   }
   else
   {
@@ -1641,8 +1752,8 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all)
          thd->current_stmt_binlog_row_based) ||
         ((thd->options & OPTION_KEEP_LOG)))
     {
-      Query_log_event qev(thd, STRING_WITH_LEN("ROLLBACK"), TRUE, TRUE, 0);
-      error= binlog_end_trans(thd, trx_data, &qev, all);
+      Query_log_event end_ev(thd, STRING_WITH_LEN("ROLLBACK"), TRUE, TRUE, 0);
+      error= binlog_flush_trx_cache(thd, trx_data, &end_ev);
     }
     /*
       Otherwise, we simply truncate the cache as there is no change on
@@ -1650,7 +1761,7 @@ static int binlog_rollback(handlerton *hton, THD *thd, bool all)
     */
     else if ((all && !thd->transaction.all.modified_non_trans_table) ||
           (!all && !thd->transaction.stmt.modified_non_trans_table))
-      error= binlog_end_trans(thd, trx_data, 0, all);
+      error= binlog_truncate_trx_cache(thd, trx_data, all);
   }
   if (!all)
     trx_data->before_stmt_pos = MY_OFF_T_UNDEF; // part of the stmt rollback
@@ -2464,7 +2575,7 @@ const char *MYSQL_LOG::generate_name(const char *log_name,
 
 MYSQL_BIN_LOG::MYSQL_BIN_LOG()
   :bytes_written(0), prepared_xids(0), file_id(1), open_count(1),
-   need_start_event(TRUE), m_table_map_version(0),
+   need_start_event(TRUE),
    is_relay_log(0),
    description_event_for_exec(0), description_event_for_queue(0)
 {
@@ -2492,6 +2603,7 @@ void MYSQL_BIN_LOG::cleanup()
     delete description_event_for_exec;
     (void) pthread_mutex_destroy(&LOCK_log);
     (void) pthread_mutex_destroy(&LOCK_index);
+    (void) pthread_mutex_destroy(&LOCK_queue);
     (void) pthread_cond_destroy(&update_cond);
   }
   DBUG_VOID_RETURN;
@@ -2520,6 +2632,8 @@ void MYSQL_BIN_LOG::init_pthread_objects()
   */ 
   (void) my_pthread_mutex_init(&LOCK_index, MY_MUTEX_INIT_SLOW, "LOCK_index",
                                MYF_NO_DEADLOCK_DETECTION);
+  (void) my_pthread_mutex_init(&LOCK_queue, MY_MUTEX_INIT_FAST, "LOCK_queue",
+                               MYF(0));
   (void) pthread_cond_init(&update_cond, 0);
 }
 
@@ -3943,6 +4057,10 @@ err:
 }
 
 
+#ifndef DBUG_OFF
+static ulong opt_binlog_dbug_fsync_sleep= 0;
+#endif
+
 bool MYSQL_BIN_LOG::flush_and_sync()
 {
   int err=0, fd=log_file.file;
@@ -3953,6 +4071,11 @@ bool MYSQL_BIN_LOG::flush_and_sync()
   {
     sync_binlog_counter= 0;
     err=my_sync(fd, MYF(MY_WME));
+#ifndef DBUG_OFF
+    ulong usec_sleep= opt_binlog_dbug_fsync_sleep;
+    if (usec_sleep > 0)
+      my_sleep(usec_sleep);
+#endif
   }
   return err;
 }
@@ -4113,7 +4236,6 @@ int THD::binlog_write_table_map(TABLE *table, bool is_trans)
     DBUG_RETURN(error);
 
   binlog_table_maps++;
-  table->s->table_map_version= mysql_bin_log.table_map_version();
   DBUG_RETURN(0);
 }
 
@@ -4194,64 +4316,41 @@ MYSQL_BIN_LOG::flush_and_set_pending_rows_event(THD *thd,
 
   if (Rows_log_event* pending= trx_data->pending())
   {
-    IO_CACHE *file= &log_file;
-
     /*
       Decide if we should write to the log file directly or to the
       transaction log.
     */
     if (pending->get_cache_stmt() || my_b_tell(&trx_data->trans_log))
-      file= &trx_data->trans_log;
-
-    /*
-      If we are writing to the log file directly, we could avoid
-      locking the log. This does not work since we need to step the
-      m_table_map_version below, and that change has to be protected
-      by the LOCK_log mutex.
-    */
-    pthread_mutex_lock(&LOCK_log);
-
-    /*
-      Write pending event to log file or transaction cache
-    */
-    if (pending->write(file))
     {
-      pthread_mutex_unlock(&LOCK_log);
-      set_write_error(thd);
-      DBUG_RETURN(1);
+      /* Write to transaction log/cache. */
+      if (pending->write(&trx_data->trans_log))
+      {
+        set_write_error(thd);
+        DBUG_RETURN(1);
+      }
     }
-
-    /*
-      We step the table map version if we are writing an event
-      representing the end of a statement.  We do this regardless of
-      wheather we write to the transaction cache or to directly to the
-      file.
-
-      In an ideal world, we could avoid stepping the table map version
-      if we were writing to a transaction cache, since we could then
-      reuse the table map that was written earlier in the transaction
-      cache.  This does not work since STMT_END_F implies closing all
-      table mappings on the slave side.
-
-      TODO: Find a solution so that table maps does not have to be
-      written several times within a transaction.
-     */
-    if (pending->get_flags(Rows_log_event::STMT_END_F))
-      ++m_table_map_version;
-
-    delete pending;
-
-    if (file == &log_file)
+    else
     {
+      /* Write directly to log file. */
+      pthread_mutex_lock(&LOCK_log);
+      if (pending->write(&log_file))
+      {
+        pthread_mutex_unlock(&LOCK_log);
+        set_write_error(thd);
+        DBUG_RETURN(1);
+      }
+
       error= flush_and_sync();
       if (!error)
       {
         signal_update();
         rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED);
       }
+
+      pthread_mutex_unlock(&LOCK_log);
     }
 
-    pthread_mutex_unlock(&LOCK_log);
+    delete pending;
   }
 
   thd->binlog_set_pending_rows_event(event);
@@ -4450,9 +4549,6 @@ err:
       set_write_error(thd);
   }
 
-  if (event_info->flags & LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F)
-    ++m_table_map_version;
-
   pthread_mutex_unlock(&LOCK_log);
   DBUG_RETURN(error);
 }
@@ -4575,18 +4671,14 @@ uint MYSQL_BIN_LOG::next_file_id()
   SYNOPSIS
     write_cache()
     cache    Cache to write to the binary log
-    lock_log True if the LOCK_log mutex should be aquired, false otherwise
-    sync_log True if the log should be flushed and sync:ed
 
   DESCRIPTION
     Write the contents of the cache to the binary log. The cache will
     be reset as a READ_CACHE to be able to read the contents from it.
  */
 
-int MYSQL_BIN_LOG::write_cache(IO_CACHE *cache, bool lock_log, bool sync_log)
+int MYSQL_BIN_LOG::write_cache(IO_CACHE *cache)
 {
-  Mutex_sentry sentry(lock_log ? &LOCK_log : NULL);
-
   if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
     return ER_ERROR_ON_WRITE;
   uint length= my_b_bytes_in_cache(cache), group, carry, hdr_offs;
@@ -4697,6 +4789,7 @@ int MYSQL_BIN_LOG::write_cache(IO_CACHE *cache, bool lock_log, bool sync_log)
     }
 
     /* Write data to the binary log file */
+    DBUG_EXECUTE_IF("fail_binlog_write_1", return ER_ERROR_ON_WRITE;);
     if (my_b_write(&log_file, cache->read_pos, length))
       return ER_ERROR_ON_WRITE;
     cache->read_pos=cache->read_end;		// Mark buffer used up
@@ -4704,9 +4797,6 @@ int MYSQL_BIN_LOG::write_cache(IO_CACHE *cache, bool lock_log, bool sync_log)
 
   DBUG_ASSERT(carry == 0);
 
-  if (sync_log)
-    flush_and_sync();
-
   return 0;                                     // All OK
 }
 
@@ -4739,26 +4829,22 @@ int query_error_code(THD *thd, bool not_killed)
   return error;
 }
 
-bool MYSQL_BIN_LOG::write_incident(THD *thd, bool lock)
+bool MYSQL_BIN_LOG::write_incident(THD *thd)
 {
   uint error= 0;
   DBUG_ENTER("MYSQL_BIN_LOG::write_incident");
-  LEX_STRING const write_error_msg=
-    { C_STRING_WITH_LEN("error writing to the binary log") };
   Incident incident= INCIDENT_LOST_EVENTS;
   Incident_log_event ev(thd, incident, write_error_msg);
-  if (lock)
-    pthread_mutex_lock(&LOCK_log);
+
+  pthread_mutex_lock(&LOCK_log);
   error= ev.write(&log_file);
-  if (lock)
+  if (!error && !(error= flush_and_sync()))
   {
-    if (!error && !(error= flush_and_sync()))
-    {
-      signal_update();
-      rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED);
-    }
-    pthread_mutex_unlock(&LOCK_log);
+    signal_update();
+    rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED);
   }
+  pthread_mutex_unlock(&LOCK_log);
+
   DBUG_RETURN(error);
 }
 
@@ -4786,103 +4872,366 @@ bool MYSQL_BIN_LOG::write_incident(THD *thd, bool lock)
     'cache' needs to be reinitialized after this functions returns.
 */
 
-bool MYSQL_BIN_LOG::write(THD *thd, IO_CACHE *cache, Log_event *commit_event,
-                          bool incident)
+bool
+MYSQL_BIN_LOG::write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data,
+                                           Log_event *end_ev)
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::write_transaction_to_binlog");
+
+  /*
+    Create the necessary events here, where we have the correct THD (and
+    thread context).
+
+    Due to group commit the actual writing to binlog may happen in a different
+    thread.
+  */
+  Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, TRUE, 0);
+  trx_data->begin_event= &qinfo;
+  trx_data->end_event= end_ev;
+  if (trx_data->has_incident())
+  {
+    Incident_log_event inc_ev(thd, INCIDENT_LOST_EVENTS, write_error_msg);
+    trx_data->incident_event= &inc_ev;
+    DBUG_RETURN(write_transaction_to_binlog_events(trx_data));
+  }
+  else
+  {
+    trx_data->incident_event= NULL;
+    DBUG_RETURN(write_transaction_to_binlog_events(trx_data));
+  }
+}
+
+bool
+MYSQL_BIN_LOG::write_transaction_to_binlog_events(binlog_trx_data *trx_data)
 {
-  DBUG_ENTER("MYSQL_BIN_LOG::write(THD *, IO_CACHE *, Log_event *)");
+  /*
+    To facilitate group commit for the binlog, we first queue up ourselves in
+    the group commit queue. Then the first thread to enter the queue waits for
+    the LOCK_log mutex, and commits for everyone in the queue once it gets the
+    lock. Any other threads in the queue just wait for the first one to finish
+    the commit and wake them up.
+  */
+
+  pthread_mutex_lock(&trx_data->LOCK_group_commit);
+  const binlog_trx_data *orig_queue= atomic_enqueue_trx(trx_data);
+
+  if (orig_queue != NULL)
+  {
+    trx_data->group_commit_leader= FALSE;
+    trx_data->done= FALSE;
+    trx_group_commit_participant(trx_data);
+  }
+  else
+  {
+    trx_data->group_commit_leader= TRUE;
+    pthread_mutex_unlock(&trx_data->LOCK_group_commit);
+    trx_group_commit_leader(NULL);
+  }
+
+  return trx_group_commit_finish(trx_data);
+}
+
+/*
+  Participate as secondary transaction in group commit.
+
+  Another thread is already waiting to obtain the LOCK_log, and should include
+  this thread in the group commit once the log is obtained. So here we put
+  ourself in the queue and wait to be signalled that the group commit is done.
+
+  Note that this function must be called with the trs_data->LOCK_group_commit
+  locked; the mutex will be released before return.
+*/
+void
+MYSQL_BIN_LOG::trx_group_commit_participant(binlog_trx_data *trx_data)
+{
+  safe_mutex_assert_owner(&trx_data->LOCK_group_commit);
+
+  /* Wait until trx_data.done == true and woken up by the leader. */
+  while (!trx_data->done)
+    pthread_cond_wait(&trx_data->COND_group_commit,
+                      &trx_data->LOCK_group_commit);
+  pthread_mutex_unlock(&trx_data->LOCK_group_commit);
+}
+
+bool
+MYSQL_BIN_LOG::trx_group_commit_finish(binlog_trx_data *trx_data)
+{
+  DBUG_ENTER("MYSQL_BIN_LOG::trx_group_commit_finish");
+  DBUG_PRINT("info", ("trx_data->error=%d\n", trx_data->error));
+  if (trx_data->error)
+  {
+    switch (trx_data->error)
+    {
+    case ER_ERROR_ON_WRITE:
+      my_error(ER_ERROR_ON_WRITE, MYF(ME_NOREFRESH), name, trx_data->commit_errno);
+      break;
+    case ER_ERROR_ON_READ:
+      my_error(ER_ERROR_ON_READ, MYF(ME_NOREFRESH),
+               trx_data->trans_log.file_name, trx_data->commit_errno);
+      break;
+    default:
+      /*
+        There are not (and should not be) any errors thrown not covered above.
+        But just in case one is added later without updating the above switch
+        statement, include a catch-all.
+      */
+      my_printf_error(trx_data->error,
+                      "Error writing transaction to binary log: %d",
+                      MYF(ME_NOREFRESH), trx_data->error);
+    }
+
+    /*
+      Since we return error, this transaction XID will not be committed, so
+      we need to mark it as not needed for recovery (unlog() is not called
+      for a transaction if log_xid() fails).
+     */
+    if (trx_data->end_event->get_type_code() == XID_EVENT)
+      mark_xid_done();
+
+    DBUG_RETURN(1);
+  }
+
+  DBUG_RETURN(0);
+}
+
+/*
+  Do binlog group commit as the lead thread.
+
+  This must be called when this thread/transaction is queued at the start of
+  the group_commit_queue. It will wait to obtain the LOCK_log mutex, then group
+  commit all the transactions in the queue (more may have entered while waiting
+  for LOCK_log). After commit is done, all other threads in the queue will be
+  signalled.
+
+ */
+void
+MYSQL_BIN_LOG::trx_group_commit_leader(TC_group_commit_entry *first)
+{
+  uint xid_count= 0;
+  uint write_count= 0;
+
+  /* First, put anything from group_log_xid into the queue. */
+  binlog_trx_data *full_queue= NULL;
+  binlog_trx_data **next_ptr= &full_queue;
+  for (TC_group_commit_entry *entry= first; entry; entry= entry->next)
+  {
+    binlog_trx_data *const trx_data=
+      (binlog_trx_data*) thd_get_ha_data(entry->thd, binlog_hton);
+
+    /* Skip log_xid for transactions without xid, marked by NULL end_event. */
+    if (!trx_data->end_event)
+      continue;
+
+    trx_data->error= 0;
+    *next_ptr= trx_data;
+    next_ptr= &(trx_data->next);
+  }
+
+  /*
+    Next, lock the LOCK_log(), and once we get it, add any additional writes
+    that queued up while we were waiting.
+
+    Note that if some writer not going through log_xid() comes in and gets the
+    LOCK_log before us, they will not be able to include us in their group
+    commit (and they are not able to handle ensuring same commit order between
+    us and participating transactional storage engines anyway).
+
+    On the other hand, when we get the LOCK_log, we will be able to include
+    any non-trasactional writes that queued up in our group commit. This
+    should hopefully not be too big of a problem, as group commit is most
+    important for the transactional case anyway when durability (fsync) is
+    enabled.
+  */
   VOID(pthread_mutex_lock(&LOCK_log));
 
-  /* NULL would represent nothing to replicate after ROLLBACK */
-  DBUG_ASSERT(commit_event != NULL);
+  /*
+    As the queue is in reverse order of entering, reverse the queue as we add
+    it to the existing one. Note that there is no ordering defined between
+    transactional and non-transactional commits.
+  */
+  binlog_trx_data *current= atomic_grab_trx_queue();
+  binlog_trx_data *xtra_queue= NULL;
+  while (current)
+  {
+    current->error= 0;
+    binlog_trx_data *next= current->next;
+    current->next= xtra_queue;
+    xtra_queue= current;
+    current= next;
+  }
+  *next_ptr= xtra_queue;
 
+  /*
+    Now we have in full_queue the list of transactions to be committed in
+    order.
+  */
   DBUG_ASSERT(is_open());
   if (likely(is_open()))                       // Should always be true
   {
     /*
-      We only bother to write to the binary log if there is anything
-      to write.
-     */
-    if (my_b_tell(cache) > 0)
+      Commit every transaction in the queue.
+
+      Note that we are doing this in a different thread than the one running
+      the transaction! So we are limited in the operations we can do. In
+      particular, we cannot call my_error() on behalf of a transaction, as
+      that obtains the THD from thread local storage. Instead, we must set
+      current->error and let the thread do the error reporting itself once
+      we wake it up.
+    */
+    for (current= full_queue; current != NULL; current= current->next)
     {
-      /*
-        Log "BEGIN" at the beginning of every transaction.  Here, a
-        transaction is either a BEGIN..COMMIT block or a single
-        statement in autocommit mode.
-      */
-      Query_log_event qinfo(thd, STRING_WITH_LEN("BEGIN"), TRUE, TRUE, 0);
+      IO_CACHE *cache= &current->trans_log;
 
       /*
-        Now this Query_log_event has artificial log_pos 0. It must be
-        adjusted to reflect the real position in the log. Not doing it
-        would confuse the slave: it would prevent this one from
-        knowing where he is in the master's binlog, which would result
-        in wrong positions being shown to the user, MASTER_POS_WAIT
-        undue waiting etc.
+        We only bother to write to the binary log if there is anything
+        to write.
       */
-      if (qinfo.write(&log_file))
-        goto err;
-
-      DBUG_EXECUTE_IF("crash_before_writing_xid",
-                      {
-                        if ((write_error= write_cache(cache, false, true)))
-                          DBUG_PRINT("info", ("error writing binlog cache: %d",
-                                               write_error));
-                        DBUG_PRINT("info", ("crashing before writing xid"));
-                        abort();
-                      });
-
-      if ((write_error= write_cache(cache, false, false)))
-        goto err;
+      if (my_b_tell(cache) > 0)
+      {
+        current->error= write_transaction(current);
+        if (current->error)
+          current->commit_errno= errno;
 
-      if (commit_event && commit_event->write(&log_file))
-        goto err;
+        write_count++;
+      }
 
-      if (incident && write_incident(thd, FALSE))
-        goto err;
+      if (current->end_event->get_type_code() == XID_EVENT)
+        xid_count++;
+    }
 
+    if (write_count > 0)
+    {
       if (flush_and_sync())
-        goto err;
-      DBUG_EXECUTE_IF("half_binlogged_transaction", DBUG_ABORT(););
-      if (cache->error)				// Error on read
       {
-        sql_print_error(ER(ER_ERROR_ON_READ), cache->file_name, errno);
-        write_error=1;				// Don't give more errors
-        goto err;
+        for (current= full_queue; current != NULL; current= current->next)
+        {
+          if (!current->error)
+          {
+            current->error= ER_ERROR_ON_WRITE;
+            current->commit_errno= errno;
+          }
+        }
+      }
+      else
+      {
+        signal_update();
       }
-      signal_update();
     }
 
     /*
-      if commit_event is Xid_log_event, increase the number of
+      if any commit_events are Xid_log_event, increase the number of
       prepared_xids (it's decreasd in ::unlog()). Binlog cannot be rotated
       if there're prepared xids in it - see the comment in new_file() for
       an explanation.
-      If the commit_event is not Xid_log_event (then it's a Query_log_event)
-      rotate binlog, if necessary.
+      If no Xid_log_events (then it's all Query_log_event) rotate binlog,
+      if necessary.
     */
-    if (commit_event && commit_event->get_type_code() == XID_EVENT)
+    if (xid_count > 0)
     {
-      pthread_mutex_lock(&LOCK_prep_xids);
-      prepared_xids++;
-      pthread_mutex_unlock(&LOCK_prep_xids);
+      mark_xids_active(xid_count);
     }
     else
       rotate_and_purge(RP_LOCK_LOG_IS_ALREADY_LOCKED);
   }
+
   VOID(pthread_mutex_unlock(&LOCK_log));
 
-  DBUG_RETURN(0);
+  /*
+    Signal those that are not part of group_log_xid, and are not group leaders
+    running the queue.
 
-err:
-  if (!write_error)
+    Since a group leader runs the queue itself if a group_log_xid does not get
+    to do it forst, such leader threads do not need wait or wakeup.
+  */
+  for (current= xtra_queue; current != NULL; current= current->next)
   {
-    write_error= 1;
-    sql_print_error(ER(ER_ERROR_ON_WRITE), name, errno);
+    /*
+      Note that we need to take LOCK_group_commit even in the case of a leader!
+
+      Otherwise there is a race between setting and testing the
+      group_commit_leader flag.
+    */
+    pthread_mutex_lock(&current->LOCK_group_commit);
+    if (!current->group_commit_leader)
+    {
+      current->done= true;
+      pthread_cond_signal(&current->COND_group_commit);
+    }
+    pthread_mutex_unlock(&current->LOCK_group_commit);
   }
-  VOID(pthread_mutex_unlock(&LOCK_log));
-  DBUG_RETURN(1);
 }
 
+int
+MYSQL_BIN_LOG::write_transaction(binlog_trx_data *trx_data)
+{
+  IO_CACHE *cache= &trx_data->trans_log;
+  /*
+    Log "BEGIN" at the beginning of every transaction.  Here, a transaction is
+    either a BEGIN..COMMIT block or a single statement in autocommit mode. The
+    event was constructed in write_transaction_to_binlog(), in the thread
+    running the transaction.
+
+    Now this Query_log_event has artificial log_pos 0. It must be
+    adjusted to reflect the real position in the log. Not doing it
+    would confuse the slave: it would prevent this one from
+    knowing where he is in the master's binlog, which would result
+    in wrong positions being shown to the user, MASTER_POS_WAIT
+    undue waiting etc.
+  */
+  if (trx_data->begin_event->write(&log_file))
+    return ER_ERROR_ON_WRITE;
+
+  DBUG_EXECUTE_IF("crash_before_writing_xid",
+                  {
+                    if ((write_cache(cache)))
+                      DBUG_PRINT("info", ("error writing binlog cache"));
+                    else
+                      flush_and_sync();
+
+                    DBUG_PRINT("info", ("crashing before writing xid"));
+                    abort();
+                  });
+
+  if (write_cache(cache))
+    return ER_ERROR_ON_WRITE;
+
+  if (trx_data->end_event->write(&log_file))
+    return ER_ERROR_ON_WRITE;
+
+  if (trx_data->has_incident() && trx_data->incident_event->write(&log_file))
+    return ER_ERROR_ON_WRITE;
+
+  if (cache->error)				// Error on read
+    return ER_ERROR_ON_READ;
+
+  return 0;
+}
+
+binlog_trx_data *
+MYSQL_BIN_LOG::atomic_enqueue_trx(binlog_trx_data *trx_data)
+{
+  my_atomic_rwlock_wrlock(&LOCK_queue);
+  trx_data->next= group_commit_queue;
+  while (!my_atomic_casptr((void **)(&group_commit_queue),
+                           (void **)(&trx_data->next),
+                           trx_data))
+    ;
+  my_atomic_rwlock_wrunlock(&LOCK_queue);
+  return trx_data->next;
+}
+
+binlog_trx_data *
+MYSQL_BIN_LOG::atomic_grab_trx_queue()
+{
+  my_atomic_rwlock_wrlock(&LOCK_queue);
+  binlog_trx_data *queue= group_commit_queue;
+  while (!my_atomic_casptr((void **)(&group_commit_queue),
+                           (void **)(&queue),
+                           NULL))
+    ;
+  my_atomic_rwlock_wrunlock(&LOCK_queue);
+  return queue;
+}
 
 /**
   Wait until we get a signal that the binary log has been updated.
@@ -5276,6 +5625,344 @@ void sql_print_information(const char *format, ...)
 }
 
 
+static my_bool mutexes_inited;
+pthread_mutex_t LOCK_prepare_ordered;
+pthread_mutex_t LOCK_commit_ordered;
+
+void
+TC_init()
+{
+  my_pthread_mutex_init(&LOCK_prepare_ordered, MY_MUTEX_INIT_SLOW,
+                        "LOCK_prepare_ordered", MYF(0));
+  my_pthread_mutex_init(&LOCK_commit_ordered, MY_MUTEX_INIT_SLOW,
+                        "LOCK_commit_ordered", MYF(0));
+  mutexes_inited= TRUE;
+}
+
+void
+TC_destroy()
+{
+  if (mutexes_inited)
+  {
+    pthread_mutex_destroy(&LOCK_prepare_ordered);
+    pthread_mutex_destroy(&LOCK_commit_ordered);
+    mutexes_inited= FALSE;
+  }
+}
+
+void
+TC_LOG::run_prepare_ordered(THD *thd, bool all)
+{
+  Ha_trx_info *ha_info=
+    all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list;
+
+  for (; ha_info; ha_info= ha_info->next())
+  {
+    handlerton *ht= ha_info->ht();
+    if (!ht->prepare_ordered)
+      continue;
+    safe_mutex_assert_owner(&LOCK_prepare_ordered);
+    ht->prepare_ordered(ht, thd, all);
+  }
+}
+
+void
+TC_LOG::run_commit_ordered(THD *thd, bool all)
+{
+  Ha_trx_info *ha_info=
+    all ? thd->transaction.all.ha_list : thd->transaction.stmt.ha_list;
+
+  for (; ha_info; ha_info= ha_info->next())
+  {
+    handlerton *ht= ha_info->ht();
+    if (!ht->commit_ordered)
+      continue;
+    safe_mutex_assert_owner(&LOCK_commit_ordered);
+    ht->commit_ordered(ht, thd, all);
+    DEBUG_SYNC(thd, "commit_after_run_commit_ordered");
+  }
+}
+
+TC_LOG_queued::TC_LOG_queued() : group_commit_queue(NULL)
+{
+}
+
+TC_LOG_queued::~TC_LOG_queued()
+{
+}
+
+TC_LOG_queued::TC_group_commit_entry *
+TC_LOG_queued::reverse_queue(TC_LOG_queued::TC_group_commit_entry *queue)
+{
+  TC_group_commit_entry *entry= queue;
+  TC_group_commit_entry *prev= NULL;
+  while (entry)
+  {
+    TC_group_commit_entry *next= entry->next;
+    entry->next= prev;
+    prev= entry;
+    entry= next;
+  }
+
+  return prev;
+}
+
+void
+TC_LOG_queued::group_commit_wait_for_wakeup(TC_group_commit_entry *entry)
+{
+  THD *thd= entry->thd;
+  pthread_mutex_lock(&thd->LOCK_commit_ordered);
+  while (!entry->group_commit_ready)
+    pthread_cond_wait(&thd->COND_commit_ordered,
+                      &thd->LOCK_commit_ordered);
+  pthread_mutex_unlock(&thd->LOCK_commit_ordered);
+}
+
+void
+TC_LOG_queued::group_commit_wakeup_other(TC_group_commit_entry *other)
+{
+  THD *thd= other->thd;
+  pthread_mutex_lock(&thd->LOCK_commit_ordered);
+  other->group_commit_ready= TRUE;
+  pthread_cond_signal(&thd->COND_commit_ordered);
+  pthread_mutex_unlock(&thd->LOCK_commit_ordered);
+}
+
+TC_LOG_unordered::TC_LOG_unordered() : group_commit_queue_busy(0)
+{
+  pthread_cond_init(&COND_queue_busy, 0);
+}
+
+TC_LOG_unordered::~TC_LOG_unordered()
+{
+  pthread_cond_destroy(&COND_queue_busy);
+}
+
+int TC_LOG_unordered::log_and_order(THD *thd, my_xid xid, bool all,
+                                    bool need_prepare_ordered,
+                                    bool need_commit_ordered)
+{
+  int cookie;
+  struct TC_group_commit_entry entry;
+  bool is_group_commit_leader;
+  LINT_INIT(is_group_commit_leader);
+
+  if (need_prepare_ordered)
+  {
+    pthread_mutex_lock(&LOCK_prepare_ordered);
+    run_prepare_ordered(thd, all);
+    if (need_commit_ordered)
+    {
+      /*
+        Must put us in queue so we can run_commit_ordered() in same sequence
+        as we did run_prepare_ordered().
+      */
+      entry.thd= thd;
+      entry.group_commit_ready= false;
+      TC_group_commit_entry *previous_queue= group_commit_queue;
+      entry.next= previous_queue;
+      group_commit_queue= &entry;
+      is_group_commit_leader= (previous_queue == NULL);
+    }
+    pthread_mutex_unlock(&LOCK_prepare_ordered);
+  }
+
+  if (xid)
+    cookie= log_xid(thd, xid);
+  else
+    cookie= 0;
+
+  if (need_commit_ordered)
+  {
+    if (need_prepare_ordered)
+    {
+      /*
+        We did the run_prepare_ordered() serialised, then ran the log_xid() in
+        parallel. Now we have to do run_commit_ordered() serialised in the
+        same sequence as run_prepare_ordered().
+
+        We do this starting from the head of the queue, each thread doing
+        run_commit_ordered() and signalling the next in queue.
+      */
+      if (is_group_commit_leader)
+      {
+        /* The first in queue starts the ball rolling. */
+        pthread_mutex_lock(&LOCK_prepare_ordered);
+        while (group_commit_queue_busy)
+          pthread_cond_wait(&COND_queue_busy, &LOCK_prepare_ordered);
+        TC_group_commit_entry *queue= group_commit_queue;
+        group_commit_queue= NULL;
+        /*
+          Mark the queue busy while we bounce it from one thread to the
+          next.
+        */
+        group_commit_queue_busy= TRUE;
+        pthread_mutex_unlock(&LOCK_prepare_ordered);
+
+        queue= reverse_queue(queue);
+        DBUG_ASSERT(queue == &entry && queue->thd == thd);
+      }
+      else
+      {
+        /* Not first in queue; just wait until previous thread wakes us up. */
+        group_commit_wait_for_wakeup(&entry);
+      }
+    }
+
+    /* Only run commit_ordered() if log_xid was successful. */
+    if (cookie)
+    {
+      pthread_mutex_lock(&LOCK_commit_ordered);
+      run_commit_ordered(thd, all);
+      pthread_mutex_unlock(&LOCK_commit_ordered);
+    }
+
+    if (need_prepare_ordered)
+    {
+      TC_group_commit_entry *next= entry.next;
+      if (next)
+      {
+        group_commit_wakeup_other(next);
+      }
+      else
+      {
+        pthread_mutex_lock(&LOCK_prepare_ordered);
+        group_commit_queue_busy= FALSE;
+        pthread_cond_signal(&COND_queue_busy);
+        pthread_mutex_unlock(&LOCK_prepare_ordered);
+      }
+    }
+  }
+
+  return cookie;
+}
+
+
+TC_LOG_group_commit::TC_LOG_group_commit()
+  : num_commits(0), num_group_commits(0)
+{
+  my_pthread_mutex_init(&LOCK_group_commit, MY_MUTEX_INIT_SLOW,
+                        "LOCK_group_commit", MYF(0));
+}
+
+TC_LOG_group_commit::~TC_LOG_group_commit()
+{
+  pthread_mutex_destroy(&LOCK_group_commit);
+}
+
+int TC_LOG_group_commit::log_and_order(THD *thd, my_xid xid, bool all,
+                                       bool need_prepare_ordered,
+                                       bool need_commit_ordered)
+{
+  IF_DBUG(int err;)
+  int cookie;
+  struct TC_group_commit_entry entry;
+  bool is_group_commit_leader;
+
+  entry.thd= thd;
+  entry.all= all;
+  entry.group_commit_ready= false;
+  entry.xid_error= 0;
+
+  pthread_mutex_lock(&LOCK_prepare_ordered);
+  TC_group_commit_entry *previous_queue= group_commit_queue;
+  entry.next= previous_queue;
+  group_commit_queue= &entry;
+
+  DEBUG_SYNC(thd, "commit_before_prepare_ordered");
+  run_prepare_ordered(thd, all);
+  DEBUG_SYNC(thd, "commit_after_prepare_ordered");
+  pthread_mutex_unlock(&LOCK_prepare_ordered);
+
+  is_group_commit_leader= (previous_queue == NULL);
+
+  if (is_group_commit_leader)
+  {
+    TC_group_commit_entry *current;
+
+    pthread_mutex_lock(&LOCK_group_commit);
+    DEBUG_SYNC(thd, "commit_after_get_LOCK_group_commit");
+
+    pthread_mutex_lock(&LOCK_prepare_ordered);
+    TC_group_commit_entry *queue= group_commit_queue;
+    group_commit_queue= NULL;
+    pthread_mutex_unlock(&LOCK_prepare_ordered);
+
+    /*
+      Since we enqueue at the head, the queue is actually in reverse order.
+      So reverse it back into correct commit order before returning.
+    */
+    queue= reverse_queue(queue);
+
+    /* The first in the queue is the leader. */
+    DBUG_ASSERT(queue == &entry && queue->thd == thd);
+
+    DEBUG_SYNC(thd, "commit_before_group_log_xid");
+    /* This will set individual error codes in each thd->xid_error. */
+    group_log_xid(queue);
+    DEBUG_SYNC(thd, "commit_after_group_log_xid");
+
+    /*
+      Call commit_ordered methods for all transactions in the queue
+      (that did not get an error in group_log_xid()).
+
+      We do this under an additional global LOCK_commit_ordered; this is
+      so that transactions that do not need 2-phase commit do not have
+      to wait for the potentially long duration of LOCK_group_commit.
+    */
+    current= queue;
+
+    DEBUG_SYNC(thd, "commit_before_get_LOCK_commit_ordered");
+    pthread_mutex_lock(&LOCK_commit_ordered);
+    /*
+      We cannot unlock LOCK_group_commit until we have locked
+      LOCK_commit_ordered; otherwise scheduling could allow the next
+      group commit to run ahead of us, messing up the order of
+      commit_ordered() calls. But as soon as LOCK_commit_ordered is
+      obtained, we can let the next group commit start.
+    */
+    pthread_mutex_unlock(&LOCK_group_commit);
+    DEBUG_SYNC(thd, "commit_after_release_LOCK_group_commit");
+
+    ++num_group_commits;
+    do
+    {
+      ++num_commits;
+      if (!current->xid_error)
+        run_commit_ordered(current->thd, current->all);
+
+      /*
+        Careful not to access current->next_commit_ordered after waking up
+        the other thread! As it may change immediately after wakeup.
+      */
+      TC_group_commit_entry *next= current->next;
+      if (current != &entry)                    // Don't wake up ourself
+        group_commit_wakeup_other(current);
+      current= next;
+    } while (current != NULL);
+    DEBUG_SYNC(thd, "commit_after_group_run_commit_ordered");
+
+    pthread_mutex_unlock(&LOCK_commit_ordered);
+  }
+  else
+  {
+    /* If not leader, just wait until leader wakes us up. */
+    group_commit_wait_for_wakeup(&entry);
+  }
+
+  /*
+    Now that we're back in our own thread context, do any delayed processing
+    and error reporting.
+  */
+  IF_DBUG(err= entry.xid_error;)
+  cookie= xid_log_after(&entry);
+  /* The cookie must be non-zero in the non-error case. */
+  DBUG_ASSERT(err || cookie);
+
+  return cookie;
+}
+
+
 /********* transaction coordinator log for 2pc - mmap() based solution *******/
 
 /*
@@ -5878,30 +6565,68 @@ void TC_LOG_BINLOG::close()
   pthread_cond_destroy (&COND_prep_xids);
 }
 
-/**
-  @todo
-  group commit
+/*
+  Do a binlog log_xid() for a group of transactions, linked through
+  thd->next_commit_ordered.
+*/
+void
+TC_LOG_BINLOG::group_log_xid(TC_group_commit_entry *first)
+{
+  DBUG_ENTER("TC_LOG_BINLOG::group_log_xid");
+  trx_group_commit_leader(first);
+  for (TC_group_commit_entry *entry= first; entry; entry= entry->next)
+  {
+    binlog_trx_data *const trx_data=
+      (binlog_trx_data*) thd_get_ha_data(entry->thd, binlog_hton);
+    entry->xid_error= trx_data->error;
+  }
+  DBUG_VOID_RETURN;
+}
 
-  @retval
-    0    error
-  @retval
-    1    success
+int
+TC_LOG_BINLOG::xid_log_after(TC_group_commit_entry *entry)
+{
+  binlog_trx_data *const trx_data=
+    (binlog_trx_data*) thd_get_ha_data(entry->thd, binlog_hton);
+  if (trx_group_commit_finish(trx_data))
+    return 0;                           // Returning zero cookie signals error
+  else
+    return 1;
+}
+
+/*
+  After an XID is logged, we need to hold on to the current binlog file until
+  it is fully committed in the storage engine. The reason is that crash
+  recovery only looks at the latest binlog, so we must make sure there are no
+  outstanding prepared (but not committed) transactions before rotating the
+  binlog.
+
+  To handle this, we keep a count of outstanding XIDs. This function is used
+  to increase this count when committing one or more transactions to the
+  binary log.
 */
-int TC_LOG_BINLOG::log_xid(THD *thd, my_xid xid)
+void
+TC_LOG_BINLOG::mark_xids_active(uint xid_count)
 {
-  DBUG_ENTER("TC_LOG_BINLOG::log");
-  Xid_log_event xle(thd, xid);
-  binlog_trx_data *trx_data=
-    (binlog_trx_data*) thd_get_ha_data(thd, binlog_hton);
-  /*
-    We always commit the entire transaction when writing an XID. Also
-    note that the return value is inverted.
-   */
-  DBUG_RETURN(!binlog_end_trans(thd, trx_data, &xle, TRUE));
+  DBUG_ENTER("TC_LOG_BINLOG::mark_xids_active");
+  DBUG_PRINT("info", ("xid_count=%u", xid_count));
+  pthread_mutex_lock(&LOCK_prep_xids);
+  prepared_xids+= xid_count;
+  pthread_mutex_unlock(&LOCK_prep_xids);
+  DBUG_VOID_RETURN;
 }
 
-void TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
+/*
+  Once an XID is committed, it is safe to rotate the binary log, as it can no
+  longer be needed during crash recovery.
+
+  This function is called to mark an XID this way. It needs to decrease the
+  count of pending XIDs, and signal the log rotator thread when it reaches zero.
+*/
+void
+TC_LOG_BINLOG::mark_xid_done()
 {
+  DBUG_ENTER("TC_LOG_BINLOG::mark_xid_done");
   pthread_mutex_lock(&LOCK_prep_xids);
   DBUG_ASSERT(prepared_xids > 0);
   if (--prepared_xids == 0) {
@@ -5909,7 +6634,16 @@ void TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
     pthread_cond_signal(&COND_prep_xids);
   }
   pthread_mutex_unlock(&LOCK_prep_xids);
-  rotate_and_purge(0);     // as ::write() did not rotate
+  DBUG_VOID_RETURN;
+}
+
+void TC_LOG_BINLOG::unlog(ulong cookie, my_xid xid)
+{
+  DBUG_ENTER("TC_LOG_BINLOG::unlog");
+  if (xid)
+    mark_xid_done();
+  rotate_and_purge(0);     // as ::write_transaction_to_binlog() did not rotate
+  DBUG_VOID_RETURN;
 }
 
 int TC_LOG_BINLOG::recover(IO_CACHE *log, Format_description_log_event *fdle)
@@ -5981,6 +6715,72 @@ ulonglong mysql_bin_log_file_pos(void)
 #endif /* INNODB_COMPATIBILITY_HOOKS */
 
 
+static ulonglong binlog_status_var_num_commits;
+static ulonglong binlog_status_var_num_group_commits;
+
+static SHOW_VAR binlog_status_vars_detail[]=
+{
+  {"commits",
+    (char *)&binlog_status_var_num_commits, SHOW_LONGLONG},
+  {"group_commits",
+    (char *)&binlog_status_var_num_group_commits, SHOW_LONGLONG},
+  {NullS, NullS, SHOW_LONG}
+};
+
+static int show_binlog_vars(THD *thd, SHOW_VAR *var, char *buff)
+{
+  mysql_bin_log.set_status_variables();
+  var->type= SHOW_ARRAY;
+  var->value= (char *)&binlog_status_vars_detail;
+  return 0;
+}
+
+static SHOW_VAR binlog_status_vars_top[]= {
+  {"binlog", (char *) &show_binlog_vars, SHOW_FUNC},
+  {NullS, NullS, SHOW_LONG}
+};
+
+#ifndef DBUG_OFF
+static MYSQL_SYSVAR_ULONG(
+  dbug_fsync_sleep,
+  opt_binlog_dbug_fsync_sleep,
+  PLUGIN_VAR_RQCMDARG,
+  "Extra sleep (in microseconds) to add to binlog fsync(), for debugging",
+  NULL,
+  NULL,
+  0,
+  0,
+  ULONG_MAX,
+  0);
+
+static struct st_mysql_sys_var *binlog_sys_vars[]=
+{
+  MYSQL_SYSVAR(dbug_fsync_sleep),
+  NULL
+};
+#endif
+
+
+/*
+  Copy out current values of status variables, for SHOW STATUS or
+  information_schema.global_status.
+
+  This is called only under LOCK_status, so we can fill in a static array.
+*/
+void
+TC_LOG_BINLOG::set_status_variables()
+{
+  ulonglong num_commits, num_group_commits;
+
+  pthread_mutex_lock(&LOCK_commit_ordered);
+  num_commits= this->num_commits;
+  num_group_commits= this->num_group_commits;
+  pthread_mutex_unlock(&LOCK_commit_ordered);
+
+  binlog_status_var_num_commits= num_commits;
+  binlog_status_var_num_group_commits= num_group_commits;
+}
+
 struct st_mysql_storage_engine binlog_storage_engine=
 { MYSQL_HANDLERTON_INTERFACE_VERSION };
 
@@ -5995,8 +6795,12 @@ mysql_declare_plugin(binlog)
   binlog_init, /* Plugin Init */
   NULL, /* Plugin Deinit */
   0x0100 /* 1.0 */,
-  NULL,                       /* status variables                */
+  binlog_status_vars_top,     /* status variables                */
+#ifndef DBUG_OFF
+  binlog_sys_vars,            /* system variables                */
+#else
   NULL,                       /* system variables                */
+#endif
   NULL                        /* config options                  */
 }
 mysql_declare_plugin_end;
diff --git a/sql/log.h b/sql/log.h
index 8b5dfcb3935..ac0ebea6db4 100644
--- a/sql/log.h
+++ b/sql/log.h
@@ -33,11 +33,173 @@ class TC_LOG
 
   virtual int open(const char *opt_name)=0;
   virtual void close()=0;
-  virtual int log_xid(THD *thd, my_xid xid)=0;
+  virtual int log_and_order(THD *thd, my_xid xid, bool all,
+                            bool need_prepare_ordered,
+                            bool need_commit_ordered) = 0;
   virtual void unlog(ulong cookie, my_xid xid)=0;
+
+protected:
+  /*
+    These methods are meant to be invoked from log_and_order() implementations
+    to run any prepare_ordered() respectively commit_ordered() methods in
+    participating handlers.
+
+    They must be called using suitable thread syncronisation to ensure that
+    they are each called in the correct commit order among all
+    transactions. However, it is only necessary to call them if the
+    corresponding flag passed to log_and_order is set (it is safe, but not
+    required, to call them when the flag is false).
+
+    The caller must be holding LOCK_prepare_ordered respectively
+    LOCK_commit_ordered when calling these methods.
+  */
+  void run_prepare_ordered(THD *thd, bool all);
+  void run_commit_ordered(THD *thd, bool all);
+};
+
+/*
+  Locks used to ensure serialised execution of TC_LOG::run_prepare_ordered()
+  and TC_LOG::run_commit_ordered(), or any other code that calls handler
+  prepare_ordered() or commit_ordered() methods.
+*/
+extern pthread_mutex_t LOCK_prepare_ordered;
+extern pthread_mutex_t LOCK_commit_ordered;
+
+extern void TC_init();
+extern void TC_destroy();
+
+/*
+  Base class for two TC implementations TC_LOG_unordered and
+  TC_LOG_group_commit that both use a queue of threads waiting for group
+  commit.
+*/
+class TC_LOG_queued: public TC_LOG
+{
+protected:
+  TC_LOG_queued();
+  ~TC_LOG_queued();
+
+  /* Structure used to link list of THDs waiting for group commit. */
+  struct TC_group_commit_entry
+  {
+    struct TC_group_commit_entry *next;
+    THD *thd;
+    /* This is the `all' parameter for ha_commit_trans() etc. */
+    bool all;
+    /*
+      Flag set true when it is time for this thread to wake up after group
+      commit. Used with THD::LOCK_commit_ordered and THD::COND_commit_ordered.
+    */
+    bool group_commit_ready;
+    /*
+      Set by TC_LOG_group_commit::group_log_xid(), to return per-thd error and
+      cookie.
+    */
+    int xid_error;
+  };
+
+  TC_group_commit_entry * reverse_queue(TC_group_commit_entry *queue);
+
+  void group_commit_wait_for_wakeup(TC_group_commit_entry *entry);
+  void group_commit_wakeup_other(TC_group_commit_entry *other);
+
+  /*
+    This is a queue of threads waiting for being allowed to commit.
+    Access to the queue must be protected by LOCK_prepare_ordered.
+  */
+  TC_group_commit_entry *group_commit_queue;
+};
+
+class TC_LOG_unordered: public TC_LOG_queued
+{
+public:
+  TC_LOG_unordered();
+  ~TC_LOG_unordered();
+
+  int log_and_order(THD *thd, my_xid xid, bool all,
+                    bool need_prepare_ordered, bool need_commit_ordered);
+
+protected:
+  virtual int log_xid(THD *thd, my_xid xid)=0;
+
+private:
+  /*
+    This flag and condition is used to reserve the queue while threads in it
+    each run the commit_ordered() methods one after the other. Only once the
+    last commit_ordered() in the queue is done can we start on a new queue
+    run.
+
+    Since we start this process in the first thread in the queue and finish in
+    the last (and possibly different) thread, we need a condition variable for
+    this (we cannot unlock a mutex in a different thread than the one who
+    locked it).
+
+    The condition is used together with the LOCK_prepare_ordered mutex.
+  */
+  my_bool group_commit_queue_busy;
+  pthread_cond_t COND_queue_busy;
+};
+
+class TC_LOG_group_commit: public TC_LOG_queued
+{
+public:
+  TC_LOG_group_commit();
+  ~TC_LOG_group_commit();
+
+  int log_and_order(THD *thd, my_xid xid, bool all,
+                    bool need_prepare_ordered, bool need_commit_ordered);
+
+protected:
+  /* Total number of committed transactions. */
+  ulonglong num_commits;
+  /* Number of group commits done. */
+  ulonglong num_group_commits;
+
+  /*
+    When using this class, this method is used instead of log_xid() to do
+    logging of a group of transactions all at once.
+
+    The transactions will be linked through THD::next_commit_ordered.
+
+    Additionally, when this method is used instead of log_xid(), the order in
+    which handler->prepare_ordered() and handler->commit_ordered() are called
+    is guaranteed to be the same as the order of calls and THD list elements
+    for group_log_xid().
+
+    This can be used to efficiently implement group commit that at the same
+    time preserves the order of commits among handlers and TC (eg. to get same
+    commit order in InnoDB and binary log).
+
+    For TCs that do not need this, it can be preferable to use plain log_xid()
+    with class TC_LOG_unordered instead, as it allows threads to run log_xid()
+    in parallel with each other. In contrast, group_log_xid() runs under a
+    global mutex, so it is guaranteed that only once call into it will be
+    active at once.
+
+    Since this call handles multiple threads/THDs at once, my_error() (and
+    other code that relies on thread local storage) cannot be used in this
+    method. Instead, the implementation must record any error and report it as
+    the return value from xid_log_after(), which will be invoked individually
+    for each thread.
+
+    In the success case, this method must set thd->xid_cookie for each thread
+    to the cookie that is normally returned from log_xid() (which must be
+    non-zero in the non-error case).
+  */
+  virtual void group_log_xid(TC_group_commit_entry *first) = 0;
+  /*
+    Called for each transaction (in corrent thread context) after
+    group_log_xid() has finished, but with no guarantee on ordering among
+    threads.
+    Can be used to do error reporting etc. */
+  virtual int xid_log_after(TC_group_commit_entry *entry) = 0;
+
+private:
+  /* Mutex used to serialise calls to group_log_xid(). */
+  pthread_mutex_t LOCK_group_commit;
 };
 
-class TC_LOG_DUMMY: public TC_LOG // use it to disable the logging
+class TC_LOG_DUMMY: public TC_LOG_unordered // use it to disable the logging
 {
 public:
   TC_LOG_DUMMY() {}
@@ -48,7 +210,7 @@ public:
 };
 
 #ifdef HAVE_MMAP
-class TC_LOG_MMAP: public TC_LOG
+class TC_LOG_MMAP: public TC_LOG_unordered
 {
   public:                // only to keep Sun Forte on sol9x86 happy
   typedef enum {
@@ -227,12 +389,19 @@ private:
   time_t last_time;
 };
 
-class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
+class binlog_trx_data;
+class MYSQL_BIN_LOG: public TC_LOG_group_commit, private MYSQL_LOG
 {
  private:
   /* LOCK_log and LOCK_index are inited by init_pthread_objects() */
   pthread_mutex_t LOCK_index;
   pthread_mutex_t LOCK_prep_xids;
+  /*
+    Mutex to protect the queue of transactions waiting to participate in group
+    commit. (Only used on platforms without native atomic operations).
+  */
+  pthread_mutex_t LOCK_queue;
+
   pthread_cond_t  COND_prep_xids;
   pthread_cond_t update_cond;
   ulonglong bytes_written;
@@ -271,8 +440,8 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
     In 5.0 it's 0 for relay logs too!
   */
   bool no_auto_events;
-
-  ulonglong m_table_map_version;
+  /* Queue of transactions queued up to participate in group commit. */
+  binlog_trx_data *group_commit_queue;
 
   int write_to_file(IO_CACHE *cache);
   /*
@@ -282,6 +451,14 @@ class MYSQL_BIN_LOG: public TC_LOG, private MYSQL_LOG
   */
   void new_file_without_locking();
   void new_file_impl(bool need_lock);
+  int write_transaction(binlog_trx_data *trx_data);
+  bool write_transaction_to_binlog_events(binlog_trx_data *trx_data);
+  void trx_group_commit_participant(binlog_trx_data *trx_data);
+  void trx_group_commit_leader(TC_group_commit_entry *first);
+  binlog_trx_data *atomic_enqueue_trx(binlog_trx_data *trx_data);
+  binlog_trx_data *atomic_grab_trx_queue();
+  void mark_xid_done();
+  void mark_xids_active(uint xid_count);
 
 public:
   MYSQL_LOG::generate_name;
@@ -310,18 +487,11 @@ public:
 
   int open(const char *opt_name);
   void close();
-  int log_xid(THD *thd, my_xid xid);
+  void group_log_xid(TC_group_commit_entry *first);
+  int xid_log_after(TC_group_commit_entry *entry);
   void unlog(ulong cookie, my_xid xid);
   int recover(IO_CACHE *log, Format_description_log_event *fdle);
 #if !defined(MYSQL_CLIENT)
-  bool is_table_mapped(TABLE *table) const
-  {
-    return table->s->table_map_version == table_map_version();
-  }
-
-  ulonglong table_map_version() const { return m_table_map_version; }
-  void update_table_map_version() { ++m_table_map_version; }
-
   int flush_and_set_pending_rows_event(THD *thd, Rows_log_event* event);
   int remove_pending_rows_event(THD *thd);
 
@@ -362,10 +532,12 @@ public:
   void new_file();
 
   bool write(Log_event* event_info); // binary log write
-  bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event, bool incident);
-  bool write_incident(THD *thd, bool lock);
+  bool write_transaction_to_binlog(THD *thd, binlog_trx_data *trx_data,
+                                   Log_event *end_ev);
+  bool trx_group_commit_finish(binlog_trx_data *trx_data);
+  bool write_incident(THD *thd);
 
-  int  write_cache(IO_CACHE *cache, bool lock_log, bool flush_and_sync);
+  int  write_cache(IO_CACHE *cache);
   void set_write_error(THD *thd);
   bool check_write_error(THD *thd);
 
@@ -420,6 +592,7 @@ public:
   inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);}
   inline IO_CACHE *get_index_file() { return &index_file;}
   inline uint32 get_open_count() { return open_count; }
+  void set_status_variables();
 };
 
 class Log_event_handler
diff --git a/sql/log_event.h b/sql/log_event.h
index 36715b1d151..46d02f5d2c5 100644
--- a/sql/log_event.h
+++ b/sql/log_event.h
@@ -463,10 +463,9 @@ struct sql_ex_info
 #define LOG_EVENT_SUPPRESS_USE_F    0x8
 
 /*
-  The table map version internal to the log should be increased after
-  the event has been written to the binary log.
+  This used to be LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F, but is now unused.
  */
-#define LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F 0x10
+#define LOG_EVENT_UNUSED1_F 0x10
 
 /**
    @def LOG_EVENT_ARTIFICIAL_F
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 645b7498042..fd39b979f4c 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -1333,6 +1333,7 @@ void clean_up(bool print_message)
   ha_end();
   if (tc_log)
     tc_log->close();
+  TC_destroy();
   xid_cache_free();
   wt_end();
   delete_elements(&key_caches, (void (*)(const char*, uchar*)) free_key_cache);
@@ -4124,6 +4125,8 @@ a file name for --log-bin-index option", opt_binlog_index_name);
   if (!errmesg[0][0])
     unireg_abort(1);
 
+  TC_init();
+
   /* We have to initialize the storage engines before CSV logging */
   if (ha_init())
   {
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 580fe8057cd..8dbba6b2ec5 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -673,6 +673,8 @@ THD::THD()
   active_vio = 0;
 #endif
   pthread_mutex_init(&LOCK_thd_data, MY_MUTEX_INIT_FAST);
+  pthread_mutex_init(&LOCK_commit_ordered, MY_MUTEX_INIT_FAST);
+  pthread_cond_init(&COND_commit_ordered, 0);
 
   /* Variables with default values */
   proc_info="login";
@@ -999,6 +1001,8 @@ THD::~THD()
   free_root(&transaction.mem_root,MYF(0));
 #endif
   mysys_var=0;					// Safety (shouldn't be needed)
+  pthread_cond_destroy(&COND_commit_ordered);
+  pthread_mutex_destroy(&LOCK_commit_ordered);
   pthread_mutex_destroy(&LOCK_thd_data);
 #ifndef DBUG_OFF
   dbug_sentry= THD_SENTRY_GONE;
@@ -3773,7 +3777,6 @@ int THD::binlog_flush_pending_rows_event(bool stmt_end)
     if (stmt_end)
     {
       pending->set_flags(Rows_log_event::STMT_END_F);
-      pending->flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
       binlog_table_maps= 0;
     }
 
@@ -3901,7 +3904,6 @@ int THD::binlog_query(THD::enum_binlog_query_type qtype, char const *query_arg,
     {
       Query_log_event qinfo(this, query_arg, query_len, is_trans, suppress_use,
                             errcode);
-      qinfo.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
       /*
         Binlog table maps will be irrelevant after a Query_log_event
         (they are just removed on the slave side) so after the query
diff --git a/sql/sql_class.h b/sql/sql_class.h
index aa39ddb2b15..aa2933e4070 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -1438,6 +1438,10 @@ public:
   /* container for handler's private per-connection data */
   Ha_data ha_data[MAX_HA];
 
+  /* Mutex and condition for waking up threads after group commit. */
+  pthread_mutex_t LOCK_commit_ordered;
+  pthread_cond_t COND_commit_ordered;
+
 #ifndef MYSQL_CLIENT
   int binlog_setup_trx_data();
 
diff --git a/sql/sql_load.cc b/sql/sql_load.cc
index 82cc8f81b4a..441fe93aaef 100644
--- a/sql/sql_load.cc
+++ b/sql/sql_load.cc
@@ -516,7 +516,6 @@ int mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list,
 	  else
 	  {
 	    Delete_file_log_event d(thd, db, transactional_table);
-            d.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
 	    (void) mysql_bin_log.write(&d);
 	  }
 	}
@@ -698,7 +697,6 @@ static bool write_execute_load_query_log_event(THD *thd, sql_exchange* ex,
       (duplicates == DUP_REPLACE) ? LOAD_DUP_REPLACE :
       (ignore ? LOAD_DUP_IGNORE : LOAD_DUP_ERROR),
       transactional_table, FALSE, errcode);
-  e.flags|= LOG_EVENT_UPDATE_TABLE_MAP_VERSION_F;
   return mysql_bin_log.write(&e);
 }
 
diff --git a/sql/table.cc b/sql/table.cc
index 733aa3e6887..2ddde40778d 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -297,13 +297,6 @@ TABLE_SHARE *alloc_table_share(TABLE_LIST *table_list, char *key,
     share->version=       refresh_version;
 
     /*
-      This constant is used to mark that no table map version has been
-      assigned.  No arithmetic is done on the value: it will be
-      overwritten with a value taken from MYSQL_BIN_LOG.
-    */
-    share->table_map_version= ~(ulonglong)0;
-
-    /*
       Since alloc_table_share() can be called without any locking (for
       example, ha_create_table... functions), we do not assign a table
       map id here.  Instead we assign a value that is not used
@@ -367,10 +360,9 @@ void init_tmp_table_share(THD *thd, TABLE_SHARE *share, const char *key,
   share->frm_version= 		 FRM_VER_TRUE_VARCHAR;
 
   /*
-    Temporary tables are not replicated, but we set up these fields
+    Temporary tables are not replicated, but we set up this fields
     anyway to be able to catch errors.
    */
-  share->table_map_version= ~(ulonglong)0;
   share->cached_row_logging_check= -1;
 
   /*
diff --git a/sql/table.h b/sql/table.h
index a24e79e26cf..efc48090b3b 100644
--- a/sql/table.h
+++ b/sql/table.h
@@ -433,7 +433,6 @@ typedef struct st_table_share
   bool waiting_on_cond;                 /* Protection against free */
   bool deleting;                        /* going to delete this table */
   ulong table_map_id;                   /* for row-based replication */
-  ulonglong table_map_version;
 
   /*
     Cache for row-based replication table share checks that does not
author	unknown <knielsen@knielsen-hq.org>	2010-09-30 15:20:15 +0200
committer	unknown <knielsen@knielsen-hq.org>	2010-09-30 15:20:15 +0200
commit	0394cf203042eb6e408b9c88802c93444f226af9 (patch)
tree	7a5fefa05950b55557111426e9e83e4e48ce6c17 /sql
parent	e432151e9cf6a7a5ccf84fc137975ccf38fd0798 (diff)
download	mariadb-git-0394cf203042eb6e408b9c88802c93444f226af9.tar.gz