Diffstat (limited to 'sql/log.cc')
-rw-r--r-- | sql/log.cc | 1170
1 files changed, 878 insertions, 292 deletions
diff --git a/sql/log.cc b/sql/log.cc index c659a3ede2e..336e49b02da 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -25,7 +25,6 @@ #include "mysql_priv.h" #include "sql_acl.h" #include "sql_repl.h" -#include "ha_innodb.h" // necessary to cut the binlog when crash recovery #include <my_dir.h> #include <stdarg.h> @@ -40,23 +39,235 @@ ulong sync_binlog_counter= 0; static bool test_if_number(const char *str, long *res, bool allow_wildcards); +static int binlog_close_connection(THD *thd); +static int binlog_savepoint_set(THD *thd, void *sv); +static int binlog_savepoint_rollback(THD *thd, void *sv); +static int binlog_commit(THD *thd, bool all); +static int binlog_rollback(THD *thd, bool all); +static int binlog_prepare(THD *thd, bool all); + +static handlerton binlog_hton = { + 0, + sizeof(my_off_t), /* savepoint size = binlog offset */ + binlog_close_connection, + binlog_savepoint_set, + binlog_savepoint_rollback, + NULL, /* savepoint_release */ + binlog_commit, + binlog_rollback, + binlog_prepare, + NULL, /* recover */ + NULL, /* commit_by_xid */ + NULL /* rollback_by_xid */ +}; + +/* + this function is mostly a placeholder. + conceptually, binlog initialization (now mostly done in MYSQL_LOG::open) + should be moved here. +*/ + +handlerton *binlog_init() +{ + return &binlog_hton; +} + +static int binlog_close_connection(THD *thd) +{ + IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; + close_cached_file(trans_log); + my_free((gptr)trans_log, MYF(0)); + return 0; +} + +static inline void binlog_cleanup_trans(IO_CACHE *trans_log) +{ + statistic_increment(binlog_cache_use, &LOCK_status); + if (trans_log->disk_writes != 0) + { + statistic_increment(binlog_cache_disk_use, &LOCK_status); + trans_log->disk_writes= 0; + } + reinit_io_cache(trans_log, WRITE_CACHE, (my_off_t) 0, 0, 1); // cannot fail + trans_log->end_of_file= max_binlog_cache_size; +} + +static int binlog_prepare(THD *thd, bool all) +{ + /* + do nothing. + just pretend we can do 2pc, so that MySQL won't + switch to 1pc. + real work will be done in MYSQL_LOG::log() + */ + return 0; +} + +static int binlog_commit(THD *thd, bool all) +{ + int error; + IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; + DBUG_ENTER("binlog_commit"); + DBUG_ASSERT(mysql_bin_log.is_open() && + (all || !(thd->options & OPTION_NOT_AUTOCOMMIT))); + + if (!my_b_tell(trans_log)) + { + // we're here because trans_log was flushed in MYSQL_LOG::log() + DBUG_RETURN(0); + } + + /* Update the binary log as we have cached some queries */ + error= mysql_bin_log.write(thd, trans_log, 1); + binlog_cleanup_trans(trans_log); + DBUG_RETURN(error); +} + +static int binlog_rollback(THD *thd, bool all) +{ + int error=0; + IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; + DBUG_ENTER("binlog_rollback"); + /* + first two conditions here are guaranteed - see trans_register_ha() + call below. The third one must be true. If it is not, we're registering + unnecessary, doing extra work. The cause should be found and eliminated + */ + DBUG_ASSERT(all && mysql_bin_log.is_open() && my_b_tell(trans_log)); + /* + Update the binary log with a BEGIN/ROLLBACK block if we have + cached some queries and we updated some non-transactional + table. Such cases should be rare (updating a + non-transactional table inside a transaction...) 
+ */ + if (unlikely(thd->options & OPTION_STATUS_NO_TRANS_UPDATE)) + error= mysql_bin_log.write(thd, trans_log, 0); + binlog_cleanup_trans(trans_log); + DBUG_RETURN(error); +} + +/* + NOTE: how do we handle this (unlikely but legal) case: + [transaction] + [update to non-trans table] + [rollback to savepoint] ? + The problem occurs when a savepoint is before the update to the + non-transactional table. Then when there's a rollback to the savepoint, if we + simply truncate the binlog cache, we lose the part of the binlog cache where + the update is. If we want to not lose it, we need to write the SAVEPOINT + command and the ROLLBACK TO SAVEPOINT command to the binlog cache. The latter + is easy: it's just write at the end of the binlog cache, but the former + should be *inserted* to the place where the user called SAVEPOINT. The + solution is that when the user calls SAVEPOINT, we write it to the binlog + cache (so no need to later insert it). As transactions are never intermixed + in the binary log (i.e. they are serialized), we won't have conflicts with + savepoint names when using mysqlbinlog or in the slave SQL thread. + Then when ROLLBACK TO SAVEPOINT is called, if we updated some + non-transactional table, we don't truncate the binlog cache but instead write + ROLLBACK TO SAVEPOINT to it; otherwise we truncate the binlog cache (which + will chop the SAVEPOINT command from the binlog cache, which is good as in + that case there is no need to have it in the binlog). +*/ + +static int binlog_savepoint_set(THD *thd, void *sv) +{ + IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; + DBUG_ENTER("binlog_savepoint_set"); + DBUG_ASSERT(mysql_bin_log.is_open() && my_b_tell(trans_log)); + + *(my_off_t *)sv= my_b_tell(trans_log); + /* Write it to the binary log */ + Query_log_event qinfo(thd, thd->query, thd->query_length, TRUE, FALSE); + DBUG_RETURN(mysql_bin_log.write(&qinfo)); +} + +static int binlog_savepoint_rollback(THD *thd, void *sv) +{ + IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; + DBUG_ENTER("binlog_savepoint_rollback"); + DBUG_ASSERT(mysql_bin_log.is_open() && my_b_tell(trans_log)); + + /* + Write ROLLBACK TO SAVEPOINT to the binlog cache if we have updated some + non-transactional table. Otherwise, truncate the binlog cache starting + from the SAVEPOINT command. 
+ */ + if (unlikely(thd->options & OPTION_STATUS_NO_TRANS_UPDATE)) + { + Query_log_event qinfo(thd, thd->query, thd->query_length, TRUE, FALSE); + DBUG_RETURN(mysql_bin_log.write(&qinfo)); + } + reinit_io_cache(trans_log, WRITE_CACHE, *(my_off_t *)sv, 0, 0); + DBUG_RETURN(0); +} + +int check_binlog_magic(IO_CACHE* log, const char** errmsg) +{ + char magic[4]; + DBUG_ASSERT(my_b_tell(log) == 0); + + if (my_b_read(log, (byte*) magic, sizeof(magic))) + { + *errmsg = "I/O error reading the header from the binary log"; + sql_print_error("%s, errno=%d, io cache code=%d", *errmsg, my_errno, + log->error); + return 1; + } + if (memcmp(magic, BINLOG_MAGIC, sizeof(magic))) + { + *errmsg = "Binlog has bad magic number; It's not a binary log file that can be used by this version of MySQL"; + return 1; + } + return 0; +} + +File open_binlog(IO_CACHE *log, const char *log_file_name, const char **errmsg) +{ + File file; + DBUG_ENTER("open_binlog"); + + if ((file = my_open(log_file_name, O_RDONLY | O_BINARY, MYF(MY_WME))) < 0) + { + sql_print_error("Failed to open log (file '%s', errno %d)", + log_file_name, my_errno); + *errmsg = "Could not open log file"; + goto err; + } + if (init_io_cache(log, file, IO_SIZE*2, READ_CACHE, 0, 0, + MYF(MY_WME|MY_DONT_CHECK_FILESIZE))) + { + sql_print_error("Failed to create a cache on log (file '%s')", + log_file_name); + *errmsg = "Could not open log file"; + goto err; + } + if (check_binlog_magic(log,errmsg)) + goto err; + DBUG_RETURN(file); + +err: + if (file >= 0) + { + my_close(file,MYF(0)); + end_io_cache(log); + } + DBUG_RETURN(-1); +} #ifdef __NT__ static int eventSource = 0; -void setup_windows_event_source() +void setup_windows_event_source() { - HKEY hRegKey= NULL; + HKEY hRegKey= NULL; DWORD dwError= 0; TCHAR szPath[MAX_PATH]; DWORD dwTypes; - + if (eventSource) // Ensure that we are only called once return; eventSource= 1; // Create the event source registry key - dwError= RegCreateKey(HKEY_LOCAL_MACHINE, + dwError= RegCreateKey(HKEY_LOCAL_MACHINE, "SYSTEM\\CurrentControlSet\\Services\\EventLog\\Application\\MySQL", &hRegKey); @@ -64,9 +275,8 @@ void setup_windows_event_source() GetModuleFileName(NULL, szPath, MAX_PATH); /* Register EventMessageFile */ - dwError = RegSetValueEx(hRegKey, "EventMessageFile", 0, REG_EXPAND_SZ, + dwError = RegSetValueEx(hRegKey, "EventMessageFile", 0, REG_EXPAND_SZ, (PBYTE) szPath, strlen(szPath)+1); - /* Register supported event types */ dwTypes= (EVENTLOG_ERROR_TYPE | EVENTLOG_WARNING_TYPE | @@ -129,14 +339,14 @@ static int find_uniq_filename(char *name) MYSQL_LOG::MYSQL_LOG() :bytes_written(0), last_time(0), query_start(0), name(0), file_id(1), open_count(1), log_type(LOG_CLOSED), write_error(0), inited(0), - need_start_event(1), description_event_for_exec(0), + need_start_event(1), prepared_xids(0), description_event_for_exec(0), description_event_for_queue(0) { /* We don't want to initialize LOCK_Log here as such initialization depends on safe_mutex (when using safe_mutex) which depends on MY_INIT(), which is called only in main(). Doing initialization here would make it happen - before main(). + before main(). 
*/ index_file_name[0] = 0; bzero((char*) &log_file,sizeof(log_file)); @@ -157,7 +367,7 @@ void MYSQL_LOG::cleanup() if (inited) { inited= 0; - close(LOG_CLOSE_INDEX); + close(LOG_CLOSE_INDEX|LOG_CLOSE_STOP_EVENT); delete description_event_for_queue; delete description_event_for_exec; (void) pthread_mutex_destroy(&LOCK_log); @@ -234,7 +444,7 @@ bool MYSQL_LOG::open(const char *log_name, enum_log_type log_type_arg, { char buff[512]; File file= -1, index_file_nr= -1; - int open_flags = O_CREAT | O_APPEND | O_BINARY; + int open_flags = O_CREAT | O_BINARY; DBUG_ENTER("MYSQL_LOG::open"); DBUG_PRINT("enter",("log_type: %d",(int) log_type)); @@ -251,9 +461,9 @@ bool MYSQL_LOG::open(const char *log_name, enum_log_type log_type_arg, goto err; if (io_cache_type == SEQ_READ_APPEND) - open_flags |= O_RDWR; + open_flags |= O_RDWR | O_APPEND; else - open_flags |= O_WRONLY; + open_flags |= O_WRONLY | (log_type == LOG_BIN ? 0 : O_APPEND); db[0]=0; open_count++; @@ -627,7 +837,7 @@ int MYSQL_LOG::find_next_log(LOG_INFO* linfo, bool need_lock) error = !index_file.error ? LOG_INFO_EOF : LOG_INFO_IO; goto err; } - fname[length-1]=0; // kill /n + fname[length-1]=0; // kill \n linfo->index_file_offset = my_b_tell(&index_file); err: @@ -683,7 +893,7 @@ bool MYSQL_LOG::reset_logs(THD* thd) error=1; goto err; } - + for (;;) { my_delete(linfo.log_file_name, MYF(MY_WME)); @@ -1041,6 +1251,24 @@ void MYSQL_LOG::new_file(bool need_lock) safe_mutex_assert_owner(&LOCK_log); safe_mutex_assert_owner(&LOCK_index); + /* + if binlog is used as tc log, be sure all xids are "unlogged", + so that on recover we only need to scan one - latest - binlog file + for prepared xids. As this is expected to be a rare event, + simple wait strategy is enough. We're locking LOCK_log to be sure no + new Xid_log_event's are added to the log (and prepared_xids is not + increased), and waiting on COND_prep_xids for late threads to + catch up. + */ + if (prepared_xids) + { + tc_log_page_waits++; + pthread_mutex_lock(&LOCK_prep_xids); + while (prepared_xids) + pthread_cond_wait(&COND_prep_xids, &LOCK_prep_xids); + pthread_mutex_unlock(&LOCK_prep_xids); + } + /* Reuse old name if not binlog and not update log */ new_name_ptr= name; @@ -1093,8 +1321,6 @@ void MYSQL_LOG::new_file(bool need_lock) open(old_name, save_log_type, new_name_ptr, index_file_name, io_cache_type, no_auto_events, max_size, 1); - if (this == &mysql_bin_log) - report_pos_in_innodb(); my_free(old_name,MYF(0)); end: @@ -1283,8 +1509,7 @@ inline bool sync_binlog(IO_CACHE *cache) bool MYSQL_LOG::write(Log_event* event_info) { THD *thd=event_info->thd; - bool called_handler_commit=0; - bool error=0; + bool error=1; bool should_rotate = 0; DBUG_ENTER("MYSQL_LOG::write(event)"); @@ -1295,26 +1520,10 @@ bool MYSQL_LOG::write(Log_event* event_info) mostly called if is_open() *was* true a few instructions before, but it could have changed since. */ - if (is_open()) + if (likely(is_open())) { const char *local_db= event_info->get_db(); IO_CACHE *file= &log_file; -#ifdef USING_TRANSACTIONS - /* - Should we write to the binlog cache or to the binlog on disk? - Write to the binlog cache if: - - it is already not empty (meaning we're in a transaction; note that the - present event could be about a non-transactional table, but still we need - to write to the binlog cache in that case to handle updates to mixed - trans/non-trans table types the best possible in binlogging) - - or if the event asks for it (cache_stmt == true). 
- */ - if (opt_using_transactions && - (event_info->get_cache_stmt() || - (thd && my_b_tell(&thd->transaction.trans_log)))) - file= &thd->transaction.trans_log; -#endif - DBUG_PRINT("info",("event type=%d",event_info->get_type_code())); #ifdef HAVE_REPLICATION /* In the future we need to add to the following if tests like @@ -1330,7 +1539,46 @@ bool MYSQL_LOG::write(Log_event* event_info) } #endif /* HAVE_REPLICATION */ - error=1; +#ifdef USING_TRANSACTIONS + /* + Should we write to the binlog cache or to the binlog on disk? + Write to the binlog cache if: + - it is already not empty (meaning we're in a transaction; note that the + present event could be about a non-transactional table, but still we need + to write to the binlog cache in that case to handle updates to mixed + trans/non-trans table types the best possible in binlogging) + - or if the event asks for it (cache_stmt == true). + */ + if (opt_using_transactions && thd) + { + IO_CACHE *trans_log= (IO_CACHE*)thd->ha_data[binlog_hton.slot]; + + if (event_info->get_cache_stmt()) + { + if (!trans_log) + { + thd->ha_data[binlog_hton.slot]= trans_log= (IO_CACHE *) + my_malloc(sizeof(IO_CACHE), MYF(MY_ZEROFILL)); + if (!trans_log || open_cached_file(trans_log, mysql_tmpdir, LOG_PREFIX, + binlog_cache_size, MYF(MY_WME))) + { + my_free((gptr)trans_log, MYF(MY_ALLOW_ZERO_PTR)); + thd->ha_data[binlog_hton.slot]= trans_log= 0; + goto err; + } + trans_log->end_of_file= max_binlog_cache_size; + trans_register_ha(thd, TRUE, &binlog_hton); + } + else if (!my_b_tell(trans_log)) + trans_register_ha(thd, TRUE, &binlog_hton); + file= trans_log; + } + else if (trans_log && my_b_tell(trans_log)) + file= trans_log; + } +#endif + DBUG_PRINT("info",("event type=%d",event_info->get_type_code())); + /* No check for auto events flag here - this write method should never be called if auto-events are enabled @@ -1423,17 +1671,6 @@ COLLATION_CONNECTION=%u,COLLATION_DATABASE=%u,COLLATION_SERVER=%u", goto err; } } -#ifdef TO_BE_REMOVED - if (thd->variables.convert_set) - { - char buf[256], *p; - p= strmov(strmov(buf, "SET CHARACTER SET "), - thd->variables.convert_set->name); - Query_log_event e(thd, buf, (ulong) (p - buf), 0, FALSE); - if (e.write(file)) - goto err; - } -#endif } /* Write the SQL command */ @@ -1441,71 +1678,12 @@ COLLATION_CONNECTION=%u,COLLATION_DATABASE=%u,COLLATION_SERVER=%u", if (event_info->write(file)) goto err; - /* - Tell for transactional table handlers up to which position in the - binlog file we wrote. The table handler can store this info, and - after crash recovery print for the user the offset of the last - transactions which were recovered. Actually, we must also call - the table handler commit here, protected by the LOCK_log mutex, - because otherwise the transactions may end up in a different order - in the table handler log! - - Note that we will NOT call ha_report_binlog_offset_and_commit() if - there are binlog events cached in the transaction cache. That is - because then the log event which we write to the binlog here is - not a transactional event. In versions < 4.0.13 before this fix this - caused an InnoDB transaction to be committed if in the middle there - was a MyISAM event! - */ - if (file == &log_file) // we are writing to the real log (disk) { if (flush_io_cache(file) || sync_binlog(file)) goto err; - if (opt_using_transactions && - !(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) - { - /* - LOAD DATA INFILE in AUTOCOMMIT=1 mode writes to the binlog - chunks also before it is successfully completed. 
We only report - the binlog write and do the commit inside the transactional table - handler if the log event type is appropriate. - */ - - if (event_info->get_type_code() == QUERY_EVENT || - event_info->get_type_code() == EXEC_LOAD_EVENT) - { -#ifndef DBUG_OFF - if (unlikely(opt_crash_binlog_innodb)) - { - /* - This option is for use in rpl_crash_binlog_innodb.test. - 1st we want to verify that Binlog_dump thread cannot send the - event now (because of LOCK_log): we here tell the Binlog_dump - thread to wake up, sleep for the slave to have time to possibly - receive data from the master (it should not), and then crash. - 2nd we want to verify that at crash recovery the rolled back - event is cut from the binlog. - */ - if (!(--opt_crash_binlog_innodb)) - { - signal_update(); - sleep(2); - fprintf(stderr,"This is a normal crash because of" - " --crash-binlog-innodb\n"); - assert(0); - } - DBUG_PRINT("info",("opt_crash_binlog_innodb: %d", - opt_crash_binlog_innodb)); - } -#endif - error = ha_report_binlog_offset_and_commit(thd, log_file_name, - file->pos_in_file); - called_handler_commit=1; - } - } - /* We wrote to the real log, check automatic rotation; */ + /* check automatic rotation; */ DBUG_PRINT("info",("max_size: %lu",max_size)); should_rotate= (my_b_tell(file) >= (my_off_t) max_size); } @@ -1524,7 +1702,7 @@ err: signal_update(); if (should_rotate) { - pthread_mutex_lock(&LOCK_index); + pthread_mutex_lock(&LOCK_index); new_file(0); // inside mutex pthread_mutex_unlock(&LOCK_index); } @@ -1532,15 +1710,6 @@ err: pthread_mutex_unlock(&LOCK_log); - /* - Flush the transactional handler log file now that we have released - LOCK_log; the flush is placed here to eliminate the bottleneck on the - group commit - */ - - if (called_handler_commit) - ha_commit_complete(thd); - #ifdef HAVE_REPLICATION if (should_rotate && expire_logs_days) { @@ -1568,9 +1737,9 @@ uint MYSQL_LOG::next_file_id() SYNOPSIS write() - thd + thd cache The cache to copy to the binlog - commit_or_rollback If true, will write "COMMIT" in the end, if false will + is_commit If true, will write "COMMIT" in the end, if false will write "ROLLBACK". NOTE @@ -1578,6 +1747,10 @@ uint MYSQL_LOG::next_file_id() - The thing in the cache is always a complete transaction - 'cache' needs to be reinitialized after this functions returns. + TODO + fix it to become atomic - either the complete cache is added to binlog + or nothing (other storage engines rely on this, doing a ROLLBACK) + IMPLEMENTATION - To support transaction over replication, we wrap the transaction with BEGIN/COMMIT or BEGIN/ROLLBACK in the binary log. @@ -1586,12 +1759,12 @@ uint MYSQL_LOG::next_file_id() same updates are run on the slave. */ -bool MYSQL_LOG::write(THD *thd, IO_CACHE *cache, bool commit_or_rollback) +bool MYSQL_LOG::write(THD *thd, IO_CACHE *cache, bool is_commit) { VOID(pthread_mutex_lock(&LOCK_log)); DBUG_ENTER("MYSQL_LOG::write(cache"); - if (is_open()) // Should always be true + if (likely(is_open())) // Should always be true { uint length; @@ -1601,11 +1774,11 @@ bool MYSQL_LOG::write(THD *thd, IO_CACHE *cache, bool commit_or_rollback) AUTOCOMMIT=1, then MySQL immediately writes each SQL statement to the binlog when the statement has been completed. No need to add "BEGIN" ... "COMMIT" around such statements. Otherwise, MySQL uses - thd->transaction.trans_log to cache the SQL statements until the - explicit commit, and at the commit writes the contents in .trans_log - to the binlog. 
+ trans_log (that is thd->ha_data[binlog_hton.slot]) to cache + the SQL statements until the explicit commit, and at the commit writes + the contents in trans_log to the binlog. - We write the "BEGIN" mark first in the buffer (.trans_log) where we + We write the "BEGIN" mark first in the buffer (trans_log) where we store the SQL statements for a transaction. At the transaction commit we will add the "COMMIT mark and write the buffer to the binlog. */ @@ -1648,8 +1821,8 @@ bool MYSQL_LOG::write(THD *thd, IO_CACHE *cache, bool commit_or_rollback) { Query_log_event qinfo(thd, - commit_or_rollback ? "COMMIT" : "ROLLBACK", - commit_or_rollback ? 6 : 8, + is_commit ? "COMMIT" : "ROLLBACK", + is_commit ? 6 : 8, TRUE, FALSE); qinfo.error_code= 0; if (qinfo.write(&log_file) || flush_io_cache(&log_file) || @@ -1662,25 +1835,6 @@ bool MYSQL_LOG::write(THD *thd, IO_CACHE *cache, bool commit_or_rollback) write_error=1; // Don't give more errors goto err; } -#ifndef DBUG_OFF - if (unlikely(opt_crash_binlog_innodb)) - { - /* see the previous MYSQL_LOG::write() method for a comment */ - if (!(--opt_crash_binlog_innodb)) - { - signal_update(); - sleep(2); - fprintf(stderr, "This is a normal crash because of" - " --crash-binlog-innodb\n"); - assert(0); - } - DBUG_PRINT("info",("opt_crash_binlog_innodb: %d", - opt_crash_binlog_innodb)); - } -#endif - if ((ha_report_binlog_offset_and_commit(thd, log_file_name, - log_file.pos_in_file))) - goto err; signal_update(); DBUG_PRINT("info",("max_size: %lu",max_size)); if (my_b_tell(&log_file) >= (my_off_t) max_size) @@ -1693,12 +1847,6 @@ bool MYSQL_LOG::write(THD *thd, IO_CACHE *cache, bool commit_or_rollback) } VOID(pthread_mutex_unlock(&LOCK_log)); - /* Flush the transactional handler log file now that we have released - LOCK_log; the flush is placed here to eliminate the bottleneck on the - group commit */ - - ha_commit_complete(thd); - DBUG_RETURN(0); err: @@ -1903,6 +2051,16 @@ void MYSQL_LOG::close(uint exiting) } #endif /* HAVE_REPLICATION */ end_io_cache(&log_file); + + /* don't pwrite in a file opened with O_APPEND - it doesn't work */ + if (log_file.type == WRITE_CACHE) + { + my_off_t offset= BIN_LOG_HEADER_SIZE + FLAGS_OFFSET; + char flags=LOG_EVENT_BINLOG_CLOSED_F; + my_pwrite(log_file.file, &flags, 1, + BIN_LOG_HEADER_SIZE + FLAGS_OFFSET, MYF(0)); + } + if (my_close(log_file.file,MYF(0)) < 0 && ! write_error) { write_error=1; @@ -2085,145 +2243,6 @@ bool flush_error_log() return result; } - -/* - If the server has InnoDB on, and InnoDB has published the position of the - last committed transaction (which happens only if a crash recovery occured at - this startup) then truncate the previous binary log at the position given by - InnoDB. If binlog is shorter than the position, print a message to the error - log. - - SYNOPSIS - cut_spurious_tail() - - RETURN VALUES - 1 Error - 0 Ok -*/ - -bool MYSQL_LOG::cut_spurious_tail() -{ - int error= 0; - DBUG_ENTER("cut_spurious_tail"); - -#ifdef HAVE_INNOBASE_DB - if (have_innodb != SHOW_OPTION_YES) - DBUG_RETURN(0); - /* - This is the place where we use information from InnoDB to cut the - binlog. 
- */ - char *name= ha_innobase::get_mysql_bin_log_name(); - ulonglong pos= ha_innobase::get_mysql_bin_log_pos(); - ulonglong actual_size; - char llbuf1[22], llbuf2[22]; - - if (name[0] == 0 || pos == ULONGLONG_MAX) - { - DBUG_PRINT("info", ("InnoDB has not set binlog info")); - DBUG_RETURN(0); - } - /* The binlog given by InnoDB normally is never an active binlog */ - if (is_open() && is_active(name)) - { - sql_print_error("Warning: after InnoDB crash recovery, InnoDB says that " - "the binary log of the previous run has the same name " - "'%s' as the current one; this is likely to be abnormal.", - name); - DBUG_RETURN(1); - } - sql_print_error("After InnoDB crash recovery, checking if the binary log " - "'%s' contains rolled back transactions which must be " - "removed from it...", name); - /* If we have a too long binlog, cut. If too short, print error */ - int fd= my_open(name, O_EXCL | O_APPEND | O_BINARY | O_WRONLY, MYF(MY_WME)); - if (fd < 0) - { - int save_errno= my_errno; - sql_print_error("Could not open the binary log '%s' for truncation.", - name); - if (save_errno != ENOENT) - sql_print_error("The binary log '%s' should not be used for " - "replication.", name); - DBUG_RETURN(1); - } - - if (pos > (actual_size= my_seek(fd, 0L, MY_SEEK_END, MYF(MY_WME)))) - { - /* - Note that when we have MyISAM rollback this error message should be - reconsidered. - */ - sql_print_error("The binary log '%s' is shorter than its expected size " - "(actual: %s, expected: %s) so it misses at least one " - "committed transaction; so it should not be used for " - "replication or point-in-time recovery. You would need " - "to restart slaves from a fresh master's data " - "snapshot ", - name, llstr(actual_size, llbuf1), - llstr(pos, llbuf2)); - error= 1; - goto err; - } - if (pos < actual_size) - { - sql_print_error("The binary log '%s' is bigger than its expected size " - "(actual: %s, expected: %s) so it contains a rolled back " - "transaction; now truncating that.", name, - llstr(actual_size, llbuf1), llstr(pos, llbuf2)); - /* - As on some OS, my_chsize() can only pad with 0s instead of really - truncating. Then mysqlbinlog (and Binlog_dump thread) will error on - these zeroes. This is annoying, but not more (you just need to manually - switch replication to the next binlog). Fortunately, in my_chsize.c, it - says that all modern machines support real ftruncate(). - - */ - if ((error= my_chsize(fd, pos, 0, MYF(MY_WME)))) - goto err; - } -err: - if (my_close(fd, MYF(MY_WME))) - error= 1; -#endif - DBUG_RETURN(error); -} - - -/* - If the server has InnoDB on, store the binlog name and position into - InnoDB. This function is used every time we create a new binlog. - - SYNOPSIS - report_pos_in_innodb() - - NOTES - This cannot simply be done in MYSQL_LOG::open(), because when we create - the first binlog at startup, we have not called ha_init() yet so we cannot - write into InnoDB yet. - - RETURN VALUES - 1 Error - 0 Ok -*/ - -void MYSQL_LOG::report_pos_in_innodb() -{ - DBUG_ENTER("report_pos_in_innodb"); -#ifdef HAVE_INNOBASE_DB - if (is_open() && have_innodb == SHOW_OPTION_YES) - { - DBUG_PRINT("info", ("Reporting binlog info into InnoDB - " - "name: '%s' position: %d", - log_file_name, my_b_tell(&log_file))); - innobase_store_binlog_offset_and_flush_log(log_file_name, - my_b_tell(&log_file)); - } -#endif - DBUG_VOID_RETURN; -} - - void MYSQL_LOG::signal_update() { DBUG_ENTER("MYSQL_LOG::signal_update"); @@ -2354,3 +2373,570 @@ void sql_print_information(const char *format, ...) 
DBUG_VOID_RETURN; } + +/********* transaction coordinator log for 2pc - mmap() based solution *******/ + +/* + the log consists of a file, mmapped to a memory. + file is divided on pages of tc_log_page_size size. + (usable size of the first page is smaller because of log header) + there's PAGE control structure for each page + each page (or rather PAGE control structure) can be in one of three + states - active, in_sync, pool. + there could be only one page in active or in_sync states, + but many in pool - pool is fifo queue. + usual lifecycle of a page is pool->active->in_sync->pool + "active" page - is a page where new xid's are logged. + the page stays active as long as in_sync slot is taken. + "in_sync" page is being synced to disk. no new xid can be added to it. + when the sync is done the page is moved to a pool and an active page + becomes "in_sync". + + when a xid is added to an active page, the thread of this xid waits for + a page's condition until the page is synced. when in_sync slot becomes vacant + one of these waiters is awaken to take care of syncing. it syncs the page + and signals all waiters that the page is synced. + PAGE::waiters is used to count these waiters, and a page may never + become active again until waiters==0 (that is all waiters from the + previous sync have noticed the sync was completed) + + note, that the page becomes "dirty" and has to be synced only when + a new xid is added into it. Removing a xid from a page does not make it + dirty - we don't sync removals to disk. +*/ +#define TC_LOG_HEADER_SIZE (sizeof(tc_log_magic)+1) + +static const char tc_log_magic[]={254, 0x23, 0x05, 0x74}; + +uint opt_tc_log_size=TC_LOG_MIN_SIZE; +uint tc_log_max_pages_used=0, tc_log_page_size=0, + tc_log_page_waits=0, tc_log_cur_pages_used=0; + +TC_LOG *tc_log; +TC_LOG_MMAP tc_log_mmap; +TC_LOG_DUMMY tc_log_dummy; + +int TC_LOG_MMAP::open(const char *opt_name) +{ + uint i; + bool crashed=FALSE; + PAGE *pg; + + DBUG_ASSERT(total_ha_2pc); + DBUG_ASSERT(opt_name && opt_name[0]); + +#ifdef HAVE_GETPAGESIZE + tc_log_page_size=getpagesize(); + DBUG_ASSERT(TC_LOG_PAGE_SIZE % tc_log_page_size == 0); +#else + tc_log_page_size=TC_LOG_PAGE_SIZE; +#endif + + fn_format(logname,opt_name,mysql_data_home,"",MY_UNPACK_FILENAME); + fd=my_open(logname, O_RDWR, MYF(0)); // TODO use O_CREAT and check len==0 ? 
+ if (fd == -1) + { + if (using_heuristic_recover()) + return 1; + fd=my_open(logname, O_RDWR|O_CREAT, MYF(MY_WME)); + if (fd == -1) + goto err; + inited=1; + file_length=opt_tc_log_size; + if (my_chsize(fd, file_length, 0, MYF(MY_WME))) + goto err; + } + else + { + inited=1; + crashed=TRUE; + if (tc_heuristic_recover) + { + sql_print_error("Cannot perform automatic crash recovery when " + "--tc-heuristic-recover is used"); + goto err; + } + file_length = my_seek(fd, 0L, MY_SEEK_END, MYF(MY_WME+MY_FAE)); + if (file_length == MY_FILEPOS_ERROR || file_length % tc_log_page_size) + goto err; + } + + data=(uchar *)my_mmap(0, file_length, PROT_READ|PROT_WRITE, + MAP_NOSYNC|MAP_SHARED, fd, 0); + if (data == MAP_FAILED) + { + my_errno=errno; + goto err; + } + inited=2; + + npages=file_length/tc_log_page_size; + DBUG_ASSERT(npages >= 3); // to guarantee non-empty pool + if (!(pages=(PAGE *)my_malloc(npages*sizeof(PAGE), MYF(MY_WME|MY_ZEROFILL)))) + goto err; + inited=3; + for (pg=pages, i=0; i < npages; i++, pg++) + { + pg->next=pg+1; + pg->waiters=0; + pg->state=POOL; + pthread_mutex_init(&pg->lock, MY_MUTEX_INIT_FAST); + pthread_cond_init (&pg->cond, 0); + pg->start=(my_xid *)(data + i*tc_log_page_size); + pg->end=(my_xid *)(pg->start + tc_log_page_size); + pg->size=pg->free=tc_log_page_size/sizeof(my_xid); + } + pages[0].start=(my_xid *)(data+TC_LOG_HEADER_SIZE); + pages[0].size=pages[0].free= + (tc_log_page_size-TC_LOG_HEADER_SIZE)/sizeof(my_xid); + pages[npages-1].next=0; + inited=4; + + if (crashed) + { + sql_print_error("Recovering after a crash"); + if (recover()) + goto err; + } + memcpy(data, tc_log_magic, sizeof(tc_log_magic)); + data[sizeof(tc_log_magic)]= total_ha_2pc; + my_msync(fd, data, tc_log_page_size, MS_SYNC); + inited=5; + + pthread_mutex_init(&LOCK_sync, MY_MUTEX_INIT_FAST); + pthread_mutex_init(&LOCK_active, MY_MUTEX_INIT_FAST); + pthread_mutex_init(&LOCK_pool, MY_MUTEX_INIT_FAST); + pthread_cond_init(&COND_active, 0); + pthread_cond_init(&COND_pool, 0); + + inited=6; + + in_sync= 0; + active=pages; + pool=pages+1; + pool_last=pages+npages-1; + + return 0; + +err: + close(); + return 1; +} + +/* + two strategies here: + 1. take the first from the pool + 2. if there're waiters - take the one with the most free space + + TODO page merging. try to allocate adjacent page first, + so that they can be flushed both in one sync +*/ +void TC_LOG_MMAP::get_active_from_pool() +{ + PAGE **p, **best_p=0; + int best_free; + + do + { + best_p= p= &pool; + if ((*p)->waiters == 0) + break; + + best_free=0; + for (p=&(*p)->next; *p; p=&(*p)->next) + { + if ((*p)->waiters == 0 && (*p)->free > best_free) + { + best_free=(*p)->free; + best_p=p; + } + } + } + while ((*best_p == 0 || best_free == 0) && overflow()); + + active=*best_p; + if (active->free == active->size) + { + tc_log_cur_pages_used++; + set_if_bigger(tc_log_max_pages_used, tc_log_cur_pages_used); + } + + if ((*best_p)->next) + *best_p=(*best_p)->next; + else + pool_last=*best_p; +} + +int TC_LOG_MMAP::overflow() +{ + /* + simple overflow handling - just wait + TODO perhaps, increase log size ? + let's check the behaviour of tc_log_page_waits first + */ + tc_log_page_waits++; + pthread_cond_wait(&COND_pool, &LOCK_pool); + return 1; // returns always 1 +} + +/* + all access to active page is serialized but it's not a problem, as we're + assuming that fsync() will be a bottleneck anyway. 
+ That is, parallelizing writes to log pages we'll decrease number of threads + waiting for a page, but then all these threads will be waiting for fsync() +*/ + +int TC_LOG_MMAP::log(THD *thd, my_xid xid) +{ + int err; + PAGE *p; + ulong cookie; + + pthread_mutex_lock(&LOCK_active); + + while (unlikely(active && active->free == 0)) + pthread_cond_wait(&COND_active, &LOCK_active); + + if (active == 0) + { + lock_queue(&LOCK_pool); + get_active_from_pool(); + unlock_queue(&LOCK_pool); + } + + p=active; + pthread_mutex_lock(&p->lock); + + while (*p->ptr) + { + p->ptr++; + DBUG_ASSERT(p->ptr < p->end); // because p->free > 0 + } + + cookie= (ulong)((uchar *)p->ptr - data); // Can never be zero + *p->ptr++= xid; + p->free--; + p->state= DIRTY; + + /* to sync or not to sync - this is the question */ + pthread_mutex_lock(&LOCK_sync); + pthread_mutex_unlock(&p->lock); + + if (in_sync) + { // somebody's syncing. let's wait + pthread_mutex_unlock(&LOCK_active); + p->waiters++; + do + { + pthread_cond_wait(&p->cond, &LOCK_sync); + } while (p->state == DIRTY && in_sync); + p->waiters--; + err= p->state == ERROR; + if (p->state != DIRTY) // page was synced + { + if (p->waiters == 0) + pthread_cond_signal(&COND_pool); // in case somebody's waiting + pthread_mutex_unlock(&LOCK_sync); + goto done; // we're done + } + } // page was not synced! do it now + in_sync=p; // place is vacant - take it + DBUG_ASSERT(active == p); + active=0; + pthread_cond_broadcast(&COND_active); // in case somebody's waiting + pthread_mutex_unlock(&LOCK_active); + pthread_mutex_unlock(&LOCK_sync); + err= sync(); + +done: + return err ? 0 : cookie; +} + +int TC_LOG_MMAP::sync() +{ + int err; + + DBUG_ASSERT(in_sync != active); + + /* + sit down and relax - this can take a while... + note - no locks are held at this point + */ + err= my_msync(fd, in_sync->start, 1, MS_SYNC); + + pthread_mutex_lock(&LOCK_pool); + pool_last->next=in_sync; + pool_last=in_sync; + in_sync->next=0; + in_sync->state= err ? ERROR : POOL; + pthread_cond_broadcast(&in_sync->cond); // signal "sync done" + pthread_cond_signal(&COND_pool); // in case somebody's waiting + pthread_mutex_unlock(&LOCK_pool); + + pthread_mutex_lock(&LOCK_sync); + in_sync=0; + pthread_cond_signal(&active->cond); // wake up a new syncer + pthread_mutex_unlock(&LOCK_sync); + return err; +} + +void TC_LOG_MMAP::unlog(ulong cookie, my_xid xid) +{ + PAGE *p=pages+(cookie/tc_log_page_size); + my_xid *x=(my_xid *)(data+cookie); + + DBUG_ASSERT(*x == xid); + DBUG_ASSERT(x >= p->start && x < p->end); + *x=0; + + pthread_mutex_lock(&p->lock); + p->free++; + DBUG_ASSERT(p->free <= p->size); + set_if_smaller(p->ptr, x); + if (p->free == p->size) + statistic_decrement(tc_log_cur_pages_used, &LOCK_status); + if (p->waiters == 0) // the page is in pool and ready to rock + pthread_cond_signal(&COND_pool); // ping ... 
in case somebody's waiting + pthread_mutex_unlock(&p->lock); +} + +void TC_LOG_MMAP::close() +{ + switch (inited) { + case 6: + pthread_mutex_destroy(&LOCK_sync); + pthread_mutex_destroy(&LOCK_active); + pthread_mutex_destroy(&LOCK_pool); + pthread_cond_destroy(&COND_pool); + case 5: + data[0]='A'; // garble the first (signature) byte, in case my_delete fails + case 4: + for (uint i=0; i < npages; i++) + { + if (pages[i].ptr == 0) + break; + pthread_mutex_destroy(&pages[i].lock); + pthread_cond_destroy(&pages[i].cond); + } + case 3: + my_free((gptr)pages, MYF(0)); + case 2: + my_munmap(data, file_length); + case 1: + my_close(fd, MYF(0)); + } + if (inited>=5) // cannot do in the switch because of Windows + my_delete(logname, MYF(MY_WME)); + inited=0; +} + +int TC_LOG_MMAP::recover() +{ + HASH xids; + PAGE *p=pages, *end_p=pages+npages; + + if (memcmp(data, tc_log_magic, sizeof(tc_log_magic))) + { + sql_print_error("Bad magic header in tc log"); + goto err1; + } + + /* + the first byte after magic signature is set to current + number of storage engines on startup + */ + if (data[sizeof(tc_log_magic)] != total_ha_2pc) + { + sql_print_error("For recovery to work all storage engines and binary log " + "must have exactly the same settings as before the crash!"); + goto err1; + } + + if (hash_init(&xids, &my_charset_bin, tc_log_page_size/3, 0, + sizeof(my_xid), 0, 0, MYF(0))) + goto err1; + + for ( ; p < end_p ; p++) + { + for (my_xid *x=p->start; x < p->end; x++) + if (*x && my_hash_insert(&xids, (byte *)x)) + goto err2; // OOM + } + + if (ha_recover(&xids)) + goto err2; + + hash_free(&xids); + bzero(data, file_length); + return 0; + +err2: + hash_free(&xids); +err1: + sql_print_error("Crash recovery failed. Either correct the problem " + "(if it's, for example, out of memory error) and restart, " + "or delete tc log and start mysqld with " + "--tc-heuristic-recover={commit|rollback}"); + return 1; +} + +int TC_LOG::using_heuristic_recover() +{ + if (!tc_heuristic_recover) + return 0; + + sql_print_information("Heuristic crash recovery mode"); + if (ha_recover(0)) + sql_print_error("Heuristic crash recovery failed"); + sql_print_information("Please restart mysqld without --tc-heuristic-recover"); + return 1; +} + +/****** transaction coordinator log for 2pc - binlog() based solution ******/ + +/* + TODO keep in-memory list of prepared transactions + (add to list in log(), remove on unlog()) + and copy it to the new binlog if rotated + but let's check the behaviour of tc_log_page_waits first! +*/ + +int MYSQL_LOG::open(const char *opt_name) +{ + LOG_INFO log_info, new_log_info; + + DBUG_ASSERT(total_ha_2pc > 1); + + pthread_mutex_init(&LOCK_prep_xids, MY_MUTEX_INIT_FAST); + pthread_cond_init (&COND_prep_xids, 0); + + if (using_heuristic_recover()) + return 1; + + /* + read index file to get a last but one binlog filename + note - there's no need to lock any mutex, mysqld is only starting + up, no other threads are running yet. + still, there's safe_mutex_assert_owner() in binlog code, so + let's keep it happy. + */ + + if (find_log_pos(&new_log_info, NullS, 1)) + goto err; // er ? where's the current entry ? + + if (strcmp(log_file_name, new_log_info.log_file_name)) + { + const char *errmsg; + char last_event_type=UNKNOWN_EVENT; + IO_CACHE log; + File file; + int error; + Log_event *ev; + Format_description_log_event fdle(4); + + if (! 
fdle.is_valid()) + goto err; + + do + { + log_info.index_file_offset=new_log_info.index_file_offset; + log_info.index_file_start_offset=new_log_info.index_file_offset; + strcpy(log_info.log_file_name, new_log_info.log_file_name); + if (find_next_log(&new_log_info, 1)) + goto err; // er ? where's the current entry ? + } while (strcmp(log_file_name, new_log_info.log_file_name)); + + if ((file= open_binlog(&log, log_info.log_file_name, &errmsg)) < 0) + { + sql_print_error("%s", errmsg); + goto err; + } + + if (((ev= Log_event::read_log_event(&log, 0, &fdle))) && + (ev->get_type_code() == FORMAT_DESCRIPTION_EVENT) && + (ev->flags & LOG_EVENT_BINLOG_CLOSED_F)) + error=0; + else + error= recover(&log, (Format_description_log_event *)ev); + + delete ev; + end_io_cache(&log); + my_close(file, MYF(MY_WME)); + + if (error) + goto err; + } + + return 0; + +err: + return 1; +} + +void MYSQL_LOG::close() +{ + DBUG_ASSERT(prepared_xids==0); + pthread_mutex_destroy(&LOCK_prep_xids); + pthread_cond_destroy (&COND_prep_xids); +} + +/* TODO group commit */ +int MYSQL_LOG::log(THD *thd, my_xid xid) +{ + Xid_log_event xle(thd, xid); + if (xle.write((IO_CACHE*)thd->ha_data[binlog_hton.slot])) + return 0; + thread_safe_increment(prepared_xids, &LOCK_prep_xids); + return !binlog_commit(thd,1); // invert return value +} + +void MYSQL_LOG::unlog(ulong cookie, my_xid xid) +{ + if (thread_safe_dec_and_test(prepared_xids, &LOCK_prep_xids)) + pthread_cond_signal(&COND_prep_xids); +} + +int MYSQL_LOG::recover(IO_CACHE *log, Format_description_log_event *fdle) +{ + Log_event *ev; + HASH xids; + MEM_ROOT mem_root; + + if (! fdle->is_valid() || + hash_init(&xids, &my_charset_bin, tc_log_page_size/3, 0, + sizeof(my_xid), 0, 0, MYF(0))) + goto err1; + + init_alloc_root(&mem_root, tc_log_page_size, tc_log_page_size); + + while ((ev= Log_event::read_log_event(log,0,fdle)) && ev->is_valid()) + { + if (ev->get_type_code() == XID_EVENT) + { + Xid_log_event *xev=(Xid_log_event *)ev; + byte *x=memdup_root(&mem_root, (char *)& xev->xid, sizeof(xev->xid)); + if (! x) + goto err2; + my_hash_insert(&xids, x); + } + delete ev; + } + + if (ha_recover(&xids)) + goto err2; + + free_root(&mem_root, MYF(0)); + hash_free(&xids); + return 0; + +err2: + free_root(&mem_root, MYF(0)); + hash_free(&xids); +err1: + sql_print_error("Crash recovery failed. Either correct the problem " + "(if it's, for example, out of memory error) and restart, " + "or delete (or rename) binary log and start mysqld with " + "--tc-heuristic-recover={commit|rollback}"); + return 1; +} + |
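
The savepoint handling introduced above (binlog_savepoint_set() / binlog_savepoint_rollback()) follows the policy spelled out in the long comment in that hunk: the SAVEPOINT statement is written into the transaction cache as soon as it is issued, and ROLLBACK TO SAVEPOINT either truncates the cache back to the saved offset (also discarding the now-useless SAVEPOINT command) or, if a non-transactional table was updated inside the transaction, appends the rollback statement instead. Below is a minimal stand-alone C++ model of that policy; the cache is a plain std::string and every name (TxnCache, non_trans_update, ...) is illustrative, not taken from the MySQL sources.

// Minimal model of the binlog-cache savepoint policy described in the
// comment above binlog_savepoint_set(). A std::string stands in for the
// per-connection IO_CACHE, and a size_t offset plays the role of the
// my_off_t stored through the 'sv' pointer. Illustrative names only.
#include <cstddef>
#include <iostream>
#include <string>

struct TxnCache {
  std::string buf;                 // statements cached for the transaction
  bool non_trans_update;           // OPTION_STATUS_NO_TRANS_UPDATE analogue

  TxnCache() : non_trans_update(false) {}

  void log(const std::string &stmt) { buf += stmt + ";\n"; }

  // SAVEPOINT: remember the offset first, then write the statement, so that
  // truncating back to the offset also drops the SAVEPOINT command itself.
  size_t savepoint(const std::string &name) {
    size_t sv = buf.size();
    log("SAVEPOINT " + name);
    return sv;
  }

  // ROLLBACK TO SAVEPOINT: if a non-transactional table was updated we must
  // keep the cached statements and append the rollback; otherwise truncate.
  void rollback_to(size_t sv, const std::string &name) {
    if (non_trans_update)
      log("ROLLBACK TO SAVEPOINT " + name);
    else
      buf.resize(sv);
  }
};

int main() {
  TxnCache cache;
  cache.log("INSERT INTO t1 VALUES (1)");
  size_t sv = cache.savepoint("a");
  cache.log("INSERT INTO t1 VALUES (2)");
  cache.rollback_to(sv, "a");      // purely transactional: truncate the cache
  std::cout << cache.buf;          // only the first INSERT remains
  return 0;
}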
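
TC_LOG_MMAP::log() and TC_LOG_MMAP::sync() above implement a simple group sync: each thread drops its XID into the active page, and if another thread is already flushing, it waits on the page's condition variable; the thread that finds the in_sync slot vacant performs one msync() for the whole batch and then wakes every waiter. The following stand-alone sketch (compile with -pthread) models that one-syncer-per-batch protocol for a single page; do_sync() stands in for my_msync() and all names are invented for illustration.

// Stand-alone model of the one-syncer-per-batch pattern in
// TC_LOG_MMAP::log()/sync(), reduced to a single page. Threads that arrive
// while a flush is running wait; whoever finds the slot vacant flushes the
// whole batch with no locks held. do_sync() stands in for my_msync().
#include <pthread.h>
#include <unistd.h>
#include <cstdio>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  synced = PTHREAD_COND_INITIALIZER;
static unsigned long long batch = 1;       // batch currently being filled
static unsigned long long last_synced = 0; // highest batch already on "disk"
static bool sync_in_progress = false;

static void do_sync() { usleep(1000); }    // pretend to msync() the page

static void log_xid(unsigned long xid) {
  (void) xid;  // a real implementation would store xid into the page here
  pthread_mutex_lock(&lock);
  unsigned long long my_batch = batch;     // this xid belongs to that batch

  // an earlier batch is being flushed: wait until ours has been handled
  while (sync_in_progress && last_synced < my_batch)
    pthread_cond_wait(&synced, &lock);

  if (last_synced >= my_batch) {           // somebody else flushed our batch
    pthread_mutex_unlock(&lock);
    return;
  }
  // the sync slot is vacant: this thread flushes the whole current batch
  sync_in_progress = true;
  batch++;                                 // later writers start a new batch
  pthread_mutex_unlock(&lock);

  do_sync();                               // no locks held during the flush

  pthread_mutex_lock(&lock);
  last_synced = my_batch;
  sync_in_progress = false;
  pthread_cond_broadcast(&synced);         // "your xids are durable now"
  pthread_mutex_unlock(&lock);
}

static void *worker(void *arg) {
  log_xid((unsigned long)(size_t) arg);
  return 0;
}

int main() {
  pthread_t th[8];
  for (int i = 0; i < 8; i++)
    pthread_create(&th[i], 0, worker, (void *)(size_t)(i + 1));
  for (int i = 0; i < 8; i++)
    pthread_join(th[i], 0);
  printf("batches flushed: %llu\n", last_synced);
  return 0;
}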
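
Both recover() implementations (the mmap log and the binlog-based one) reduce to the same rule: collect every XID the coordinator managed to log before the crash and hand that set to ha_recover(), which can then commit exactly those prepared transactions and roll back the rest. Here is a minimal sketch of that decision rule, assuming (hypothetically) that the engine can list its still-prepared XIDs; std::set stands in for the HASH used above and all names are illustrative.

// Decision rule applied at crash recovery: a transaction left in the
// prepared state is committed iff its xid was found in the coordinator log
// (the mmap pages or the last binlog's Xid_log_events), otherwise it is
// rolled back. std::set replaces the HASH; all names are illustrative.
#include <cstdio>
#include <set>
#include <utility>
#include <vector>

typedef unsigned long long my_xid_t;       // stand-in for my_xid

enum Decision { COMMIT, ROLLBACK };

// xids_in_tc_log: what recover() collected from the log before the crash.
// prepared_in_engine: what a storage engine reports as still prepared.
static std::vector<std::pair<my_xid_t, Decision> >
resolve(const std::set<my_xid_t> &xids_in_tc_log,
        const std::vector<my_xid_t> &prepared_in_engine)
{
  std::vector<std::pair<my_xid_t, Decision> > out;
  for (size_t i = 0; i < prepared_in_engine.size(); i++) {
    my_xid_t x = prepared_in_engine[i];
    out.push_back(std::make_pair(x, xids_in_tc_log.count(x) ? COMMIT
                                                            : ROLLBACK));
  }
  return out;
}

int main() {
  std::set<my_xid_t> logged;            // xids recovered from the tc log
  logged.insert(41); logged.insert(42);
  std::vector<my_xid_t> prepared;       // the engine still holds 42 and 43
  prepared.push_back(42); prepared.push_back(43);

  std::vector<std::pair<my_xid_t, Decision> > d = resolve(logged, prepared);
  for (size_t i = 0; i < d.size(); i++)
    printf("xid %llu -> %s\n", d[i].first,
           d[i].second == COMMIT ? "commit" : "rollback");
  return 0;
}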