1 files changed, 555 insertions, 64 deletions
diff --git a/sql/handler.cc b/sql/handler.cc
index eba9b0dc5ea..0fb61895a61 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -576,6 +576,295 @@ void ha_close_connection(THD* thd)
  ======================= TRANSACTIONS ===================================*/
 
 /**
+  Transaction handling in the server
+  ==================================
+
+  In each client connection, MySQL maintains two transactional
+  states:
+  - a statement transaction,
+  - a standard, also called normal transaction.
+
+  Historical note
+  ---------------
+  "Statement transaction" is a non-standard term that comes
+  from the times when MySQL supported BerkeleyDB storage engine.
+
+  First of all, it should be said that in BerkeleyDB auto-commit
+  mode auto-commits operations that are atomic to the storage
+  engine itself, such as a write of a record, and are too
+  high-granular to be atomic from the application perspective
+  (MySQL). One SQL statement could involve many BerkeleyDB
+  auto-committed operations and thus BerkeleyDB auto-commit was of
+  little use to MySQL.
+
+  Secondly, instead of SQL standard savepoints, BerkeleyDB
+  provided the concept of "nested transactions". In a nutshell,
+  transactions could be arbitrarily nested, but when the parent
+  transaction was committed or aborted, all its child (nested)
+  transactions were handled committed or aborted as well.
+  Commit of a nested transaction, in turn, made its changes
+  visible, but not durable: it destroyed the nested transaction,
+  all its changes would become available to the parent and
+  currently active nested transactions of this parent.
+
+  So the mechanism of nested transactions was employed to
+  provide "all or nothing" guarantee of SQL statements
+  required by the standard.
+  A nested transaction would be created at start of each SQL
+  statement, and destroyed (committed or aborted) at statement
+  end. Such nested transaction was internally referred to as
+  a "statement transaction" and gave birth to the term.
+
+  <Historical note ends>
+
+  Since then a statement transaction is started for each statement
+  that accesses transactional tables or uses the binary log.  If
+  the statement succeeds, the statement transaction is committed.
+  If the statement fails, the transaction is rolled back. Commits
+  of statement transactions are not durable -- each such
+  transaction is nested in the normal transaction, and if the
+  normal transaction is rolled back, the effects of all enclosed
+  statement transactions are undone as well.  Technically,
+  a statement transaction can be viewed as a savepoint which is
+  maintained automatically in order to make effects of one
+  statement atomic.
+
+  The normal transaction is started by the user and is ended
+  usually upon a user request as well. The normal transaction
+  encloses transactions of all statements issued between
+  its beginning and its end.
+  In autocommit mode, the normal transaction is equivalent
+  to the statement transaction.
+
+  Since MySQL supports PSEA (pluggable storage engine
+  architecture), more than one transactional engine can be
+  active at a time. Hence transactions, from the server
+  point of view, are always distributed. In particular,
+  transactional state is maintained independently for each
+  engine. In order to commit a transaction the two phase
+  commit protocol is employed.
+
+  Not all statements are executed in context of a transaction.
+  Administrative and status information statements do not modify
+  engine data, and thus do not start a statement transaction and
+  also have no effect on the normal transaction. Examples of such
+  statements are SHOW STATUS and RESET SLAVE.
+
+  Similarly DDL statements are not transactional,
+  and therefore a transaction is [almost] never started for a DDL
+  statement. The difference between a DDL statement and a purely
+  administrative statement though is that a DDL statement always
+  commits the current transaction before proceeding, if there is
+  any.
+
+  At last, SQL statements that work with non-transactional
+  engines also have no effect on the transaction state of the
+  connection. Even though they are written to the binary log,
+  and the binary log is, overall, transactional, the writes
+  are done in "write-through" mode, directly to the binlog
+  file, followed with a OS cache sync, in other words,
+  bypassing the binlog undo log (translog).
+  They do not commit the current normal transaction.
+  A failure of a statement that uses non-transactional tables
+  would cause a rollback of the statement transaction, but
+  in case there no non-transactional tables are used,
+  no statement transaction is started.
+
+  Data layout
+  -----------
+
+  The server stores its transaction-related data in
+  thd->transaction. This structure has two members of type
+  THD_TRANS. These members correspond to the statement and
+  normal transactions respectively:
+
+  - thd->transaction.stmt contains a list of engines
+  that are participating in the given statement
+  - thd->transaction.all contains a list of engines that
+  have participated in any of the statement transactions started
+  within the context of the normal transaction.
+  Each element of the list contains a pointer to the storage
+  engine, engine-specific transactional data, and engine-specific
+  transaction flags.
+
+  In autocommit mode thd->transaction.all is empty.
+  Instead, data of thd->transaction.stmt is
+  used to commit/rollback the normal transaction.
+
+  The list of registered engines has a few important properties:
+  - no engine is registered in the list twice
+  - engines are present in the list a reverse temporal order --
+  new participants are always added to the beginning of the list.
+
+  Transaction life cycle
+  ----------------------
+
+  When a new connection is established, thd->transaction
+  members are initialized to an empty state.
+  If a statement uses any tables, all affected engines
+  are registered in the statement engine list. In
+  non-autocommit mode, the same engines are registered in
+  the normal transaction list.
+  At the end of the statement, the server issues a commit
+  or a roll back for all engines in the statement list.
+  At this point transaction flags of an engine, if any, are
+  propagated from the statement list to the list of the normal
+  transaction.
+  When commit/rollback is finished, the statement list is
+  cleared. It will be filled in again by the next statement,
+  and emptied again at the next statement's end.
+
+  The normal transaction is committed in a similar way
+  (by going over all engines in thd->transaction.all list)
+  but at different times:
+  - upon COMMIT SQL statement is issued by the user
+  - implicitly, by the server, at the beginning of a DDL statement
+  or SET AUTOCOMMIT={0|1} statement.
+
+  The normal transaction can be rolled back as well:
+  - if the user has requested so, by issuing ROLLBACK SQL
+  statement
+  - if one of the storage engines requested a rollback
+  by setting thd->transaction_rollback_request. This may
+  happen in case, e.g., when the transaction in the engine was
+  chosen a victim of the internal deadlock resolution algorithm
+  and rolled back internally. When such a situation happens, there
+  is little the server can do and the only option is to rollback
+  transactions in all other participating engines.  In this case
+  the rollback is accompanied by an error sent to the user.
+
+  As follows from the use cases above, the normal transaction
+  is never committed when there is an outstanding statement
+  transaction. In most cases there is no conflict, since
+  commits of the normal transaction are issued by a stand-alone
+  administrative or DDL statement, thus no outstanding statement
+  transaction of the previous statement exists. Besides,
+  all statements that manipulate with the normal transaction
+  are prohibited in stored functions and triggers, therefore
+  no conflicting situation can occur in a sub-statement either.
+  The remaining rare cases when the server explicitly has
+  to commit the statement transaction prior to committing the normal
+  one cover error-handling scenarios (see for example
+  SQLCOM_LOCK_TABLES).
+
+  When committing a statement or a normal transaction, the server
+  either uses the two-phase commit protocol, or issues a commit
+  in each engine independently. The two-phase commit protocol
+  is used only if:
+  - all participating engines support two-phase commit (provide
+    handlerton::prepare PSEA API call) and
+  - transactions in at least two engines modify data (i.e. are
+  not read-only).
+
+  Note that the two phase commit is used for
+  statement transactions, even though they are not durable anyway.
+  This is done to ensure logical consistency of data in a multiple-
+  engine transaction.
+  For example, imagine that some day MySQL supports unique
+  constraint checks deferred till the end of statement. In such
+  case a commit in one of the engines may yield ER_DUP_KEY,
+  and MySQL should be able to gracefully abort statement
+  transactions of other participants.
+
+  After the normal transaction has been committed,
+  thd->transaction.all list is cleared.
+
+  When a connection is closed, the current normal transaction, if
+  any, is rolled back.
+
+  Roles and responsibilities
+  --------------------------
+
+  The server has no way to know that an engine participates in
+  the statement and a transaction has been started
+  in it unless the engine says so. Thus, in order to be
+  a part of a transaction, the engine must "register" itself.
+  This is done by invoking trans_register_ha() server call.
+  Normally the engine registers itself whenever handler::external_lock()
+  is called. trans_register_ha() can be invoked many times: if
+  an engine is already registered, the call does nothing.
+  In case autocommit is not set, the engine must register itself
+  twice -- both in the statement list and in the normal transaction
+  list.
+  In which list to register is a parameter of trans_register_ha().
+
+  Note, that although the registration interface in itself is
+  fairly clear, the current usage practice often leads to undesired
+  effects. E.g. since a call to trans_register_ha() in most engines
+  is embedded into implementation of handler::external_lock(), some
+  DDL statements start a transaction (at least from the server
+  point of view) even though they are not expected to. E.g.
+  CREATE TABLE does not start a transaction, since
+  handler::external_lock() is never called during CREATE TABLE. But
+  CREATE TABLE ... SELECT does, since handler::external_lock() is
+  called for the table that is being selected from. This has no
+  practical effects currently, but must be kept in mind
+  nevertheless.
+
+  Once an engine is registered, the server will do the rest
+  of the work.
+
+  During statement execution, whenever any of data-modifying
+  PSEA API methods is used, e.g. handler::write_row() or
+  handler::update_row(), the read-write flag is raised in the
+  statement transaction for the involved engine.
+  Currently All PSEA calls are "traced", and the data can not be
+  changed in a way other than issuing a PSEA call. Important:
+  unless this invariant is preserved the server will not know that
+  a transaction in a given engine is read-write and will not
+  involve the two-phase commit protocol!
+
+  At the end of a statement, server call
+  ha_autocommit_or_rollback() is invoked. This call in turn
+  invokes handlerton::prepare() for every involved engine.
+  Prepare is followed by a call to handlerton::commit_one_phase()
+  If a one-phase commit will suffice, handlerton::prepare() is not
+  invoked and the server only calls handlerton::commit_one_phase().
+  At statement commit, the statement-related read-write engine
+  flag is propagated to the corresponding flag in the normal
+  transaction.  When the commit is complete, the list of registered
+  engines is cleared.
+
+  Rollback is handled in a similar fashion.
+
+  Additional notes on DDL and the normal transaction.
+  ---------------------------------------------------
+
+  DDLs and operations with non-transactional engines
+  do not "register" in thd->transaction lists, and thus do not
+  modify the transaction state. Besides, each DDL in
+  MySQL is prefixed with an implicit normal transaction commit
+  (a call to end_active_trans()), and thus leaves nothing
+  to modify.
+  However, as it has been pointed out with CREATE TABLE .. SELECT,
+  some DDL statements can start a *new* transaction.
+
+  Behaviour of the server in this case is currently badly
+  defined.
+  DDL statements use a form of "semantic" logging
+  to maintain atomicity: if CREATE TABLE .. SELECT failed,
+  the newly created table is deleted.
+  In addition, some DDL statements issue interim transaction
+  commits: e.g. ALTER TABLE issues a commit after data is copied
+  from the original table to the internal temporary table. Other
+  statements, e.g. CREATE TABLE ... SELECT do not always commit
+  after itself.
+  And finally there is a group of DDL statements such as
+  RENAME/DROP TABLE that doesn't start a new transaction
+  and doesn't commit.
+
+  This diversity makes it hard to say what will happen if
+  by chance a stored function is invoked during a DDL --
+  whether any modifications it makes will be committed or not
+  is not clear. Fortunately, SQL grammar of few DDLs allows
+  invocation of a stored function.
+
+  A consistent behaviour is perhaps to always commit the normal
+  transaction after all DDLs, just like the statement transaction
+  is always committed at the end of all statements.
+*/
+
+/**
   Register a storage engine for a transaction.
 
   Every storage engine MUST call this function when it starts
@@ -592,7 +881,7 @@ void ha_close_connection(THD* thd)
 void trans_register_ha(THD *thd, bool all, handlerton *ht_arg)
 {
   THD_TRANS *trans;
-  handlerton **ht;
+  Ha_trx_info *ha_info;
   DBUG_ENTER("trans_register_ha");
   DBUG_PRINT("enter",("%s", all ? "all" : "stmt"));
 
@@ -604,12 +893,13 @@ void trans_register_ha(THD *thd, bool all, handlerton *ht_arg)
   else
     trans= &thd->transaction.stmt;
 
-  for (ht=trans->ht; *ht; ht++)
-    if (*ht == ht_arg)
-      DBUG_VOID_RETURN;  /* already registered, return */
+  ha_info= thd->ha_data[ht_arg->slot].ha_info + static_cast<unsigned>(all);
+
+  if (ha_info->is_started())
+    DBUG_VOID_RETURN; /* already registered, return */
+
+  ha_info->register_ha(trans, ht_arg);
 
-  trans->ht[trans->nht++]=ht_arg;
-  DBUG_ASSERT(*ht == ht_arg);
   trans->no_2pc|=(ht_arg->prepare==0);
   if (thd->transaction.xid_state.xid.is_null())
     thd->transaction.xid_state.xid.set(thd->query_id);
@@ -626,18 +916,19 @@ int ha_prepare(THD *thd)
 {
   int error=0, all=1;
   THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
-  handlerton **ht=trans->ht;
+  Ha_trx_info *ha_info= trans->ha_list;
   DBUG_ENTER("ha_prepare");
 #ifdef USING_TRANSACTIONS
-  if (trans->nht)
+  if (ha_info)
   {
-    for (; *ht; ht++)
+    for (; ha_info; ha_info= ha_info->next())
     {
       int err;
+      handlerton *ht= ha_info->ht();
       status_var_increment(thd->status_var.ha_prepare_count);
-      if ((*ht)->prepare)
+      if (ht->prepare)
       {
-        if ((err= (*(*ht)->prepare)(*ht, thd, all)))
+        if ((err= ht->prepare(ht, thd, all)))
         {
           my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
           ha_rollback_trans(thd, all);
@@ -649,7 +940,7 @@ int ha_prepare(THD *thd)
       {
         push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_WARN,
                             ER_ILLEGAL_HA, ER(ER_ILLEGAL_HA),
-                            ha_resolve_storage_engine_name(*ht));
+                            ha_resolve_storage_engine_name(ht));
       }
     }
   }
@@ -658,6 +949,62 @@ int ha_prepare(THD *thd)
 }
 
 /**
+  Check if we can skip the two-phase commit.
+
+  A helper function to evaluate if two-phase commit is mandatory.
+  As a side effect, propagates the read-only/read-write flags
+  of the statement transaction to its enclosing normal transaction.
+
+  @retval TRUE   we must run a two-phase commit. Returned
+                 if we have at least two engines with read-write changes.
+  @retval FALSE  Don't need two-phase commit. Even if we have two
+                 transactional engines, we can run two independent
+                 commits if changes in one of the engines are read-only.
+*/
+
+static
+bool
+ha_check_and_coalesce_trx_read_only(THD *thd, Ha_trx_info *ha_list,
+                                    bool all)
+{
+  /* The number of storage engines that have actual changes. */
+  unsigned rw_ha_count= 0;
+  Ha_trx_info *ha_info;
+
+  for (ha_info= ha_list; ha_info; ha_info= ha_info->next())
+  {
+    if (ha_info->is_trx_read_write())
+      ++rw_ha_count;
+
+    if (! all)
+    {
+      Ha_trx_info *ha_info_all= &thd->ha_data[ha_info->ht()->slot].ha_info[1];
+      DBUG_ASSERT(ha_info != ha_info_all);
+      /*
+        Merge read-only/read-write information about statement
+        transaction to its enclosing normal transaction. Do this
+        only if in a real transaction -- that is, if we know
+        that ha_info_all is registered in thd->transaction.all.
+        Since otherwise we only clutter the normal transaction flags.
+      */
+      if (ha_info_all->is_started()) /* FALSE if autocommit. */
+        ha_info_all->coalesce_trx_with(ha_info);
+    }
+    else if (rw_ha_count > 1)
+    {
+      /*
+        It is a normal transaction, so we don't need to merge read/write
+        information up, and the need for two-phase commit has been
+        already established. Break the loop prematurely.
+      */
+      break;
+    }
+  }
+  return rw_ha_count > 1;
+}
+
+
+/**
   @retval
     0   ok
   @retval
@@ -674,12 +1021,25 @@ int ha_prepare(THD *thd)
 int ha_commit_trans(THD *thd, bool all)
 {
   int error= 0, cookie= 0;
+  /*
+    'all' means that this is either an explicit commit issued by
+    user, or an implicit commit issued by a DDL.
+  */
   THD_TRANS *trans= all ? &thd->transaction.all : &thd->transaction.stmt;
-  bool is_real_trans= all || thd->transaction.all.nht == 0;
-  handlerton **ht= trans->ht;
+  bool is_real_trans= all || thd->transaction.all.ha_list == 0;
+  Ha_trx_info *ha_info= trans->ha_list;
   my_xid xid= thd->transaction.xid_state.xid.get_my_xid();
   DBUG_ENTER("ha_commit_trans");
 
+  /*
+    We must not commit the normal transaction if a statement
+    transaction is pending. Otherwise statement transaction
+    flags will not get propagated to its normal transaction's
+    counterpart.
+  */
+  DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL ||
+              trans == &thd->transaction.stmt);
+
   if (thd->in_sub_stmt)
   {
     /*
@@ -701,8 +1061,10 @@ int ha_commit_trans(THD *thd, bool all)
     DBUG_RETURN(2);
   }
 #ifdef USING_TRANSACTIONS
-  if (trans->nht)
+  if (ha_info)
   {
+    bool must_2pc;
+
     if (is_real_trans && wait_if_global_read_lock(thd, 0, 0))
     {
       ha_rollback_trans(thd, all);
@@ -727,12 +1089,26 @@ int ha_commit_trans(THD *thd, bool all)
     if (is_real_trans)                          /* not a statement commit */
       thd->stmt_map.close_transient_cursors();
 
-    if (!trans->no_2pc && trans->nht > 1)
+    must_2pc= ha_check_and_coalesce_trx_read_only(thd, ha_info, all);
+
+    if (!trans->no_2pc && must_2pc)
     {
-      for (; *ht && !error; ht++)
+      for (; ha_info && !error; ha_info= ha_info->next())
       {
         int err;
-        if ((err= (*(*ht)->prepare)(*ht, thd, all)))
+        handlerton *ht= ha_info->ht();
+        /*
+          Do not call two-phase commit if this particular
+          transaction is read-only. This allows for simpler
+          implementation in engines that are always read-only.
+        */
+        if (! ha_info->is_trx_read_write())
+          continue;
+        /*
+          Sic: we know that prepare() is not NULL since otherwise
+          trans->no_2pc would have been set.
+        */
+        if ((err= ht->prepare(ht, thd, all)))
         {
           my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
           error= 1;
@@ -770,24 +1146,26 @@ int ha_commit_one_phase(THD *thd, bool all)
 {
   int error=0;
   THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
-  bool is_real_trans=all || thd->transaction.all.nht == 0;
-  handlerton **ht=trans->ht;
+  bool is_real_trans=all || thd->transaction.all.ha_list == 0;
+  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
   DBUG_ENTER("ha_commit_one_phase");
 #ifdef USING_TRANSACTIONS
-  if (trans->nht)
+  if (ha_info)
   {
-    for (ht=trans->ht; *ht; ht++)
+    for (; ha_info; ha_info= ha_info_next)
     {
       int err;
-      if ((err= (*(*ht)->commit)(*ht, thd, all)))
+      handlerton *ht= ha_info->ht();
+      if ((err= ht->commit(ht, thd, all)))
       {
         my_error(ER_ERROR_DURING_COMMIT, MYF(0), err);
         error=1;
       }
       status_var_increment(thd->status_var.ha_commit_count);
-      *ht= 0;
+      ha_info_next= ha_info->next();
+      ha_info->reset(); /* keep it conveniently zero-filled */
     }
-    trans->nht=0;
+    trans->ha_list= 0;
     trans->no_2pc=0;
     if (is_real_trans)
       thd->transaction.xid_state.xid.null();
@@ -810,8 +1188,17 @@ int ha_rollback_trans(THD *thd, bool all)
 {
   int error=0;
   THD_TRANS *trans=all ? &thd->transaction.all : &thd->transaction.stmt;
-  bool is_real_trans=all || thd->transaction.all.nht == 0;
+  Ha_trx_info *ha_info= trans->ha_list, *ha_info_next;
+  bool is_real_trans=all || thd->transaction.all.ha_list == 0;
   DBUG_ENTER("ha_rollback_trans");
+
+  /*
+    We must not rollback the normal transaction if a statement
+    transaction is pending.
+  */
+  DBUG_ASSERT(thd->transaction.stmt.ha_list == NULL ||
+              trans == &thd->transaction.stmt);
+
   if (thd->in_sub_stmt)
   {
     /*
@@ -826,24 +1213,26 @@ int ha_rollback_trans(THD *thd, bool all)
     DBUG_RETURN(1);
   }
 #ifdef USING_TRANSACTIONS
-  if (trans->nht)
+  if (ha_info)
   {
     /* Close all cursors that can not survive ROLLBACK */
     if (is_real_trans)                          /* not a statement commit */
       thd->stmt_map.close_transient_cursors();
 
-    for (handlerton **ht=trans->ht; *ht; ht++)
+    for (; ha_info; ha_info= ha_info_next)
     {
       int err;
-      if ((err= (*(*ht)->rollback)(*ht, thd, all)))
+      handlerton *ht= ha_info->ht();
+      if ((err= ht->rollback(ht, thd, all)))
       { // cannot happen
         my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
         error=1;
       }
       status_var_increment(thd->status_var.ha_rollback_count);
-      *ht= 0;
+      ha_info_next= ha_info->next();
+      ha_info->reset(); /* keep it conveniently zero-filled */
     }
-    trans->nht=0;
+    trans->ha_list= 0;
     trans->no_2pc=0;
     if (is_real_trans)
       thd->transaction.xid_state.xid.null();
@@ -889,17 +1278,19 @@ int ha_autocommit_or_rollback(THD *thd, int error)
 {
   DBUG_ENTER("ha_autocommit_or_rollback");
 #ifdef USING_TRANSACTIONS
-  if (thd->transaction.stmt.nht)
+  if (thd->transaction.stmt.ha_list)
   {
     if (!error)
     {
-      if (ha_commit_stmt(thd))
+      if (ha_commit_trans(thd, 0))
 	error=1;
     }
-    else if (thd->transaction_rollback_request && !thd->in_sub_stmt)
-      (void) ha_rollback(thd);
-    else
-      (void) ha_rollback_stmt(thd);
+    else 
+    {
+      (void) ha_rollback_trans(thd, 0);
+      if (thd->transaction_rollback_request && !thd->in_sub_stmt)
+        (void) ha_rollback(thd);
+    }
 
     thd->variables.tx_isolation=thd->session_tx_isolation;
   }
@@ -1199,7 +1590,7 @@ bool mysql_xa_recover(THD *thd)
   }
 
   pthread_mutex_unlock(&LOCK_xid_cache);
-  send_eof(thd);
+  my_eof(thd);
   DBUG_RETURN(0);
 }
 
@@ -1246,43 +1637,49 @@ int ha_rollback_to_savepoint(THD *thd, SAVEPOINT *sv)
   int error=0;
   THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
                                         &thd->transaction.all);
-  handlerton **ht=trans->ht, **end_ht;
+  Ha_trx_info *ha_info, *ha_info_next;
+
   DBUG_ENTER("ha_rollback_to_savepoint");
 
-  trans->nht=sv->nht;
   trans->no_2pc=0;
-  end_ht=ht+sv->nht;
   /*
     rolling back to savepoint in all storage engines that were part of the
     transaction when the savepoint was set
   */
-  for (; ht < end_ht; ht++)
+  for (ha_info= sv->ha_list; ha_info; ha_info= ha_info->next())
   {
     int err;
-    DBUG_ASSERT((*ht)->savepoint_set != 0);
-    if ((err= (*(*ht)->savepoint_rollback)(*ht, thd, (uchar *)(sv+1)+(*ht)->savepoint_offset)))
+    handlerton *ht= ha_info->ht();
+    DBUG_ASSERT(ht);
+    DBUG_ASSERT(ht->savepoint_set != 0);
+    if ((err= ht->savepoint_rollback(ht, thd,
+                                     (uchar *)(sv+1)+ht->savepoint_offset)))
     { // cannot happen
       my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
       error=1;
     }
     status_var_increment(thd->status_var.ha_savepoint_rollback_count);
-    trans->no_2pc|=(*ht)->prepare == 0;
+    trans->no_2pc|= ht->prepare == 0;
   }
   /*
     rolling back the transaction in all storage engines that were not part of
     the transaction when the savepoint was set
   */
-  for (; *ht ; ht++)
+  for (ha_info= trans->ha_list; ha_info != sv->ha_list;
+       ha_info= ha_info_next)
   {
     int err;
-    if ((err= (*(*ht)->rollback)(*ht, thd, !thd->in_sub_stmt)))
+    handlerton *ht= ha_info->ht();
+    if ((err= ht->rollback(ht, thd, !thd->in_sub_stmt)))
     { // cannot happen
       my_error(ER_ERROR_DURING_ROLLBACK, MYF(0), err);
       error=1;
     }
     status_var_increment(thd->status_var.ha_rollback_count);
-    *ht=0; // keep it conveniently zero-filled
+    ha_info_next= ha_info->next();
+    ha_info->reset(); /* keep it conveniently zero-filled */
   }
+  trans->ha_list= sv->ha_list;
   DBUG_RETURN(error);
 }
 
@@ -1297,26 +1694,32 @@ int ha_savepoint(THD *thd, SAVEPOINT *sv)
   int error=0;
   THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
                                         &thd->transaction.all);
-  handlerton **ht=trans->ht;
+  Ha_trx_info *ha_info= trans->ha_list;
   DBUG_ENTER("ha_savepoint");
 #ifdef USING_TRANSACTIONS
-  for (; *ht; ht++)
+  for (; ha_info; ha_info= ha_info->next())
   {
     int err;
-    if (! (*ht)->savepoint_set)
+    handlerton *ht= ha_info->ht();
+    DBUG_ASSERT(ht);
+    if (! ht->savepoint_set)
     {
       my_error(ER_CHECK_NOT_IMPLEMENTED, MYF(0), "SAVEPOINT");
       error=1;
       break;
     }
-    if ((err= (*(*ht)->savepoint_set)(*ht, thd, (uchar *)(sv+1)+(*ht)->savepoint_offset)))
+    if ((err= ht->savepoint_set(ht, thd, (uchar *)(sv+1)+ht->savepoint_offset)))
     { // cannot happen
       my_error(ER_GET_ERRNO, MYF(0), err);
       error=1;
     }
     status_var_increment(thd->status_var.ha_savepoint_count);
   }
-  sv->nht=trans->nht;
+  /*
+    Remember the list of registered storage engines. All new
+    engines are prepended to the beginning of the list.
+  */
+  sv->ha_list= trans->ha_list;
 #endif /* USING_TRANSACTIONS */
   DBUG_RETURN(error);
 }
@@ -1324,20 +1727,19 @@ int ha_savepoint(THD *thd, SAVEPOINT *sv)
 int ha_release_savepoint(THD *thd, SAVEPOINT *sv)
 {
   int error=0;
-  THD_TRANS *trans= (thd->in_sub_stmt ? &thd->transaction.stmt :
-                                        &thd->transaction.all);
-  handlerton **ht=trans->ht, **end_ht;
+  Ha_trx_info *ha_info= sv->ha_list;
   DBUG_ENTER("ha_release_savepoint");
 
-  end_ht=ht+sv->nht;
-  for (; ht < end_ht; ht++)
+  for (; ha_info; ha_info= ha_info->next())
   {
     int err;
-    if (!(*ht)->savepoint_release)
+    handlerton *ht= ha_info->ht();
+    /* Savepoint life time is enclosed into transaction life time. */
+    DBUG_ASSERT(ht);
+    if (!ht->savepoint_release)
       continue;
-    if ((err= (*(*ht)->savepoint_release)(*ht, thd, 
-                                          (uchar *)(sv+1)+
-                                          (*ht)->savepoint_offset)))
+    if ((err= ht->savepoint_release(ht, thd,
+                                    (uchar *)(sv+1) + ht->savepoint_offset)))
     { // cannot happen
       my_error(ER_GET_ERRNO, MYF(0), err);
       error=1;
@@ -2506,6 +2908,36 @@ int handler::ha_check(THD *thd, HA_CHECK_OPT *check_opt)
   return update_frm_version(table);
 }
 
+/**
+  A helper function to mark a transaction read-write,
+  if it is started.
+*/
+
+inline
+void
+handler::mark_trx_read_write()
+{
+  Ha_trx_info *ha_info= &ha_thd()->ha_data[ht->slot].ha_info[0];
+  /*
+    When a storage engine method is called, the transaction must
+    have been started, unless it's a DDL call, for which the
+    storage engine starts the transaction internally, and commits
+    it internally, without registering in the ha_list.
+    Unfortunately here we can't know know for sure if the engine
+    has registered the transaction or not, so we must check.
+  */
+  if (ha_info->is_started())
+  {
+    DBUG_ASSERT(has_transactions());
+    /*
+      table_share can be NULL in ha_delete_table(). See implementation
+      of standalone function ha_delete_table() in sql_base.cc.
+    */
+    if (table_share == NULL || table_share->tmp_table == NO_TMP_TABLE)
+      ha_info->set_trx_read_write();
+  }
+}
+
 
 /**
   Repair table: public interface.
@@ -2516,6 +2948,9 @@ int handler::ha_check(THD *thd, HA_CHECK_OPT *check_opt)
 int handler::ha_repair(THD* thd, HA_CHECK_OPT* check_opt)
 {
   int result;
+
+  mark_trx_read_write();
+
   if ((result= repair(thd, check_opt)))
     return result;
   return update_frm_version(table);
@@ -2532,6 +2967,8 @@ int
 handler::ha_bulk_update_row(const uchar *old_data, uchar *new_data,
                             uint *dup_key_found)
 {
+  mark_trx_read_write();
+
   return bulk_update_row(old_data, new_data, dup_key_found);
 }
 
@@ -2545,6 +2982,8 @@ handler::ha_bulk_update_row(const uchar *old_data, uchar *new_data,
 int
 handler::ha_delete_all_rows()
 {
+  mark_trx_read_write();
+
   return delete_all_rows();
 }
 
@@ -2558,6 +2997,8 @@ handler::ha_delete_all_rows()
 int
 handler::ha_reset_auto_increment(ulonglong value)
 {
+  mark_trx_read_write();
+
   return reset_auto_increment(value);
 }
 
@@ -2571,6 +3012,8 @@ handler::ha_reset_auto_increment(ulonglong value)
 int
 handler::ha_backup(THD* thd, HA_CHECK_OPT* check_opt)
 {
+  mark_trx_read_write();
+
   return backup(thd, check_opt);
 }
 
@@ -2584,6 +3027,8 @@ handler::ha_backup(THD* thd, HA_CHECK_OPT* check_opt)
 int
 handler::ha_restore(THD* thd, HA_CHECK_OPT* check_opt)
 {
+  mark_trx_read_write();
+
   return restore(thd, check_opt);
 }
 
@@ -2597,6 +3042,8 @@ handler::ha_restore(THD* thd, HA_CHECK_OPT* check_opt)
 int
 handler::ha_optimize(THD* thd, HA_CHECK_OPT* check_opt)
 {
+  mark_trx_read_write();
+
   return optimize(thd, check_opt);
 }
 
@@ -2610,6 +3057,8 @@ handler::ha_optimize(THD* thd, HA_CHECK_OPT* check_opt)
 int
 handler::ha_analyze(THD* thd, HA_CHECK_OPT* check_opt)
 {
+  mark_trx_read_write();
+
   return analyze(thd, check_opt);
 }
 
@@ -2623,6 +3072,8 @@ handler::ha_analyze(THD* thd, HA_CHECK_OPT* check_opt)
 bool
 handler::ha_check_and_repair(THD *thd)
 {
+  mark_trx_read_write();
+
   return check_and_repair(thd);
 }
 
@@ -2636,6 +3087,8 @@ handler::ha_check_and_repair(THD *thd)
 int
 handler::ha_disable_indexes(uint mode)
 {
+  mark_trx_read_write();
+
   return disable_indexes(mode);
 }
 
@@ -2649,6 +3102,8 @@ handler::ha_disable_indexes(uint mode)
 int
 handler::ha_enable_indexes(uint mode)
 {
+  mark_trx_read_write();
+
   return enable_indexes(mode);
 }
 
@@ -2662,6 +3117,8 @@ handler::ha_enable_indexes(uint mode)
 int
 handler::ha_discard_or_import_tablespace(my_bool discard)
 {
+  mark_trx_read_write();
+
   return discard_or_import_tablespace(discard);
 }
 
@@ -2677,6 +3134,8 @@ handler::ha_discard_or_import_tablespace(my_bool discard)
 void
 handler::ha_prepare_for_alter()
 {
+  mark_trx_read_write();
+
   prepare_for_alter();
 }
 
@@ -2690,6 +3149,8 @@ handler::ha_prepare_for_alter()
 int
 handler::ha_rename_table(const char *from, const char *to)
 {
+  mark_trx_read_write();
+
   return rename_table(from, to);
 }
 
@@ -2703,6 +3164,8 @@ handler::ha_rename_table(const char *from, const char *to)
 int
 handler::ha_delete_table(const char *name)
 {
+  mark_trx_read_write();
+
   return delete_table(name);
 }
 
@@ -2716,6 +3179,8 @@ handler::ha_delete_table(const char *name)
 void
 handler::ha_drop_table(const char *name)
 {
+  mark_trx_read_write();
+
   return drop_table(name);
 }
 
@@ -2729,6 +3194,8 @@ handler::ha_drop_table(const char *name)
 int
 handler::ha_create(const char *name, TABLE *form, HA_CREATE_INFO *info)
 {
+  mark_trx_read_write();
+
   return create(name, form, info);
 }
 
@@ -2743,6 +3210,8 @@ int
 handler::ha_create_handler_files(const char *name, const char *old_name,
                         int action_flag, HA_CREATE_INFO *info)
 {
+  mark_trx_read_write();
+
   return create_handler_files(name, old_name, action_flag, info);
 }
 
@@ -2761,6 +3230,8 @@ handler::ha_change_partitions(HA_CREATE_INFO *create_info,
                      const uchar *pack_frm_data,
                      size_t pack_frm_len)
 {
+  mark_trx_read_write();
+
   return change_partitions(create_info, path, copied, deleted,
                            pack_frm_data, pack_frm_len);
 }
@@ -2775,6 +3246,8 @@ handler::ha_change_partitions(HA_CREATE_INFO *create_info,
 int
 handler::ha_drop_partitions(const char *path)
 {
+  mark_trx_read_write();
+
   return drop_partitions(path);
 }
 
@@ -2788,6 +3261,8 @@ handler::ha_drop_partitions(const char *path)
 int
 handler::ha_rename_partitions(const char *path)
 {
+  mark_trx_read_write();
+
   return rename_partitions(path);
 }
 
@@ -2801,6 +3276,8 @@ handler::ha_rename_partitions(const char *path)
 int
 handler::ha_optimize_partitions(THD *thd)
 {
+  mark_trx_read_write();
+
   return optimize_partitions(thd);
 }
 
@@ -2814,6 +3291,8 @@ handler::ha_optimize_partitions(THD *thd)
 int
 handler::ha_analyze_partitions(THD *thd)
 {
+  mark_trx_read_write();
+
   return analyze_partitions(thd);
 }
 
@@ -2827,6 +3306,8 @@ handler::ha_analyze_partitions(THD *thd)
 int
 handler::ha_check_partitions(THD *thd)
 {
+  mark_trx_read_write();
+
   return check_partitions(thd);
 }
 
@@ -2840,6 +3321,8 @@ handler::ha_check_partitions(THD *thd)
 int
 handler::ha_repair_partitions(THD *thd)
 {
+  mark_trx_read_write();
+
   return repair_partitions(thd);
 }
 
@@ -2866,7 +3349,7 @@ int ha_enable_transaction(THD *thd, bool on)
       is an optimization hint that storage engine is free to ignore.
       So, let's commit an open transaction (if any) now.
     */
-    if (!(error= ha_commit_stmt(thd)))
+    if (!(error= ha_commit_trans(thd, 0)))
       error= end_trans(thd, COMMIT);
   }
   DBUG_RETURN(error);
@@ -3826,7 +4309,7 @@ bool ha_show_status(THD *thd, handlerton *db_type, enum ha_stat_type stat)
   }
 
   if (!result)
-    send_eof(thd);
+    my_eof(thd);
   return result;
 }
 
@@ -4044,6 +4527,9 @@ int handler::ha_write_row(uchar *buf)
 {
   int error;
   DBUG_ENTER("handler::ha_write_row");
+
+  mark_trx_read_write();
+
   if (unlikely(error= write_row(buf)))
     DBUG_RETURN(error);
   if (unlikely(error= binlog_log_row<Write_rows_log_event>(table, 0, buf)))
@@ -4062,6 +4548,8 @@ int handler::ha_update_row(const uchar *old_data, uchar *new_data)
    */
   DBUG_ASSERT(new_data == table->record[0]);
 
+  mark_trx_read_write();
+
   if (unlikely(error= update_row(old_data, new_data)))
     return error;
   if (unlikely(error= binlog_log_row<Update_rows_log_event>(table, old_data, new_data)))
@@ -4072,6 +4560,9 @@ int handler::ha_update_row(const uchar *old_data, uchar *new_data)
 int handler::ha_delete_row(const uchar *buf)
 {
   int error;
+
+  mark_trx_read_write();
+
   if (unlikely(error= delete_row(buf)))
     return error;
   if (unlikely(error= binlog_log_row<Delete_rows_log_event>(table, buf, 0)))