Fix for bug #52044 "FLUSH TABLES WITH READ LOCK and FLUSH TABLES <list> WITH READ LOCK are incompatible".

The problem was that a FLUSH TABLES <list> WITH READ LOCK statement issued while another connection held the global read lock (acquired with FLUSH TABLES WITH READ LOCK) was blocked and had to wait until the global read lock was released. This issue stemmed from the fact that the FLUSH TABLES <list> WITH READ LOCK implementation acquired X metadata locks on the tables to be flushed. Since these locks required acquiring the global IX lock, this statement was incompatible with the global read lock. This patch addresses the problem by using the SNW type of metadata lock for tables to be flushed by FLUSH TABLES <list> WITH READ LOCK. It is OK to acquire such locks without the global IX lock as long as we don't try to upgrade them. Since SNW locks allow concurrent statements to use the same table, FLUSH TABLES <list> WITH READ LOCK now has to wait, after acquiring the metadata locks, until old versions of the tables to be flushed go away. Since such waiting can lead to a deadlock, the MDL deadlock detector was extended to take waits for flush into account and to resolve such deadlocks. As a bonus, the code in open_tables() which was responsible for waiting for old versions of tables to go away was refactored. Now, when we encounter an old version of a table in open_table(), we don't back off and wait for all old versions to go away, but instead wait for this particular table to be flushed. Such an approach, supported by deadlock detection, should reduce the number of scenarios in which FLUSH TABLES aborts concurrent multi-statement transactions. Note that an active FLUSH TABLES <list> WITH READ LOCK still blocks a concurrent FLUSH TABLES WITH READ LOCK statement, as the former keeps tables open and thus prevents the latter statement from doing the flush.
author: Dmitry Lenev <dlenev@mysql.com> 2010-07-27 17:34:58 +0400
committer: Dmitry Lenev <dlenev@mysql.com> 2010-07-27 17:34:58 +0400
commit: 5fff906edd3d7a5d999cec5403f009f33f8dfb81 (patch)
tree: bef62913efddc244d466f7ff730bcc4205357491 /sql
parent: ec2c3bf2c1c27e4401c767a6cdcb3172453ff42c (diff)
download: mariadb-git-5fff906edd3d7a5d999cec5403f009f33f8dfb81.tar.gz
15 files changed, 606 insertions, 287 deletions
diff --git a/sql/ha_ndbcluster.cc b/sql/ha_ndbcluster.cc
index 7cac8373bc4..e3bafe36fb7 100644
--- a/sql/ha_ndbcluster.cc
+++ b/sql/ha_ndbcluster.cc
@@ -679,7 +679,7 @@ int ha_ndbcluster::ndb_err(NdbTransaction *trans)
     bzero((char*) &table_list,sizeof(table_list));
     table_list.db= m_dbname;
     table_list.alias= table_list.table_name= m_tabname;
-    close_cached_tables(thd, &table_list, FALSE, FALSE);
+    close_cached_tables(thd, &table_list, FALSE, FALSE, LONG_TIMEOUT);
     break;
   }
   default:
@@ -8452,7 +8452,7 @@ int handle_trailing_share(NDB_SHARE *share)
   table_list.db= share->db;
   table_list.alias= table_list.table_name= share->table_name;
   mysql_mutex_assert_owner(&LOCK_open);
-  close_cached_tables(thd, &table_list, TRUE, FALSE);
+  close_cached_tables(thd, &table_list, TRUE, FALSE, LONG_TIMEOUT);
 
   mysql_mutex_lock(&ndbcluster_mutex);
   /* ndb_share reference temporary free */
diff --git a/sql/ha_ndbcluster_binlog.cc b/sql/ha_ndbcluster_binlog.cc
index b610687496e..e7ec6d67d52 100644
--- a/sql/ha_ndbcluster_binlog.cc
+++ b/sql/ha_ndbcluster_binlog.cc
@@ -937,7 +937,7 @@ int ndbcluster_setup_binlog_table_shares(THD *thd)
     ndb_binlog_tables_inited= TRUE;
     if (opt_ndb_extra_logging)
       sql_print_information("NDB Binlog: ndb tables writable");
-    close_cached_tables(NULL, NULL, TRUE, FALSE);
+    close_cached_tables(NULL, NULL, TRUE, FALSE, LONG_TIMEOUT);
     mysql_mutex_unlock(&LOCK_open);
     /* Signal injector thread that all is setup */
     mysql_cond_signal(&injector_cond);
@@ -1751,7 +1751,7 @@ ndb_handle_schema_change(THD *thd, Ndb *ndb, NdbEventOperation *pOp,
       bzero((char*) &table_list,sizeof(table_list));
       table_list.db= (char *)dbname;
       table_list.alias= table_list.table_name= (char *)tabname;
-      close_cached_tables(thd, &table_list, TRUE, FALSE);
+      close_cached_tables(thd, &table_list, TRUE, FALSE, LONG_TIMEOUT);
 
       if ((error= ndbcluster_binlog_open_table(thd, share,
                                                table_share, table, 1)))
@@ -1857,7 +1857,7 @@ ndb_handle_schema_change(THD *thd, Ndb *ndb, NdbEventOperation *pOp,
     bzero((char*) &table_list,sizeof(table_list));
     table_list.db= (char *)dbname;
     table_list.alias= table_list.table_name= (char *)tabname;
-    close_cached_tables(thd, &table_list, FALSE, FALSE);
+    close_cached_tables(thd, &table_list, FALSE, FALSE, LONG_TIMEOUT);
     /* ndb_share reference create free */
     DBUG_PRINT("NDB_SHARE", ("%s create free  use_count: %u",
                              share->key, share->use_count));
@@ -1978,7 +1978,7 @@ ndb_binlog_thread_handle_schema_event(THD *thd, Ndb *ndb,
             bzero((char*) &table_list,sizeof(table_list));
             table_list.db= schema->db;
             table_list.alias= table_list.table_name= schema->name;
-            close_cached_tables(thd, &table_list, FALSE, FALSE);
+            close_cached_tables(thd, &table_list, FALSE, FALSE, LONG_TIMEOUT);
           }
           /* ndb_share reference temporary free */
           if (share)
@@ -2095,7 +2095,7 @@ ndb_binlog_thread_handle_schema_event(THD *thd, Ndb *ndb,
       mysql_mutex_unlock(&ndb_schema_share_mutex);
       /* end protect ndb_schema_share */
 
-      close_cached_tables(NULL, NULL, FALSE, FALSE);
+      close_cached_tables(NULL, NULL, FALSE, FALSE, LONG_TIMEOUT);
       // fall through
     case NDBEVENT::TE_ALTER:
       ndb_handle_schema_change(thd, ndb, pOp, tmp_share);
@@ -2252,7 +2252,7 @@ ndb_binlog_thread_handle_schema_event_post_epoch(THD *thd,
           bzero((char*) &table_list,sizeof(table_list));
           table_list.db= schema->db;
           table_list.alias= table_list.table_name= schema->name;
-          close_cached_tables(thd, &table_list, FALSE, FALSE);
+          close_cached_tables(thd, &table_list, FALSE, FALSE, LONG_TIMEOUT);
         }
         if (schema_type != SOT_ALTER_TABLE)
           break;
diff --git a/sql/lock.cc b/sql/lock.cc
index 7c0acb58e7c..24566a04463 100644
--- a/sql/lock.cc
+++ b/sql/lock.cc
@@ -1298,27 +1298,19 @@ bool Global_read_lock::make_global_read_lock_block_commit(THD *thd)
 
 
 /**
-  Broadcast COND_refresh and COND_global_read_lock.
-
-    Due to a bug in a threading library it could happen that a signal
-    did not reach its target. A condition for this was that the same
-    condition variable was used with different mutexes in
-    mysql_cond_wait(). Some time ago we changed LOCK_open to
-    LOCK_global_read_lock in global read lock handling. So COND_refresh
-    was used with LOCK_open and LOCK_global_read_lock.
-
-    We did now also change from COND_refresh to COND_global_read_lock
-    in global read lock handling. But now it is necessary to signal
-    both conditions at the same time.
-
-  @note
-    When signalling COND_global_read_lock within the global read lock
-    handling, it is not necessary to also signal COND_refresh.
+  Broadcast COND_global_read_lock.
+
+  TODO/FIXME: Dmitry thinks that we broadcast on COND_global_read_lock
+              when old instance of table is closed to avoid races
+              between incrementing refresh_version and
+              wait_if_global_read_lock(thd, TRUE, FALSE) call.
+              Once global read lock implementation starts using MDL
+              infrastructure this will become unnecessary and should
+              be removed.
 */
 
 void broadcast_refresh(void)
 {
-  mysql_cond_broadcast(&COND_refresh);
   mysql_cond_broadcast(&COND_global_read_lock);
 }
 
diff --git a/sql/mdl.cc b/sql/mdl.cc
index ca66799baed..61a43c83409 100644
--- a/sql/mdl.cc
+++ b/sql/mdl.cc
@@ -98,70 +98,6 @@ private:
 };
 
 
-enum enum_deadlock_weight
-{
-  MDL_DEADLOCK_WEIGHT_DML= 0,
-  MDL_DEADLOCK_WEIGHT_DDL= 100
-};
-
-
-/**
-  A context of the recursive traversal through all contexts
-  in all sessions in search for deadlock.
-*/
-
-class Deadlock_detection_visitor
-{
-public:
-  Deadlock_detection_visitor(MDL_context *start_node_arg)
-    : m_start_node(start_node_arg),
-      m_victim(NULL),
-      m_current_search_depth(0)
-  {}
-  bool enter_node(MDL_context * /* unused */);
-  void leave_node(MDL_context * /* unused */);
-
-  bool inspect_edge(MDL_context *dest);
-
-  MDL_context *get_victim() const { return m_victim; }
-
-  /**
-    Change the deadlock victim to a new one if it has lower deadlock
-    weight.
-  */
-  MDL_context *opt_change_victim_to(MDL_context *new_victim);
-private:
-  /**
-    The context which has initiated the search. There
-    can be multiple searches happening in parallel at the same time.
-  */
-  MDL_context *m_start_node;
-  /** If a deadlock is found, the context that identifies the victim. */
-  MDL_context *m_victim;
-  /** Set to the 0 at start. Increased whenever
-    we descend into another MDL context (aka traverse to the next
-    wait-for graph node). When MAX_SEARCH_DEPTH is reached, we
-    assume that a deadlock is found, even if we have not found a
-    loop.
-  */
-  uint m_current_search_depth;
-  /**
-    Maximum depth for deadlock searches. After this depth is
-    achieved we will unconditionally declare that there is a
-    deadlock.
-
-    @note This depth should be small enough to avoid stack
-          being exhausted by recursive search algorithm.
-
-    TODO: Find out what is the optimal value for this parameter.
-          Current value is safe, but probably sub-optimal,
-          as there is an anecdotal evidence that real-life
-          deadlocks are even shorter typically.
-  */
-  static const uint MAX_SEARCH_DEPTH= 32;
-};
-
-
 /**
   Enter a node of a wait-for graph. After
   a node is entered, inspect_edge() will be called
@@ -876,7 +812,7 @@ void MDL_ticket::destroy(MDL_ticket *ticket)
 uint MDL_ticket::get_deadlock_weight() const
 {
   return (m_lock->key.mdl_namespace() == MDL_key::GLOBAL ||
-          m_type > MDL_SHARED_NO_WRITE ?
+          m_type >= MDL_SHARED_NO_WRITE ?
           MDL_DEADLOCK_WEIGHT_DDL : MDL_DEADLOCK_WEIGHT_DML);
 }
 
@@ -1528,9 +1464,8 @@ MDL_context::try_acquire_lock_impl(MDL_request *mdl_request,
   MDL_ticket *ticket;
   bool is_transactional;
 
-  DBUG_ASSERT(mdl_request->type < MDL_SHARED_NO_WRITE ||
-              (is_lock_owner(MDL_key::GLOBAL, "", "",
-                             MDL_INTENTION_EXCLUSIVE)));
+  DBUG_ASSERT(mdl_request->type != MDL_EXCLUSIVE ||
+              is_lock_owner(MDL_key::GLOBAL, "", "", MDL_INTENTION_EXCLUSIVE));
   DBUG_ASSERT(mdl_request->ticket == NULL);
 
   /* Don't take chances in production. */
@@ -2088,6 +2023,21 @@ end:
 
 
 /**
+  Traverse portion of wait-for graph which is reachable through edge
+  represented by this ticket in search for deadlocks.
+
+  @retval TRUE  A deadlock is found. A victim is remembered
+                by the visitor.
+  @retval FALSE
+*/
+
+bool MDL_ticket::find_deadlock(Deadlock_detection_visitor *dvisitor)
+{
+  return m_lock->find_deadlock(this, dvisitor);
+}
+
+
+/**
   Recursively traverse the wait-for graph of MDL contexts
   in search for deadlocks.
 
@@ -2105,7 +2055,7 @@ bool MDL_context::find_deadlock(Deadlock_detection_visitor *dvisitor)
 
   if (m_waiting_for)
   {
-    result= m_waiting_for->m_lock->find_deadlock(m_waiting_for, dvisitor);
+    result= m_waiting_for->find_deadlock(dvisitor);
     if (result)
       m_unlock_ctx= dvisitor->opt_change_victim_to(this);
   }
diff --git a/sql/mdl.h b/sql/mdl.h
index c8acd69c0f1..d7fbb14a140 100644
--- a/sql/mdl.h
+++ b/sql/mdl.h
@@ -34,7 +34,6 @@ class THD;
 class MDL_context;
 class MDL_lock;
 class MDL_ticket;
-class Deadlock_detection_visitor;
 
 /**
   Type of metadata lock request.
@@ -360,6 +359,96 @@ public:
 
 typedef void (*mdl_cached_object_release_hook)(void *);
 
+
+enum enum_deadlock_weight
+{
+  MDL_DEADLOCK_WEIGHT_DML= 0,
+  MDL_DEADLOCK_WEIGHT_DDL= 100
+};
+
+
+/**
+  A context of the recursive traversal through all contexts
+  in all sessions in search for deadlock.
+*/
+
+class Deadlock_detection_visitor
+{
+public:
+  Deadlock_detection_visitor(MDL_context *start_node_arg)
+    : m_start_node(start_node_arg),
+      m_victim(NULL),
+      m_current_search_depth(0),
+      m_table_shares_visited(0)
+  {}
+  bool enter_node(MDL_context * /* unused */);
+  void leave_node(MDL_context * /* unused */);
+
+  bool inspect_edge(MDL_context *dest);
+
+  MDL_context *get_victim() const { return m_victim; }
+
+  /**
+    Change the deadlock victim to a new one if it has lower deadlock
+    weight.
+  */
+  MDL_context *opt_change_victim_to(MDL_context *new_victim);
+private:
+  /**
+    The context which has initiated the search. There
+    can be multiple searches happening in parallel at the same time.
+  */
+  MDL_context *m_start_node;
+  /** If a deadlock is found, the context that identifies the victim. */
+  MDL_context *m_victim;
+  /** Set to the 0 at start. Increased whenever
+    we descend into another MDL context (aka traverse to the next
+    wait-for graph node). When MAX_SEARCH_DEPTH is reached, we
+    assume that a deadlock is found, even if we have not found a
+    loop.
+  */
+  uint m_current_search_depth;
+  /**
+    Maximum depth for deadlock searches. After this depth is
+    achieved we will unconditionally declare that there is a
+    deadlock.
+
+    @note This depth should be small enough to avoid stack
+          being exhausted by recursive search algorithm.
+
+    TODO: Find out what is the optimal value for this parameter.
+          Current value is safe, but probably sub-optimal,
+          as there is an anecdotal evidence that real-life
+          deadlocks are even shorter typically.
+  */
+  static const uint MAX_SEARCH_DEPTH= 32;
+
+public:
+  /**
+    Number of TABLE_SHARE objects visited by deadlock detector so far.
+    Used by TABLE_SHARE::find_deadlock() method to implement recursive
+    locking for LOCK_open mutex.
+  */
+  uint m_table_shares_visited;
+};
+
+
+/**
+  Abstract class representing edge in waiters graph to be
+  traversed by deadlock detection algorithm.
+*/
+
+class Wait_for_edge
+{
+public:
+  virtual ~Wait_for_edge() {};
+
+  virtual bool find_deadlock(Deadlock_detection_visitor *dvisitor) = 0;
+
+  virtual uint get_deadlock_weight() const = 0;
+};
+
+
 /**
   A granted metadata lock.
 
@@ -380,7 +469,7 @@ typedef void (*mdl_cached_object_release_hook)(void *);
           threads/contexts.
 */
 
-class MDL_ticket
+class MDL_ticket : public Wait_for_edge
 {
 public:
   /**
@@ -414,6 +503,7 @@ public:
   bool is_incompatible_when_granted(enum_mdl_type type) const;
   bool is_incompatible_when_waiting(enum_mdl_type type) const;
 
+  bool find_deadlock(Deadlock_detection_visitor *dvisitor);
   /* A helper used to determine which lock request should be aborted. */
   uint get_deadlock_weight() const;
 private:
@@ -680,7 +770,7 @@ private:
     by inspecting waiting queues, but we'd very much like it to be
     readily available to the wait-for graph iterator.
    */
-  MDL_ticket *m_waiting_for;
+  Wait_for_edge *m_waiting_for;
 private:
   MDL_ticket *find_ticket(MDL_request *mdl_req,
                           bool *is_transactional);
@@ -688,10 +778,11 @@ private:
   bool try_acquire_lock_impl(MDL_request *mdl_request,
                              MDL_ticket **out_ticket);
 
+public:
   void find_deadlock();
 
   /** Inform the deadlock detector there is an edge in the wait-for graph. */
-  void will_wait_for(MDL_ticket *pending_ticket)
+  void will_wait_for(Wait_for_edge *pending_ticket)
   {
     mysql_prlock_wrlock(&m_LOCK_waiting_for);
     m_waiting_for= pending_ticket;
diff --git a/sql/mysqld.cc b/sql/mysqld.cc
index 375c96bdec4..2b4547a299c 100644
--- a/sql/mysqld.cc
+++ b/sql/mysqld.cc
@@ -634,7 +634,7 @@ mysql_mutex_t LOCK_des_key_file;
 mysql_rwlock_t LOCK_grant, LOCK_sys_init_connect, LOCK_sys_init_slave;
 mysql_rwlock_t LOCK_system_variables_hash;
 mysql_cond_t COND_thread_count;
-mysql_cond_t COND_refresh, COND_global_read_lock;
+mysql_cond_t COND_global_read_lock;
 pthread_t signal_thread;
 pthread_attr_t connection_attrib;
 mysql_mutex_t LOCK_server_started;
@@ -1573,7 +1573,6 @@ static void clean_up_mutexes()
   mysql_mutex_destroy(&LOCK_prepared_stmt_count);
   mysql_mutex_destroy(&LOCK_error_messages);
   mysql_cond_destroy(&COND_thread_count);
-  mysql_cond_destroy(&COND_refresh);
   mysql_cond_destroy(&COND_global_read_lock);
   mysql_cond_destroy(&COND_thread_cache);
   mysql_cond_destroy(&COND_flush_thread_cache);
@@ -3564,7 +3563,6 @@ static int init_thread_environment()
   mysql_rwlock_init(key_rwlock_LOCK_sys_init_slave, &LOCK_sys_init_slave);
   mysql_rwlock_init(key_rwlock_LOCK_grant, &LOCK_grant);
   mysql_cond_init(key_COND_thread_count, &COND_thread_count, NULL);
-  mysql_cond_init(key_COND_refresh, &COND_refresh, NULL);
   mysql_cond_init(key_COND_global_read_lock, &COND_global_read_lock, NULL);
   mysql_cond_init(key_COND_thread_cache, &COND_thread_cache, NULL);
   mysql_cond_init(key_COND_flush_thread_cache, &COND_flush_thread_cache, NULL);
@@ -7786,7 +7784,7 @@ PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool;
 
 PSI_cond_key key_BINLOG_COND_prep_xids, key_BINLOG_update_cond,
   key_COND_cache_status_changed, key_COND_global_read_lock, key_COND_manager,
-  key_COND_refresh, key_COND_rpl_status, key_COND_server_started,
+  key_COND_rpl_status, key_COND_server_started,
   key_delayed_insert_cond, key_delayed_insert_cond_client,
   key_item_func_sleep_cond, key_master_info_data_cond,
   key_master_info_start_cond, key_master_info_stop_cond,
@@ -7810,7 +7808,6 @@ static PSI_cond_info all_server_conds[]=
   { &key_COND_cache_status_changed, "Query_cache::COND_cache_status_changed", 0},
   { &key_COND_global_read_lock, "COND_global_read_lock", PSI_FLAG_GLOBAL},
   { &key_COND_manager, "COND_manager", PSI_FLAG_GLOBAL},
-  { &key_COND_refresh, "COND_refresh", PSI_FLAG_GLOBAL},
   { &key_COND_rpl_status, "COND_rpl_status", PSI_FLAG_GLOBAL},
   { &key_COND_server_started, "COND_server_started", PSI_FLAG_GLOBAL},
   { &key_delayed_insert_cond, "Delayed_insert::cond", 0},
diff --git a/sql/mysqld.h b/sql/mysqld.h
index b07d148f507..74d840d55cb 100644
--- a/sql/mysqld.h
+++ b/sql/mysqld.h
@@ -255,7 +255,7 @@ extern PSI_cond_key key_PAGE_cond, key_COND_active, key_COND_pool;
 
 extern PSI_cond_key key_BINLOG_COND_prep_xids, key_BINLOG_update_cond,
   key_COND_cache_status_changed, key_COND_global_read_lock, key_COND_manager,
-  key_COND_refresh, key_COND_rpl_status, key_COND_server_started,
+  key_COND_rpl_status, key_COND_server_started,
   key_delayed_insert_cond, key_delayed_insert_cond_client,
   key_item_func_sleep_cond, key_master_info_data_cond,
   key_master_info_start_cond, key_master_info_stop_cond,
@@ -339,7 +339,7 @@ extern mysql_cond_t COND_server_started;
 extern mysql_rwlock_t LOCK_grant, LOCK_sys_init_connect, LOCK_sys_init_slave;
 extern mysql_rwlock_t LOCK_system_variables_hash;
 extern mysql_cond_t COND_thread_count;
-extern mysql_cond_t COND_refresh, COND_manager;
+extern mysql_cond_t COND_manager;
 extern mysql_cond_t COND_global_read_lock;
 extern int32 thread_running;
 extern my_atomic_rwlock_t thread_running_lock;
diff --git a/sql/sql_base.cc b/sql/sql_base.cc
index 7a59fefdddd..74f03d8d3c6 100644
--- a/sql/sql_base.cc
+++ b/sql/sql_base.cc
@@ -146,9 +146,6 @@ static bool check_and_update_table_version(THD *thd, TABLE_LIST *tables,
 static bool open_table_entry_fini(THD *thd, TABLE_SHARE *share, TABLE *entry);
 static bool auto_repair_table(THD *thd, TABLE_LIST *table_list);
 static void free_cache_entry(TABLE *entry);
-static bool tdc_wait_for_old_versions(THD *thd,
-                                      MDL_request_list *mdl_requests,
-                                      ulong timeout);
 static bool
 has_write_table_with_auto_increment(TABLE_LIST *tables);
 
@@ -315,7 +312,7 @@ void table_def_start_shutdown(void)
   {
     mysql_mutex_lock(&LOCK_open);
     /* Free all cached but unused TABLEs and TABLE_SHAREs first. */
-    close_cached_tables(NULL, NULL, TRUE, FALSE);
+    close_cached_tables(NULL, NULL, TRUE, FALSE, LONG_TIMEOUT);
     /*
       Ensure that TABLE and TABLE_SHARE objects which are created for
       tables that are open during process of plugins' shutdown are
@@ -928,6 +925,7 @@ static void kill_delayed_threads_for_table(TABLE_SHARE *share)
   @param tables List of tables to remove from the cache
   @param have_lock If LOCK_open is locked
   @param wait_for_refresh Wait for a impending flush
+  @param timeout Timeout for waiting for flush to be completed.
 
   @note THD can be NULL, but then wait_for_refresh must be FALSE
         and tables must be NULL.
@@ -941,10 +939,11 @@ static void kill_delayed_threads_for_table(TABLE_SHARE *share)
 */
 
 bool close_cached_tables(THD *thd, TABLE_LIST *tables, bool have_lock,
-                         bool wait_for_refresh)
+                         bool wait_for_refresh, ulong timeout)
 {
   bool result= FALSE;
   bool found= TRUE;
+  struct timespec abstime;
   DBUG_ENTER("close_cached_tables");
   DBUG_ASSERT(thd || (!wait_for_refresh && !tables));
 
@@ -952,7 +951,16 @@ bool close_cached_tables(THD *thd, TABLE_LIST *tables, bool have_lock,
     mysql_mutex_lock(&LOCK_open);
   if (!tables)
   {
-    refresh_version++;				// Force close of open tables
+    /*
+      Force close of all open tables.
+
+      Note that code in TABLE_SHARE::wait_until_flushed() assumes that
+      incrementing of refresh_version and removal of unused tables and
+      shares from TDC happens atomically under protection of LOCK_open,
+      or putting it another way that TDC does not contain old shares
+      which don't have any tables used.
+    */
+    refresh_version++;
     DBUG_PRINT("tcache", ("incremented global refresh_version to: %lu",
                           refresh_version));
     kill_delayed_threads();
@@ -995,6 +1003,8 @@ bool close_cached_tables(THD *thd, TABLE_LIST *tables, bool have_lock,
   /* Code below assume that LOCK_open is released. */
   DBUG_ASSERT(!have_lock);
 
+  set_timespec(abstime, timeout);
+
   if (thd->locked_tables_mode)
   {
     /*
@@ -1034,6 +1044,7 @@ bool close_cached_tables(THD *thd, TABLE_LIST *tables, bool have_lock,
 
   while (found && ! thd->killed)
   {
+    TABLE_SHARE *share;
     found= FALSE;
     /*
       To a self-deadlock or deadlocks with other FLUSH threads
@@ -1044,13 +1055,11 @@ bool close_cached_tables(THD *thd, TABLE_LIST *tables, bool have_lock,
 
     mysql_mutex_lock(&LOCK_open);
 
-    thd->enter_cond(&COND_refresh, &LOCK_open, "Flushing tables");
-
     if (!tables)
     {
       for (uint idx=0 ; idx < table_def_cache.records ; idx++)
       {
-        TABLE_SHARE *share=(TABLE_SHARE*) my_hash_element(&table_def_cache,
+        share= (TABLE_SHARE*) my_hash_element(&table_def_cache,
                                                           idx);
         if (share->needs_reopen())
         {
@@ -1063,7 +1072,7 @@ bool close_cached_tables(THD *thd, TABLE_LIST *tables, bool have_lock,
     {
       for (TABLE_LIST *table= tables; table; table= table->next_local)
       {
-        TABLE_SHARE *share= get_cached_table_share(table->db, table->table_name);
+        share= get_cached_table_share(table->db, table->table_name);
         if (share && share->needs_reopen())
         {
 	  found= TRUE;
@@ -1074,11 +1083,17 @@ bool close_cached_tables(THD *thd, TABLE_LIST *tables, bool have_lock,
 
     if (found)
     {
-      DBUG_PRINT("signal", ("Waiting for COND_refresh"));
-      mysql_cond_wait(&COND_refresh, &LOCK_open);
+      /* The method below will unlock LOCK_open and free the share's memory. */
+      if (share->wait_until_flushed(&thd->mdl_context, &abstime,
+                                    MDL_DEADLOCK_WEIGHT_DDL))
+      {
+        mysql_mutex_unlock(&LOCK_open);
+        result= TRUE;
+        goto err_with_reopen;
+      }
     }
 
-    thd->exit_cond(NULL);
+    mysql_mutex_unlock(&LOCK_open);
   }
 
 err_with_reopen:
@@ -1149,7 +1164,7 @@ bool close_cached_connection_tables(THD *thd, bool if_wait_for_refresh,
   }
 
   if (tables)
-    result= close_cached_tables(thd, tables, TRUE, FALSE);
+    result= close_cached_tables(thd, tables, TRUE, FALSE, LONG_TIMEOUT);
 
   if (!have_lock)
     mysql_mutex_unlock(&LOCK_open);
@@ -2347,7 +2362,7 @@ bool MDL_deadlock_handler::handle_condition(THD *,
   {
     /* Disable the handler to avoid infinite recursion. */
     m_is_active= TRUE;
-    (void) m_ot_ctx->request_backoff_action(Open_table_context::OT_MDL_CONFLICT,
+    (void) m_ot_ctx->request_backoff_action(Open_table_context::OT_CONFLICT,
                                             NULL);
     m_is_active= FALSE;
     /*
@@ -2394,6 +2409,8 @@ open_table_get_mdl_lock(THD *thd, Open_table_context *ot_ctx,
                         uint flags,
                         MDL_ticket **mdl_ticket)
 {
+  MDL_request mdl_request_shared;
+
   if (flags & (MYSQL_OPEN_FORCE_SHARED_MDL |
                MYSQL_OPEN_FORCE_SHARED_HIGH_PRIO_MDL))
   {
@@ -2419,16 +2436,12 @@ open_table_get_mdl_lock(THD *thd, Open_table_context *ot_ctx,
     DBUG_ASSERT(!(flags & MYSQL_OPEN_FORCE_SHARED_MDL) ||
                 !(flags & MYSQL_OPEN_FORCE_SHARED_HIGH_PRIO_MDL));
 
-    mdl_request= new (thd->mem_root) MDL_request(mdl_request);
-    if (mdl_request == NULL)
-      return TRUE;
-
-    mdl_request->set_type((flags & MYSQL_OPEN_FORCE_SHARED_MDL) ?
-                          MDL_SHARED : MDL_SHARED_HIGH_PRIO);
+    mdl_request_shared.init(&mdl_request->key,
+                            (flags & MYSQL_OPEN_FORCE_SHARED_MDL) ?
+                            MDL_SHARED : MDL_SHARED_HIGH_PRIO);
+    mdl_request= &mdl_request_shared;
   }
 
-  ot_ctx->add_request(mdl_request);
-
   if (flags & MYSQL_OPEN_FAIL_ON_MDL_CONFLICT)
   {
     /*
@@ -2491,6 +2504,38 @@ open_table_get_mdl_lock(THD *thd, Open_table_context *ot_ctx,
 }
 
 
+/**
+  Check if table's share requires a flush and, if so, wait until it
+  has been flushed.
+
+  @param thd             Thread context.
+  @param table_list      Table which share should be checked.
+  @param timeout         Timeout for waiting.
+  @param deadlock_weight Weight of this wait for deadlock detector.
+
+  @retval FALSE - Success. Share is up to date or has been flushed.
+  @retval TRUE - Error (OOM, thread was killed, wait resulted in
+                 deadlock or timeout).
+*/
+
+static bool tdc_wait_for_old_version(THD *thd, TABLE_LIST *table_list,
+                                     ulong timeout, uint deadlock_weight)
+{
+  TABLE_SHARE *share;
+
+  if ((share= get_cached_table_share(table_list->db,
+                                     table_list->table_name)) &&
+      share->needs_reopen())
+  {
+    struct timespec abstime;
+    set_timespec(abstime, timeout);
+    return share->wait_until_flushed(&thd->mdl_context, &abstime,
+                                     deadlock_weight);
+  }
+  return FALSE;
+}
+
+
 /*
   Open a table.
 
@@ -2580,8 +2625,8 @@ bool open_table(THD *thd, TABLE_LIST *table_list, MEM_ROOT *mem_root,
 
     if (thd->open_tables && thd->open_tables->s->version != refresh_version)
     {
-      (void) ot_ctx->request_backoff_action(Open_table_context::OT_WAIT_TDC,
-                                            NULL);
+      (void)ot_ctx->request_backoff_action(Open_table_context::OT_REOPEN_TABLES,
+                                           NULL);
       DBUG_RETURN(TRUE);
     }
   }
@@ -2794,6 +2839,8 @@ bool open_table(THD *thd, TABLE_LIST *table_list, MEM_ROOT *mem_root,
 
   mysql_mutex_lock(&LOCK_open);
 
+retry_share:
+
   if (!(share= get_table_share_with_create(thd, table_list, key,
                                            key_length, OPEN_VIEW,
                                            &error,
@@ -2849,31 +2896,50 @@ bool open_table(THD *thd, TABLE_LIST *table_list, MEM_ROOT *mem_root,
   if (table_list->i_s_requested_object &  OPEN_VIEW_ONLY)
     goto err_unlock;
 
-  /*
-    If the version changes while we're opening the tables,
-    we have to back off, close all the tables opened-so-far,
-    and try to reopen them. Note: refresh_version is currently
-    changed only during FLUSH TABLES.
-  */
-  if (share->needs_reopen() ||
-      (thd->open_tables && thd->open_tables->s->version != share->version))
+  if (!(flags & MYSQL_OPEN_IGNORE_FLUSH))
   {
-    if (!(flags & MYSQL_OPEN_IGNORE_FLUSH))
+    if (share->needs_reopen())
     {
-       /*
-         We already have an MDL lock. But we have encountered an old
-         version of table in the table definition cache which is possible
-         when someone changes the table version directly in the cache
-         without acquiring a metadata lock (e.g. this can happen during
-         "rolling" FLUSH TABLE(S)).
-         Note, that to avoid a "busywait" in this case, we have to wait
-         separately in the caller for old table versions to go away
-         (see tdc_wait_for_old_versions()).
-       */
+      /*
+        We already have an MDL lock. But we have encountered an old
+        version of table in the table definition cache which is possible
+        when someone changes the table version directly in the cache
+        without acquiring a metadata lock (e.g. this can happen during
+        "rolling" FLUSH TABLE(S)).
+        Release our reference to share, wait until old version of
+        share goes away and then try to get new version of table share.
+      */
+      MDL_deadlock_handler mdl_deadlock_handler(ot_ctx);
+      bool wait_result;
+
+      release_table_share(share);
+
+      thd->push_internal_handler(&mdl_deadlock_handler);
+      wait_result= tdc_wait_for_old_version(thd, table_list,
+                                            ot_ctx->get_timeout(),
+                                            mdl_ticket->get_deadlock_weight());
+      thd->pop_internal_handler();
+
+      if (wait_result)
+      {
+        mysql_mutex_unlock(&LOCK_open);
+        DBUG_RETURN(TRUE);
+      }
+      goto retry_share;
+    }
+
+    if (thd->open_tables && thd->open_tables->s->version != share->version)
+    {
+      /*
+        If the version changes while we're opening the tables,
+        we have to back off, close all the tables opened-so-far,
+        and try to reopen them. Note: refresh_version is currently
+        changed only during FLUSH TABLES.
+      */
       release_table_share(share);
       mysql_mutex_unlock(&LOCK_open);
-      (void) ot_ctx->request_backoff_action(Open_table_context::OT_WAIT_TDC,
-                                            NULL);
+      (void)ot_ctx->request_backoff_action(Open_table_context::OT_REOPEN_TABLES,
+                                           NULL);
       DBUG_RETURN(TRUE);
     }
   }
@@ -3831,7 +3897,7 @@ request_backoff_action(enum_open_table_action action_arg,
       Since there is no way to detect such a deadlock, we prevent
       it by reporting an error.
   */
-  if (m_has_locks)
+  if (action_arg != OT_REOPEN_TABLES && m_has_locks)
   {
     my_error(ER_LOCK_DEADLOCK, MYF(0));
     return TRUE;
@@ -3877,11 +3943,9 @@ recover_from_failed_open(THD *thd)
   /* Execute the action. */
   switch (m_action)
   {
-    case OT_MDL_CONFLICT:
+    case OT_CONFLICT:
       break;
-    case OT_WAIT_TDC:
-      result= tdc_wait_for_old_versions(thd, &m_mdl_requests, get_timeout());
-      DBUG_ASSERT(thd->mysys_var->current_mutex == NULL);
+    case OT_REOPEN_TABLES:
       break;
     case OT_DISCOVER:
       {
@@ -3921,8 +3985,6 @@ recover_from_failed_open(THD *thd)
     default:
       DBUG_ASSERT(0);
   }
-  /* Remove all old requests, they will be re-added. */
-  m_mdl_requests.empty();
   /*
     Reset the pointers to conflicting MDL request and the
     TABLE_LIST element, set when we need auto-discovery or repair,
@@ -4043,8 +4105,6 @@ open_and_process_routine(THD *thd, Query_tables_list *prelocking_ctx,
       if (rt != (Sroutine_hash_entry*)prelocking_ctx->sroutines_list.first ||
           mdl_type != MDL_key::PROCEDURE)
       {
-        ot_ctx->add_request(&rt->mdl_request);
-
         /*
           Since we acquire only shared lock on routines we don't
           need to care about global intention exclusive locks.
@@ -4721,6 +4781,8 @@ restart:
         }
         goto err;
       }
+
+      DEBUG_SYNC(thd, "open_tables_after_open_and_process_table");
     }
 
     /*
@@ -8595,17 +8657,6 @@ bool mysql_notify_thread_having_shared_lock(THD *thd, THD *in_use,
     }
     mysql_mutex_unlock(&in_use->LOCK_thd_data);
   }
-  /*
-    Wake up threads waiting in tdc_wait_for_old_versions().
-    Normally such threads would already get blocked
-    in MDL subsystem, when trying to acquire a shared lock.
-    But in case a thread has an open HANDLER statement,
-    (and thus already grabbed a metadata lock), it gets
-    blocked only too late -- at the table cache level.
-    Starting from 5.5, this could also easily happen in
-    a multi-statement transaction.
-  */
-  broadcast_refresh();
   return signalled;
 }
 
@@ -8680,6 +8731,13 @@ void tdc_remove_table(THD *thd, enum_tdc_remove_table_type remove_type,
       /*
         Set share's version to zero in order to ensure that it gets
         automatically deleted once it is no longer referenced.
+
+        Note that code in TABLE_SHARE::wait_until_flushed() assumes
+        that marking share as old and removal of its unused tables
+        and of the share itself from TDC happens atomically under
+        protection of LOCK_open, or, putting it another way, that
+        TDC does not contain old shares which don't have any tables
+        used.
       */
       share->version= 0;
 
@@ -8692,84 +8750,6 @@ void tdc_remove_table(THD *thd, enum_tdc_remove_table_type remove_type,
 }
 
 
-/**
-   Wait until there are no old versions of tables in the table
-   definition cache for the metadata locks that we try to acquire.
-
-   @param thd      Thread context
-   @param context  Metadata locking context with locks.
-   @param timeout  Seconds to wait before reporting ER_LOCK_WAIT_TIMEOUT.
-*/
-
-static bool
-tdc_wait_for_old_versions(THD *thd, MDL_request_list *mdl_requests,
-                          ulong timeout)
-{
-  TABLE_SHARE *share;
-  const char *old_msg;
-  MDL_request *mdl_request;
-  struct timespec abstime;
-  set_timespec(abstime, timeout);
-  int wait_result= 0;
-
-  while (!thd->killed)
-  {
-    /*
-      We have to get rid of HANDLERs which are open by this thread
-      and have old TABLE versions. Otherwise we might get a deadlock
-      in situation when we are waiting for an old TABLE object which
-      corresponds to a HANDLER open by another session. And this
-      other session waits for our HANDLER object to get closed.
-
-      TODO: We should also investigate in which situations we have
-            to broadcast on COND_refresh because of this.
-    */
-    mysql_ha_flush(thd);
-
-    mysql_mutex_lock(&LOCK_open);
-
-    MDL_request_list::Iterator it(*mdl_requests);
-    while ((mdl_request= it++))
-    {
-      /* Skip requests on non-TDC objects. */
-      if (mdl_request->key.mdl_namespace() != MDL_key::TABLE)
-        continue;
-
-      if ((share= get_cached_table_share(mdl_request->key.db_name(),
-                                         mdl_request->key.name())) &&
-          share->needs_reopen())
-        break;
-    }
-    if (!mdl_request)
-    {
-      /*
-        Reset wait_result here in case this was the final check
-        after getting a timeout from mysql_cond_timedwait().
-      */
-      wait_result= 0;
-      mysql_mutex_unlock(&LOCK_open);
-      break;
-    }
-    if (wait_result == ETIMEDOUT || wait_result == ETIME)
-    {
-      /*
-        Test for timeout here instead of right after mysql_cond_timedwait().
-        This allows for a final iteration and a final check before reporting
-        ER_LOCK_WAIT_TIMEOUT.
-      */
-      mysql_mutex_unlock(&LOCK_open);
-      my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
-      break;
-    }
-    old_msg= thd->enter_cond(&COND_refresh, &LOCK_open, "Waiting for table");
-    wait_result= mysql_cond_timedwait(&COND_refresh, &LOCK_open, &abstime);
-    /* LOCK_open mutex is unlocked by THD::exit_cond() as side-effect. */
-    thd->exit_cond(old_msg);
-  }
-  return thd->killed || wait_result == ETIMEDOUT || wait_result == ETIME;
-}
-
-
 int setup_ftfuncs(SELECT_LEX *select_lex)
 {
   List_iterator<Item_func_match> li(*(select_lex->ftfunc_list)),
diff --git a/sql/sql_base.h b/sql/sql_base.h
index b912f80d44f..7d13b69e063 100644
--- a/sql/sql_base.h
+++ b/sql/sql_base.h
@@ -250,7 +250,7 @@ TABLE *open_performance_schema_table(THD *thd, TABLE_LIST *one_table,
 void close_performance_schema_table(THD *thd, Open_tables_state *backup);
 
 bool close_cached_tables(THD *thd, TABLE_LIST *tables, bool have_lock,
-                         bool wait_for_refresh);
+                         bool wait_for_refresh, ulong timeout);
 bool close_cached_connection_tables(THD *thd, bool wait_for_refresh,
                                     LEX_STRING *connect_string,
                                     bool have_lock = FALSE);
@@ -454,8 +454,8 @@ public:
   enum enum_open_table_action
   {
     OT_NO_ACTION= 0,
-    OT_MDL_CONFLICT,
-    OT_WAIT_TDC,
+    OT_CONFLICT,
+    OT_REOPEN_TABLES,
     OT_DISCOVER,
     OT_REPAIR
   };
@@ -465,9 +465,6 @@ public:
   bool request_backoff_action(enum_open_table_action action_arg,
                               TABLE_LIST *table);
 
-  void add_request(MDL_request *request)
-  { m_mdl_requests.push_front(request); }
-
   bool can_recover_from_failed_open() const
   { return m_action != OT_NO_ACTION; }
 
@@ -489,8 +486,6 @@ public:
 
   uint get_flags() const { return m_flags; }
 private:
-  /** List of requests for all locks taken so far. Used for waiting on locks. */
-  MDL_request_list m_mdl_requests;
   /**
     For OT_DISCOVER and OT_REPAIR actions, the table list element for
     the table which definition should be re-discovered or which
diff --git a/sql/sql_class.h b/sql/sql_class.h
index c095fee6232..e96c3f8dd26 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -2302,6 +2302,12 @@ public:
   {
     const char* old_msg = proc_info;
     mysql_mutex_assert_owner(mutex);
+    /*
+      This method should not be called with LOCK_open mutex as an
+      argument. Otherwise deadlocks can arise in MDL deadlock detector.
+      @sa TABLE_SHARE::find_deadlock().
+    */
+    DBUG_ASSERT(mutex != &LOCK_open);
     mysys_var->current_mutex = mutex;
     mysys_var->current_cond = cond;
     proc_info = msg;
diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc
index 53c2ca6fa39..7cb27ff4916 100644
--- a/sql/sql_parse.cc
+++ b/sql/sql_parse.cc
@@ -1756,6 +1756,7 @@ static bool flush_tables_with_read_lock(THD *thd, TABLE_LIST *all_tables)
 {
   Lock_tables_prelocking_strategy lock_tables_prelocking_strategy;
   TABLE_LIST *table_list;
+  MDL_request_list mdl_requests;
 
   /*
     This is called from SQLCOM_FLUSH, the transaction has
@@ -1774,23 +1775,27 @@ static bool flush_tables_with_read_lock(THD *thd, TABLE_LIST *all_tables)
   }
 
   /*
-    @todo: Since lock_table_names() acquires a global IX
-    lock, this actually waits for a GRL in another connection.
-    We are thus introducing an incompatibility.
-    Do nothing for now, since not taking a global IX violates
-    current internal MDL asserts, fix after discussing with
-    Dmitry.
+    Acquire SNW locks on tables to be flushed. We can't use
+    lock_table_names() here as this call will also acquire global IX
+    and database-scope IX locks on the tables, and this will make
+    this statement incompatible with FLUSH TABLES WITH READ LOCK.
   */
-  if (lock_table_names(thd, all_tables, 0, thd->variables.lock_wait_timeout,
-                       MYSQL_OPEN_SKIP_TEMPORARY))
+  for (table_list= all_tables; table_list;
+       table_list= table_list->next_global)
+    mdl_requests.push_front(&table_list->mdl_request);
+
+  if (thd->mdl_context.acquire_locks(&mdl_requests,
+                                     thd->variables.lock_wait_timeout))
     goto error;
 
+  DEBUG_SYNC(thd,"flush_tables_with_read_lock_after_acquire_locks");
+
   for (table_list= all_tables; table_list;
        table_list= table_list->next_global)
   {
-    /* Remove the table from cache. */
+    /* Request removal of table from cache. */
     mysql_mutex_lock(&LOCK_open);
-    tdc_remove_table(thd, TDC_RT_REMOVE_ALL,
+    tdc_remove_table(thd, TDC_RT_REMOVE_UNUSED,
                      table_list->db,
                      table_list->table_name);
     mysql_mutex_unlock(&LOCK_open);
@@ -1800,6 +1805,11 @@ static bool flush_tables_with_read_lock(THD *thd, TABLE_LIST *all_tables)
     table_list->open_type= OT_BASE_ONLY;      /* Ignore temporary tables. */
   }
 
+  /*
+    Before opening and locking tables the below call also waits for old
+    shares to go away, so the fact that we don't pass MYSQL_LOCK_IGNORE_FLUSH
+    flag to it is important.
+  */
   if  (open_and_lock_tables(thd, all_tables, FALSE,
                             MYSQL_OPEN_HAS_MDL_LOCK,
                             &lock_tables_prelocking_strategy) ||
@@ -1810,17 +1820,11 @@ static bool flush_tables_with_read_lock(THD *thd, TABLE_LIST *all_tables)
   thd->variables.option_bits|= OPTION_TABLE_LOCK;
 
   /*
-    Downgrade the exclusive locks.
-    Use MDL_SHARED_NO_WRITE as the intended
-    post effect of this call is identical
-    to LOCK TABLES <...> READ, and we didn't use
-    thd->in_lock_talbes and thd->sql_command= SQLCOM_LOCK_TABLES
-    hacks to enter the LTM.
-    @todo: release the global IX lock here!!!
+    We don't downgrade MDL_SHARED_NO_WRITE here as the intended
+    post effect of this call is identical to LOCK TABLES <...> READ,
+    and we didn't use thd->in_lock_tables and
+    thd->sql_command= SQLCOM_LOCK_TABLES hacks to enter the LTM.
   */
-  for (table_list= all_tables; table_list;
-       table_list= table_list->next_global)
-    table_list->mdl_request.ticket->downgrade_exclusive_lock(MDL_SHARED_NO_WRITE);
 
   return FALSE;
 
@@ -6854,8 +6858,8 @@ bool reload_acl_and_cache(THD *thd, ulong options, TABLE_LIST *tables,
       tmp_write_to_binlog= 0;
       if (thd->global_read_lock.lock_global_read_lock(thd))
 	return 1;                               // Killed
-      if (close_cached_tables(thd, tables, FALSE, (options & REFRESH_FAST) ?
-                              FALSE : TRUE))
+      if (close_cached_tables(thd, tables, FALSE, ((options & REFRESH_FAST) ?
+                              FALSE : TRUE), thd->variables.lock_wait_timeout))
           result= 1;
       
       if (thd->global_read_lock.make_global_read_lock_block_commit(thd)) // Killed
@@ -6894,8 +6898,10 @@ bool reload_acl_and_cache(THD *thd, ulong options, TABLE_LIST *tables,
         }
       }
 
-      if (close_cached_tables(thd, tables, FALSE, (options & REFRESH_FAST) ?
-                              FALSE : TRUE))
+      if (close_cached_tables(thd, tables, FALSE, ((options & REFRESH_FAST) ?
+                              FALSE : TRUE),
+                              (thd ? thd->variables.lock_wait_timeout :
+                                     LONG_TIMEOUT)))
         result= 1;
     }
     my_dbopt_cleanup();
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
index ca951897055..3341ffc7a30 100644
--- a/sql/sql_yacc.yy
+++ b/sql/sql_yacc.yy
@@ -11202,9 +11202,8 @@ opt_with_read_lock:
           {
             TABLE_LIST *tables= Lex->query_tables;
             Lex->type|= REFRESH_READ_LOCK;
-            /* We acquire an X lock currently and then downgrade. */
             for (; tables; tables= tables->next_global)
-              tables->mdl_request.set_type(MDL_EXCLUSIVE);
+              tables->mdl_request.set_type(MDL_SHARED_NO_WRITE);
           }
         ;
 
diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc
index 9e212fb95e9..cf185db0b7a 100644
--- a/sql/sys_vars.cc
+++ b/sql/sys_vars.cc
@@ -1488,7 +1488,8 @@ static bool fix_read_only(sys_var *self, THD *thd, enum_var_type type)
     can cause to wait on a read lock, it's required for the client application
     to unlock everything, and acceptable for the server to wait on all locks.
   */
-  if ((result= close_cached_tables(thd, NULL, FALSE, TRUE)))
+  if ((result= close_cached_tables(thd, NULL, FALSE, TRUE,
+                                   thd->variables.lock_wait_timeout)))
     goto end_with_read_lock;
 
   if ((result= thd->global_read_lock.make_global_read_lock_block_commit(thd)))
diff --git a/sql/table.cc b/sql/table.cc
index a58623f0036..a8e1caa271a 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -34,6 +34,7 @@
 #include <m_ctype.h>
 #include "my_md5.h"
 #include "sql_select.h"
+#include "mdl.h"                 // Deadlock_detection_visitor
 
 /* INFORMATION_SCHEMA name */
 LEX_STRING INFORMATION_SCHEMA_NAME= {C_STRING_WITH_LEN("information_schema")};
@@ -325,6 +326,7 @@ TABLE_SHARE *alloc_table_share(TABLE_LIST *table_list, char *key,
 
     share->used_tables.empty();
     share->free_tables.empty();
+    share->m_flush_tickets.empty();
 
     memcpy((char*) &share->mem_root, (char*) &mem_root, sizeof(mem_root));
     mysql_mutex_init(key_TABLE_SHARE_LOCK_ha_data,
@@ -389,6 +391,7 @@ void init_tmp_table_share(THD *thd, TABLE_SHARE *share, const char *key,
 
   share->used_tables.empty();
   share->free_tables.empty();
+  share->m_flush_tickets.empty();
 
   DBUG_VOID_RETURN;
 }
@@ -432,9 +435,40 @@ void free_table_share(TABLE_SHARE *share)
       key_info->flags= 0;
     }
   }
-  /* We must copy mem_root from share because share is allocated through it */
-  memcpy((char*) &mem_root, (char*) &share->mem_root, sizeof(mem_root));
-  free_root(&mem_root, MYF(0));                 // Free's share
+
+  if (share->m_flush_tickets.is_empty())
+  {
+    /*
+      There are no threads waiting for this share to be flushed. So
+      we can immediately release memory associated with it. We must
+      copy mem_root from share because share is allocated through it.
+    */
+    memcpy((char*) &mem_root, (char*) &share->mem_root, sizeof(mem_root));
+    free_root(&mem_root, MYF(0));                 // Free's share
+  }
+  else
+  {
+    /*
+      If there are threads waiting for this share to be flushed we
+      don't free share memory here. Instead we notify waiting threads
+      and delegate freeing share's memory to them.
+      At this point a) all resources except memory associated with share
+      were already released b) share should have been already removed
+      from table definition cache. So it is OK to proceed without waiting
+      for these threads to finish their work.
+    */
+    Flush_ticket_list::Iterator it(share->m_flush_tickets);
+    Flush_ticket *ticket;
+
+    /*
+      To avoid problems due to threads being woken up concurrently modifying
+      flush ticket list we must hold LOCK_open here.
+    */
+    mysql_mutex_assert_owner(&LOCK_open);
+
+    while ((ticket= it++))
+      (void) ticket->get_ctx()->m_wait.set_status(MDL_wait::GRANTED);
+  }
   DBUG_VOID_RETURN;
 }
 
@@ -2996,6 +3030,223 @@ Table_check_intact::check(TABLE *table, const TABLE_FIELD_DEF *table_def)
 }
 
 
+/**
+  Traverse the portion of the wait-for graph reachable through the edge
+  represented by this flush ticket, delegating to the awaited share.
+
+  @param dvisitor  Deadlock detection visitor.
+  @retval TRUE   A deadlock is found. A victim is remembered by the visitor.
+  @retval FALSE  The share's traversal reported no deadlock.
+*/
+
+bool Flush_ticket::find_deadlock(Deadlock_detection_visitor *dvisitor)
+{
+  return m_share->find_deadlock(this, dvisitor);
+}
+
+/** Return the deadlock weight this ticket was constructed with. */
+uint Flush_ticket::get_deadlock_weight() const
+{
+  return m_deadlock_weight;
+}
+
+
+/**
+  Traverse portion of wait-for graph which is reachable through this
+  table share in search for deadlocks.
+
+  @param waiting_ticket  Ticket representing wait for this share.
+  @param dvisitor        Deadlock detection visitor.
+
+  @retval TRUE  A deadlock is found. A victim is remembered
+                by the visitor.
+  @retval FALSE No deadlock found, or waiter's wait status is not EMPTY.
+*/
+
+bool TABLE_SHARE::find_deadlock(Flush_ticket *waiting_ticket,
+                                Deadlock_detection_visitor *dvisitor)
+{
+  TABLE *table;
+  MDL_context *src_ctx= waiting_ticket->get_ctx();
+  bool result= TRUE;
+
+  /*
+    To protect used_tables list from being concurrently modified while we
+    are iterating through it we acquire LOCK_open. This should not introduce
+    deadlocks in deadlock detector because we support recursive acquiring of
+    such mutex and also because we won't try to acquire LOCK_open mutex while
+    holding write-lock on MDL_lock::m_rwlock.
+
+    Here is the more elaborate proof:
+
+    0) Let us assume that there is a deadlock.
+    1) Wait graph (the one which reflects waits for system synchronization
+       primitives and not the one which is inspected by MDL deadlock detector)
+       for this deadlock should contain loop including both LOCK_open and
+       some of MDL synchronization primitives. Otherwise deadlock would have
+       already existed before we introduced acquiring of LOCK_open in
+       MDL deadlock detector.
+    2) Also in this graph edge going out of LOCK_open node should go to one
+       of MDL synchronization primitives. Different situation would mean that
+       we have some non-MDL synchronization primitive besides LOCK_open under
+       which we try to acquire MDL lock, which is not the case.
+    3) Moreover edge coming from LOCK_open should go to MDL_lock::m_rwlock
+       object and correspond to request for read-lock. It can't be request
+       for rwlock in MDL_context or mutex in MDL_wait object because they
+       are terminal (i.e. thread having them locked in exclusive mode won't
+       wait for any other resource). It can't be request for write-lock on
+       MDL_lock::m_rwlock as this would mean that we try to acquire metadata
+       lock under LOCK_open (which is not the case).
+    4) Since MDL_lock::m_rwlock is rwlock which prefers readers the only
+       situation when it can be waited for is when some thread has it
+       write-locked.
+    5) TODO/FIXME:
+       - Either prove that thread having MDL_lock::m_rwlock write-locked won't
+         wait for LOCK_open directly or indirectly (see notify_shared_lock()).
+       - Or change code to hold only read-lock on MDL_lock::m_rwlock during
+         notify_shared_lock() and thus make MDL_lock::m_rwlock terminal when
+         write-locked.
+  */
+  if (! (dvisitor->m_table_shares_visited++))
+    mysql_mutex_lock(&LOCK_open);
+
+  I_P_List_iterator <TABLE, TABLE_share> tables_it(used_tables);
+
+  /* Wait status already set, so skip traversal. Not strictly necessary ? */
+  if (src_ctx->m_wait.get_status() != MDL_wait::EMPTY)
+  {
+    result= FALSE;
+    goto end;
+  }
+
+  if (dvisitor->enter_node(src_ctx))
+    goto end;
+
+  while ((table= tables_it++))
+  {
+    if (dvisitor->inspect_edge(&table->in_use->mdl_context))
+    {
+      goto end_leave_node;
+    }
+  }
+
+  tables_it.rewind();
+  while ((table= tables_it++))
+  {
+    if (table->in_use->mdl_context.find_deadlock(dvisitor))
+    {
+      goto end_leave_node;
+    }
+  }
+
+  result= FALSE;
+
+end_leave_node:
+  dvisitor->leave_node(src_ctx);
+
+end:
+  if (! (--dvisitor->m_table_shares_visited))
+    mysql_mutex_unlock(&LOCK_open);
+
+  return result;
+}
+
+
+/**
+  Wait until old version of table share is removed from TDC.
+
+  @param mdl_context     MDL context for thread which is going to wait.
+  @param abstime         Timeout for waiting as absolute time value.
+  @param deadlock_weight Weight of this wait for deadlock detector.
+
+  @note This method assumes that its caller owns LOCK_open mutex.
+        It is released only while waiting and is held again on any return.
+
+  @retval FALSE - Success.
+  @retval TRUE  - Error (OOM, deadlock, timeout, etc...).
+*/
+
+bool TABLE_SHARE::wait_until_flushed(MDL_context *mdl_context,
+                                     struct timespec *abstime,
+                                     uint deadlock_weight)
+{
+  Flush_ticket *ticket;
+  MDL_wait::enum_wait_status wait_status;
+
+  mysql_mutex_assert_owner(&LOCK_open);
+
+  /*
+    We should enter this method only when share's version is not
+    up to date and the share is referenced. Otherwise there is
+    no guarantee that our thread will be woken up from wait.
+  */
+  DBUG_ASSERT(version != refresh_version && ref_count != 0);
+
+  if (! (ticket= new Flush_ticket(mdl_context, this, deadlock_weight)))
+  {
+    /* Keep LOCK_open held: callers release it on any error return. */
+    return TRUE;
+  }
+
+  m_flush_tickets.push_front(ticket);
+
+  mdl_context->m_wait.reset_status();
+
+  mysql_mutex_unlock(&LOCK_open);
+
+  mdl_context->will_wait_for(ticket);
+
+  mdl_context->find_deadlock();
+
+  wait_status= mdl_context->m_wait.timed_wait(mdl_context->get_thd(),
+                                              abstime, TRUE);
+
+  mdl_context->done_waiting_for();
+
+  mysql_mutex_lock(&LOCK_open);
+
+  m_flush_tickets.remove(ticket);
+
+  /*
+    If our thread was the last one waiting for table share to be flushed
+    we can finish destruction of share object by releasing its memory
+    (share object was allocated on share's own MEM_ROOT).
+
+    In cases when our wait was aborted due to KILL statement, deadlock or
+    timeout share still might be referenced, so we don't free its memory
+    in this case. Note that we can't rely on checking wait_status to
+    determine this condition as, for example, timeout can happen even
+    when there are no references to table share so memory should be
+    released.
+  */
+  if (m_flush_tickets.is_empty() && ! ref_count)
+  {
+    MEM_ROOT mem_root_copy;
+    memcpy((char*) &mem_root_copy, (char*) &mem_root, sizeof(mem_root));
+    free_root(&mem_root_copy, MYF(0));
+  }
+
+  delete ticket;
+
+  switch (wait_status)
+  {
+  case MDL_wait::GRANTED:
+    return FALSE;
+  case MDL_wait::VICTIM:
+    my_error(ER_LOCK_DEADLOCK, MYF(0));
+    return TRUE;
+  case MDL_wait::TIMEOUT:
+    my_error(ER_LOCK_WAIT_TIMEOUT, MYF(0));
+    return TRUE;
+  case MDL_wait::KILLED:
+    return TRUE;
+  default:
+    DBUG_ASSERT(0);
+    return TRUE;
+  }
+}
+
+
 /*
   Create Item_field for each column in the table.
 
diff --git a/sql/table.h b/sql/table.h
index 2bf390aee4d..46015f4425a 100644
--- a/sql/table.h
+++ b/sql/table.h
@@ -45,6 +45,7 @@ class ACL_internal_schema_access;
 class ACL_internal_table_access;
 struct TABLE_LIST;
 class Field;
+class Deadlock_detection_visitor;
 
 /*
   Used to identify NESTED_JOIN structures within a join (applicable only to
@@ -508,6 +509,45 @@ public:
 };
 
 
+/**
+  Represents the fact that a thread is waiting for a table share to
+  be flushed. Being a Wait_for_edge, it carries information about
+  such waits to the MDL deadlock detector.
+*/
+
+class Flush_ticket : public Wait_for_edge
+{
+  MDL_context *m_ctx;       /* MDL context of the waiting thread. */
+  TABLE_SHARE *m_share;     /* Share whose flush is being waited for. */
+  uint m_deadlock_weight;   /* Weight of this wait for deadlock detector. */
+public:
+  Flush_ticket(MDL_context *ctx_arg, TABLE_SHARE *share_arg,
+               uint deadlock_weight_arg)
+    : m_ctx(ctx_arg), m_share(share_arg),
+      m_deadlock_weight(deadlock_weight_arg)
+  {}
+
+  MDL_context *get_ctx() const { return m_ctx; }
+
+  bool find_deadlock(Deadlock_detection_visitor *dvisitor);
+
+  uint get_deadlock_weight() const;
+
+  /**
+    Pointers for participating in the list of waiters for table share.
+  */
+  Flush_ticket *next_in_share;
+  Flush_ticket **prev_in_share;
+};
+
+
+typedef I_P_List <Flush_ticket,
+                  I_P_List_adapter<Flush_ticket,
+                                   &Flush_ticket::next_in_share,
+                                   &Flush_ticket::prev_in_share> >
+                 Flush_ticket_list;
+
+
 /*
   This structure is shared between different table objects. There is one
   instance of table share per one table in the database.
@@ -662,6 +702,11 @@ struct TABLE_SHARE
   /** Instrumentation for this table share. */
   PSI_table_share *m_psi;
 
+  /**
+    List of tickets representing threads waiting for the share to be flushed.
+  */
+  Flush_ticket_list m_flush_tickets;
+
   /*
     Set share's table cache key and update its db and table name appropriately.
 
@@ -837,6 +882,12 @@ struct TABLE_SHARE
     return (tmp_table == SYSTEM_TMP_TABLE || is_view) ? 0 : table_map_id;
   }
 
+  bool find_deadlock(Flush_ticket *waiting_ticket,
+                     Deadlock_detection_visitor *dvisitor);
+
+  bool wait_until_flushed(MDL_context *mdl_context,
+                          struct timespec *abstime,
+                          uint deadlock_weight);
 };
author	Dmitry Lenev <dlenev@mysql.com>	2010-07-27 17:34:58 +0400
committer	Dmitry Lenev <dlenev@mysql.com>	2010-07-27 17:34:58 +0400
commit	5fff906edd3d7a5d999cec5403f009f33f8dfb81 (patch)
tree	bef62913efddc244d466f7ff730bcc4205357491 /sql
parent	ec2c3bf2c1c27e4401c767a6cdcb3172453ff42c (diff)
download	mariadb-git-5fff906edd3d7a5d999cec5403f009f33f8dfb81.tar.gz