author     sensssz <hjmsens@gmail.com>    2016-10-17 21:56:05 -0400
committer  sensssz <hjmsens@gmail.com>    2016-10-19 01:42:10 -0400
commit     c455898b2c9aaaa9da2def86523f8a9c5016f49f
tree       29505f8dc123d41b6703e1d7944bda47ff99bd86
parent     8303aded294ce905bbc513e7ee42623d5f1fdb50
download   mariadb-git-c455898b2c9aaaa9da2def86523f8a9c5016f49f.tar.gz
Implement VATS in XtraDB and InnoDB.
-rw-r--r--  sql/sql_class.cc                        |   1
-rw-r--r--  sql/sql_class.h                         |   2
-rw-r--r--  storage/innobase/handler/ha_innodb.cc   |  29
-rw-r--r--  storage/innobase/include/lock0lock.h    |  11
-rw-r--r--  storage/innobase/lock/lock0lock.cc      | 241
-rw-r--r--  storage/xtradb/handler/ha_innodb.cc     |  29
-rw-r--r--  storage/xtradb/include/lock0lock.h      |  11
-rw-r--r--  storage/xtradb/lock/lock0lock.cc        | 194
8 files changed, 484 insertions(+), 34 deletions(-)
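
The patch adds a new system variable, innodb_lock_schedule_algorithm, with two values: fcfs keeps the existing First-Come-First-Served grant order, while vats enables Variance-Aware Transaction Scheduling, under which waiting record-lock requests are ordered so that the eldest transaction (smallest trx->start_time) is granted first. As a rough orientation before the diff, here is a minimal standalone sketch of that eldest-transaction-first ordering; the WaitingLock type and its fields are simplified stand-ins for the lock_t/trx_t structures used in the patch, not actual InnoDB code.

#include <algorithm>
#include <cstdio>
#include <vector>

struct WaitingLock {
    int  trx_id;      // identifies the waiting transaction
    long start_time;  // when that transaction started; smaller means older
};

// VATS idea in one line: the eldest (smallest start_time) transaction wins.
static bool older_first(const WaitingLock& a, const WaitingLock& b)
{
    return a.start_time < b.start_time;
}

int main()
{
    std::vector<WaitingLock> queue = {{3, 300}, {1, 100}, {2, 200}};
    std::stable_sort(queue.begin(), queue.end(), older_first);

    for (const WaitingLock& w : queue) {
        std::printf("grant next: trx %d (start_time %ld)\n",
                    w.trx_id, w.start_time);
    }
    return 0;
}
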
diff --git a/sql/sql_class.cc b/sql/sql_class.cc
index 1af3b9a9cca..af0a369a23c 100644
--- a/sql/sql_class.cc
+++ b/sql/sql_class.cc
@@ -72,6 +72,7 @@
#include <sys/syscall.h>
#endif
+bool is_slave_replication = false;
/*
The following is used to initialise Table_ident with an internal
table name
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 994a161a646..a068143f7c5 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -152,6 +152,8 @@ extern MYSQL_PLUGIN_IMPORT const char **errmesg;
extern bool volatile shutdown_in_progress;
+extern bool is_slave_replication;
+
extern "C" LEX_STRING * thd_query_string (MYSQL_THD thd);
extern "C" char **thd_query(MYSQL_THD thd);
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 9cafb9c723a..a6b440b40ca 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -379,6 +379,22 @@ static TYPELIB innodb_default_row_format_typelib = {
NULL
};
+/** Possible values of the parameter innodb_lock_schedule_algorithm */
+static const char* innodb_lock_schedule_algorithm_names[] = {
+ "fcfs",
+ "vats",
+ NullS
+};
+
+/** Used to define the enumerated type of the system variable
+innodb_lock_schedule_algorithm. */
+static TYPELIB innodb_lock_schedule_algorithm_typelib = {
+ array_elements(innodb_lock_schedule_algorithm_names) - 1,
+ "innodb_lock_schedule_algorithm_typelib",
+ innodb_lock_schedule_algorithm_names,
+ NULL
+};
+
/* The following counter is used to convey information to InnoDB
about server activity: in case of normal DML ops it is not
sensible to call srv_active_wake_master_thread after each
@@ -22783,6 +22799,18 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
NULL, NULL, 120, 1, 127, 0);
#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
+static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm,
+ PLUGIN_VAR_RQCMDARG,
+ "The algorithm Innodb uses for deciding which locks to grant next when"
+ " a lock is released. Possible values are"
+ " FCFS"
+ " grant the locks in First-Come-First-Served order;"
+ " VATS"
+ " use the Variance-Aware-Transaction-Scheduling algorithm, which"
+ " uses an Eldest-Transaction-First heuristic.",
+ NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
+ &innodb_lock_schedule_algorithm_typelib);
+
static MYSQL_SYSVAR_ULONG(buffer_pool_instances, srv_buf_pool_instances,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
@@ -23656,6 +23684,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(ft_sort_pll_degree),
MYSQL_SYSVAR(large_prefix),
MYSQL_SYSVAR(force_load_corrupted),
+ MYSQL_SYSVAR(lock_schedule_algorithm),
MYSQL_SYSVAR(locks_unsafe_for_binlog),
MYSQL_SYSVAR(lock_wait_timeout),
MYSQL_SYSVAR(page_size),
diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h
index eb554e02bc0..54892fb197c 100644
--- a/storage/innobase/include/lock0lock.h
+++ b/storage/innobase/include/lock0lock.h
@@ -40,6 +40,17 @@ Created 5/7/1996 Heikki Tuuri
#include "gis0rtree.h"
#include "lock0prdt.h"
+/** Possible values of innodb_lock_schedule_algorithm, the algorithm InnoDB
+ uses to decide which waiting locks to grant when a lock is released. */
+enum innodb_lock_schedule_algorithm_t {
+ /*!< First Come First Served */
+ INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
+ /*!< Variance-Aware-Transaction-Scheduling */
+ INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+};
+
+extern ulong innodb_lock_schedule_algorithm;
+
// Forward declaration
class ReadView;
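
The enum above is backed by a plain ulong (declared extern here and defined in lock0lock.cc), and the MYSQL_SYSVAR_ENUM in ha_innodb.cc stores directly into that ulong, so the lock system can branch on the selected algorithm cheaply in its hot paths. A minimal self-contained sketch of that dispatch; everything except the enum and the global is illustrative, not from the patch.

#include <cstdio>

typedef unsigned long ulong;

// Mirrors the declarations above.
enum innodb_lock_schedule_algorithm_t {
    INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,  // First Come First Served
    INNODB_LOCK_SCHEDULE_ALGORITHM_VATS   // Variance-Aware Transaction Scheduling
};

ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS;

// Hypothetical stand-in for the branch taken in lock_rec_dequeue_from_page().
static void dequeue_and_grant()
{
    if (innodb_lock_schedule_algorithm
        == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) {
        std::printf("FCFS: grant waiters in arrival order\n");
    } else {
        std::printf("VATS: grant waiters, eldest transaction first\n");
    }
}

int main()
{
    dequeue_and_grant();
    innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_VATS;
    dequeue_and_grant();
    return 0;
}
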
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index e3299e2d74b..7db94b143be 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -30,6 +30,7 @@ Created 5/7/1996 Heikki Tuuri
#include "ha_prototypes.h"
#include <mysql/service_thd_error_context.h>
+#include <sql_class.h>
#include "lock0lock.h"
#include "lock0priv.h"
@@ -58,6 +59,9 @@ Created 5/7/1996 Heikki Tuuri
#include <mysql/service_wsrep.h>
#endif /* WITH_WSREP */
+/** Lock scheduling algorithm */
+ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS;
+
/** Total number of cached record locks */
static const ulint REC_LOCK_CACHE = 8;
@@ -70,10 +74,29 @@ static const ulint TABLE_LOCK_CACHE = 8;
/** Size in bytes, of the table lock instance */
static const ulint TABLE_LOCK_SIZE = sizeof(ib_lock_t);
+/*********************************************************************//**
+Checks if a waiting record lock request still has to wait in a queue.
+@return lock that is causing the wait */
+static
+const lock_t*
+lock_rec_has_to_wait_in_queue(
+/*==========================*/
+ const lock_t* wait_lock); /*!< in: waiting record lock */
+
+/* Buffer to collect THDs to report waits for. */
+struct thd_wait_reports {
+ struct thd_wait_reports *next; /*!< List link */
+ ulint used; /*!< How many elements in waitees[] */
+ trx_t *waitees[64]; /*!< Trxs for thd_report_wait_for() */
+};
+
extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
extern "C" int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
+extern "C" int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
+
+
/** Deadlock checker. */
class DeadlockChecker {
public:
@@ -1869,6 +1892,85 @@ RecLock::create(
return(lock);
}
+/*********************************************************************//**
+Check if lock1 has higher priority than lock2.
+NULL has lowest priority.
+If the server layer prefers one of the transactions as a deadlock victim,
+the other lock has higher priority.
+If neither of them is a wait lock, the first one has higher priority.
+If only one of them is a wait lock, it has lower priority.
+Otherwise, the one with an older transaction has higher priority.
+@returns true if lock1 has higher priority, false otherwise. */
+bool
+has_higher_priority(
+ lock_t *lock1,
+ lock_t *lock2)
+{
+ if (lock1 == NULL) {
+ return false;
+ } else if (lock2 == NULL) {
+ return true;
+ }
+ // Ask the upper server layer if either of the two trx should be preferred.
+ int preference = thd_deadlock_victim_preference(lock1->trx->mysql_thd, lock2->trx->mysql_thd);
+ if (preference == -1) {
+ // lock1 is preferred as a victim, so lock2 has higher priority
+ return false;
+ } else if (preference == 1) {
+ // lock2 is preferred as a victim, so lock1 has higher priority
+ return true;
+ }
+ // No preference. Compare them by wait mode and trx age.
+ if (!lock_get_wait(lock1)) {
+ return true;
+ } else if (!lock_get_wait(lock2)) {
+ return false;
+ }
+ return lock1->trx->start_time < lock2->trx->start_time;
+}
+
+/*********************************************************************//**
+Insert a lock into the hash list according to its mode (whether it is a wait
+lock) and the age of the transaction it is associated with.
+If the lock is not a wait lock, insert it at the head of the hash list.
+Otherwise, insert it among the wait locks according to the age of the
+transaction. */
+static
+void
+lock_rec_insert_by_trx_age(
+ lock_t *in_lock, /*!< in: lock to be inserted */
+ bool wait) /*!< in: whether it's a wait lock */
+{
+ ulint space;
+ ulint page_no;
+ ulint rec_fold;
+ hash_table_t* hash;
+ hash_cell_t* cell;
+ lock_t* node;
+ lock_t* next;
+
+ space = in_lock->un_member.rec_lock.space;
+ page_no = in_lock->un_member.rec_lock.page_no;
+ rec_fold = lock_rec_fold(space, page_no);
+ hash = lock_hash_get(in_lock->type_mode);
+ cell = hash_get_nth_cell(hash,
+ hash_calc_hash(rec_fold, hash));
+
+ node = (lock_t *) cell->node;
+ // Insert at the head if the list is empty, in_lock is not a wait lock,
+ // or it has higher priority than the current head.
+ if (node == NULL || !wait || has_higher_priority(in_lock, node)) {
+ cell->node = in_lock;
+ in_lock->hash = node;
+ return;
+ }
+ while (node != NULL && has_higher_priority((lock_t *) node->hash,
+ in_lock)) {
+ node = (lock_t *) node->hash;
+ }
+ next = (lock_t *) node->hash;
+ node->hash = in_lock;
+ in_lock->hash = next;
+}
+
/**
Check the outcome of the deadlock check
@param[in,out] victim_trx Transaction selected for rollback
@@ -1891,7 +1993,23 @@ RecLock::check_deadlock_result(const trx_t* victim_trx, lock_t* lock)
return(DB_DEADLOCK);
- } else if (m_trx->lock.wait_lock == NULL) {
+ }
+
+ // Move it only when it does not cause a deadlock.
+ if (innodb_lock_schedule_algorithm
+ == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+ && !trx_is_high_priority(lock->trx)) {
+
+ HASH_DELETE(lock_t, hash, lock_hash_get(lock->type_mode),
+ m_rec_id.fold(), lock);
+ lock_rec_insert_by_trx_age(lock, m_mode & LOCK_WAIT);
+ if (lock_get_wait(lock) && !lock_rec_has_to_wait_in_queue(lock)) {
+ lock_reset_lock_and_trx_wait(lock);
+ return DB_SUCCESS_LOCKED_REC;
+ }
+ }
+
+ if (m_trx->lock.wait_lock == NULL) {
/* If there was a deadlock but we chose another
transaction as a victim, it is possible that we
@@ -2776,6 +2894,32 @@ lock_rec_cancel(
}
/*************************************************************//**
+Move the lock to the head of the hash list. */
+static
+void
+lock_rec_move_to_front(
+ lock_t *lock_to_move, /*!< in: lock to be moved */
+ ulint rec_fold) /*!< in: rec fold of the lock */
+{
+ hash_table_t* lock_hash;
+ hash_cell_t* cell;
+ lock_t* next;
+
+ if (lock_to_move != NULL)
+ {
+ lock_hash = lock_hash_get(lock_to_move->type_mode);
+ // Move the target lock to the head of the list
+ cell = hash_get_nth_cell(lock_hash,
+ hash_calc_hash(rec_fold, lock_hash));
+ if (lock_to_move != cell->node) {
+ next = (lock_t *) cell->node;
+ cell->node = lock_to_move;
+ lock_to_move->hash = next;
+ }
+ }
+}
+
+/*************************************************************//**
Removes a record lock request, waiting or granted, from the queue and
grants locks to other transactions in the queue if they now are entitled
to a lock. NOTE: all record locks contained in in_lock are removed. */
@@ -2791,6 +2935,8 @@ lock_rec_dequeue_from_page(
{
ulint space;
ulint page_no;
+ ulint rec_fold;
+ lock_t* previous = NULL;
lock_t* lock;
trx_lock_t* trx_lock;
hash_table_t* lock_hash;
@@ -2803,6 +2949,7 @@ lock_rec_dequeue_from_page(
space = in_lock->un_member.rec_lock.space;
page_no = in_lock->un_member.rec_lock.page_no;
+ rec_fold = lock_rec_fold(space, page_no);
ut_ad(in_lock->index->table->n_rec_locks > 0);
in_lock->index->table->n_rec_locks--;
@@ -2817,33 +2964,83 @@ lock_rec_dequeue_from_page(
MONITOR_INC(MONITOR_RECLOCK_REMOVED);
MONITOR_DEC(MONITOR_NUM_RECLOCK);
- /* Check if waiting locks in the queue can now be granted: grant
- locks if there are no conflicting locks ahead. Stop at the first
- X lock that is waiting or has been granted. */
+ if (innodb_lock_schedule_algorithm
+ == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) {
- for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, page_no);
- lock != NULL;
- lock = lock_rec_get_next_on_page(lock)) {
+ /* Check if waiting locks in the queue can now be granted:
+ grant locks if there are no conflicting locks ahead. Stop at
+ the first X lock that is waiting or has been granted. */
- if (lock_get_wait(lock)
- && !lock_rec_has_to_wait_in_queue(lock)) {
+ for (lock = lock_rec_get_first_on_page_addr(lock_hash, space,
+ page_no);
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
- /* Grant the lock */
- ut_ad(lock->trx != in_lock->trx);
+ if (lock_get_wait(lock)
+ && !lock_rec_has_to_wait_in_queue(lock)) {
- bool exit_trx_mutex = false;
+ /* Grant the lock */
+ ut_ad(lock->trx != in_lock->trx);
- if (lock->trx->abort_type != TRX_SERVER_ABORT) {
- ut_ad(trx_mutex_own(lock->trx));
- trx_mutex_exit(lock->trx);
- exit_trx_mutex = true;
+ bool exit_trx_mutex = false;
+
+ if (lock->trx->abort_type != TRX_SERVER_ABORT) {
+ ut_ad(trx_mutex_own(lock->trx));
+ trx_mutex_exit(lock->trx);
+ exit_trx_mutex = true;
+ }
+
+ lock_grant(lock);
+
+ if (exit_trx_mutex) {
+ ut_ad(!trx_mutex_own(lock->trx));
+ trx_mutex_enter(lock->trx);
+ }
}
+ }
+ } else {
+ /* Grant locks if there are no conflicting locks ahead.
+ Move granted locks to the head of the list. */
+ for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, page_no);
+ lock != NULL;) {
+
+ /* If the lock is a wait lock on this page and it no longer needs to wait, grant it. */
+ if ((lock->un_member.rec_lock.space == space)
+ && (lock->un_member.rec_lock.page_no == page_no)
+ && lock_get_wait(lock)
+ && !lock_rec_has_to_wait_in_queue(lock)) {
+
+ /* Grant the lock */
+ ut_ad(lock->trx != in_lock->trx);
+
+ bool exit_trx_mutex = false;
+
+ if (lock->trx->abort_type != TRX_SERVER_ABORT) {
+ ut_ad(trx_mutex_own(lock->trx));
+ trx_mutex_exit(lock->trx);
+ exit_trx_mutex = true;
+ }
- lock_grant(lock);
+ lock_grant(lock);
- if (exit_trx_mutex) {
- ut_ad(!trx_mutex_own(lock->trx));
- trx_mutex_enter(lock->trx);
+ if (exit_trx_mutex) {
+ ut_ad(!trx_mutex_own(lock->trx));
+ trx_mutex_enter(lock->trx);
+ }
+
+ if (previous != NULL) {
+ /* Move the lock to the head of the list. */
+ HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
+ lock_rec_move_to_front(lock, rec_fold);
+ } else {
+ /* Already at the head of the list. */
+ previous = lock;
+ }
+ /* Move on to the next lock. */
+ lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
+ } else {
+ previous = lock;
+ lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
}
}
}
@@ -7775,7 +7972,9 @@ DeadlockChecker::get_first_lock(ulint* heap_no) const
/* Must find at least two locks, otherwise there cannot be a
waiting lock, secondly the first lock cannot be the wait_lock. */
ut_a(lock != NULL);
- ut_a(lock != m_wait_lock);
+ ut_a(lock != m_wait_lock ||
+ (innodb_lock_schedule_algorithm
+ == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS));
/* Check that the lock type doesn't change. */
ut_ad(lock_get_type_low(lock) == lock_get_type_low(m_wait_lock));
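
The largest change above is in lock_rec_dequeue_from_page(): under FCFS the old grant loop is kept, while under VATS every waiter on the page that no longer conflicts with the locks ahead of it is granted and then moved to the front of the hash list, so the queue keeps the shape "granted locks first, waiters behind them". Below is a simplified standalone model of that regrant-and-move-to-front pass; the QueuedLock type and the conflicts()/has_to_wait() rule are assumptions made for illustration, not InnoDB's actual structures or lock_rec_has_to_wait_in_queue().

#include <cstdio>
#include <list>

struct QueuedLock {
    int  trx_id;
    bool waiting;    // true = still a wait lock, false = granted
    bool exclusive;  // X lock vs. S lock
};

// Two locks conflict if at least one of them is exclusive.
static bool conflicts(const QueuedLock& a, const QueuedLock& b)
{
    return a.exclusive || b.exclusive;
}

// A waiter must keep waiting if some earlier entry (granted or waiting)
// conflicts with it -- a rough stand-in for lock_rec_has_to_wait_in_queue().
static bool has_to_wait(const std::list<QueuedLock>& q,
                        std::list<QueuedLock>::const_iterator me)
{
    for (auto it = q.begin(); it != me; ++it) {
        if (conflicts(*it, *me)) {
            return true;
        }
    }
    return false;
}

// After a lock is released: grant every waiter that can now proceed and
// move each newly granted lock to the front of the list.
static void grant_after_release(std::list<QueuedLock>& q)
{
    for (auto it = q.begin(); it != q.end();) {
        if (it->waiting && !has_to_wait(q, it)) {
            it->waiting = false;              // grant the lock
            auto granted = it++;              // advance before relinking
            q.splice(q.begin(), q, granted);  // move it to the front
        } else {
            ++it;
        }
    }
}

int main()
{
    // One granted S lock, then an S waiter and an X waiter behind it.
    std::list<QueuedLock> q = {{1, false, false},
                               {2, true,  false},
                               {3, true,  true}};
    grant_after_release(q);
    for (const QueuedLock& l : q) {
        std::printf("trx %d: %s\n", l.trx_id,
                    l.waiting ? "waiting" : "granted");
    }
    return 0;  // trx 2 is granted and moved ahead; trx 3 keeps waiting
}
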
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index 900d520d242..4bb29e02680 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -348,6 +348,22 @@ static TYPELIB innodb_empty_free_list_algorithm_typelib = {
NULL
};
+/** Possible values of the parameter innodb_lock_schedule_algorithm */
+static const char* innodb_lock_schedule_algorithm_names[] = {
+ "fcfs",
+ "vats",
+ NullS
+};
+
+/** Used to define the enumerated type of the system variable
+innodb_lock_schedule_algorithm. */
+static TYPELIB innodb_lock_schedule_algorithm_typelib = {
+ array_elements(innodb_lock_schedule_algorithm_names) - 1,
+ "innodb_lock_schedule_algorithm_typelib",
+ innodb_lock_schedule_algorithm_names,
+ NULL
+};
+
/* The following counter is used to convey information to InnoDB
about server activity: in case of normal DML ops it is not
sensible to call srv_active_wake_master_thread after each
@@ -20390,6 +20406,18 @@ static MYSQL_SYSVAR_ENUM(empty_free_list_algorithm,
innodb_srv_empty_free_list_algorithm_validate, NULL, SRV_EMPTY_FREE_LIST_BACKOFF,
&innodb_empty_free_list_algorithm_typelib);
+static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm,
+ PLUGIN_VAR_RQCMDARG,
+ "The algorithm Innodb uses for deciding which locks to grant next when"
+ " a lock is released. Possible values are"
+ " FCFS"
+ " grant the locks in First-Come-First-Served order;"
+ " VATS"
+ " use the Variance-Aware-Transaction-Scheduling algorithm, which"
+ " uses an Eldest-Transaction-First heuristic.",
+ NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
+ &innodb_lock_schedule_algorithm_typelib);
+
static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
"Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
@@ -21283,6 +21311,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(ft_sort_pll_degree),
MYSQL_SYSVAR(large_prefix),
MYSQL_SYSVAR(force_load_corrupted),
+ MYSQL_SYSVAR(lock_schedule_algorithm),
MYSQL_SYSVAR(locks_unsafe_for_binlog),
MYSQL_SYSVAR(lock_wait_timeout),
#ifdef UNIV_LOG_ARCHIVE
diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h
index b6100d470cc..25618c3ddca 100644
--- a/storage/xtradb/include/lock0lock.h
+++ b/storage/xtradb/include/lock0lock.h
@@ -47,6 +47,17 @@ extern ibool lock_print_waits;
extern ulint srv_n_lock_deadlock_count;
+/** Possible values of innodb_lock_schedule_algorithm, the algorithm InnoDB
+ uses to decide which waiting locks to grant when a lock is released. */
+enum innodb_lock_schedule_algorithm_t {
+ /*!< First Come First Served */
+ INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
+ /*!< Variance-Aware-Transaction-Scheduling */
+ INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+};
+
+extern ulong innodb_lock_schedule_algorithm;
+
/*********************************************************************//**
Gets the size of a lock struct.
@return size in bytes */
diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc
index 341c452cd15..119a09d0d76 100644
--- a/storage/xtradb/lock/lock0lock.cc
+++ b/storage/xtradb/lock/lock0lock.cc
@@ -385,6 +385,11 @@ extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
extern "C"
int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
+/** Lock scheduling algorithm */
+ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS;
+
+extern "C" int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
+
/** Stack to use during DFS search. Currently only a single stack is required
because there is no parallel deadlock check. This stack is protected by
the lock_sys_t::mutex. */
@@ -2199,6 +2204,85 @@ lock_rec_create(
}
/*********************************************************************//**
+Check if lock1 has higher priority than lock2.
+NULL has lowest priority.
+If the server layer prefers one of the transactions as a deadlock victim,
+the other lock has higher priority.
+If neither of them is a wait lock, the first one has higher priority.
+If only one of them is a wait lock, it has lower priority.
+Otherwise, the one with an older transaction has higher priority.
+@returns true if lock1 has higher priority, false otherwise. */
+bool
+has_higher_priority(
+ lock_t *lock1,
+ lock_t *lock2)
+{
+ if (lock1 == NULL) {
+ return false;
+ } else if (lock2 == NULL) {
+ return true;
+ }
+ // Ask the upper server layer if either of the two trx should be preferred.
+ int preference = thd_deadlock_victim_preference(lock1->trx->mysql_thd, lock2->trx->mysql_thd);
+ if (preference == -1) {
+ // lock1 is preferred as a victim, so lock2 has higher priority
+ return false;
+ } else if (preference == 1) {
+ // lock2 is preferred as a victim, so lock1 has higher priority
+ return true;
+ }
+ // No preference. Compare them by wait mode and trx age.
+ if (!lock_get_wait(lock1)) {
+ return true;
+ } else if (!lock_get_wait(lock2)) {
+ return false;
+ }
+ return lock1->trx->start_time < lock2->trx->start_time;
+}
+
+/*********************************************************************//**
+Insert a lock into the hash list according to its mode (whether it is a wait
+lock) and the age of the transaction it is associated with.
+If the lock is not a wait lock, insert it at the head of the hash list.
+Otherwise, insert it among the wait locks according to the age of the
+transaction. */
+static
+void
+lock_rec_insert_by_trx_age(
+ lock_t *in_lock, /*!< in: lock to be inserted */
+ bool wait) /*!< in: whether it's a wait lock */
+{
+ ulint space;
+ ulint page_no;
+ ulint rec_fold;
+ hash_table_t* hash;
+ hash_cell_t* cell;
+ lock_t* node;
+ lock_t* next;
+
+ space = in_lock->un_member.rec_lock.space;
+ page_no = in_lock->un_member.rec_lock.page_no;
+ rec_fold = lock_rec_fold(space, page_no);
+ hash = lock_hash_get(in_lock->type_mode);
+ cell = hash_get_nth_cell(hash,
+ hash_calc_hash(rec_fold, hash));
+
+ node = (lock_t *) cell->node;
+ // Insert at the head if the list is empty, in_lock is not a wait lock,
+ // or it has higher priority than the current head.
+ if (node == NULL || !wait || has_higher_priority(in_lock, node)) {
+ cell->node = in_lock;
+ in_lock->hash = node;
+ return;
+ }
+ while (node != NULL && has_higher_priority((lock_t *) node->hash,
+ in_lock)) {
+ node = (lock_t *) node->hash;
+ }
+ next = (lock_t *) node->hash;
+ node->hash = in_lock;
+ in_lock->hash = next;
+}
+
+/*********************************************************************//**
Enqueues a waiting request for a lock which cannot be granted immediately.
Checks for deadlocks.
@return DB_LOCK_WAIT, DB_DEADLOCK, or DB_QUE_THR_SUSPENDED, or
@@ -2300,7 +2384,23 @@ lock_rec_enqueue_waiting(
return(DB_DEADLOCK);
- } else if (trx->lock.wait_lock == NULL) {
+ }
+
+ // Move it only when it does not cause a deadlock.
+ if (innodb_lock_schedule_algorithm
+ == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+ && !trx_is_high_priority(lock->trx)) {
+
+ HASH_DELETE(lock_t, hash, lock_hash_get(lock->type_mode),
+ m_rec_id.fold(), lock);
+ lock_rec_insert_by_trx_age(lock, m_mode & LOCK_WAIT);
+ if (lock_get_wait(lock) && !lock_rec_has_to_wait_in_queue(lock)) {
+ lock_reset_lock_and_trx_wait(lock);
+ return DB_SUCCESS_LOCKED_REC;
+ }
+ }
+
+ if (trx->lock.wait_lock == NULL) {
/* If there was a deadlock but we chose another
transaction as a victim, it is possible that we
@@ -2859,6 +2959,32 @@ lock_rec_cancel(
}
/*************************************************************//**
+Move the lock to the head of the hash list. */
+static
+void
+lock_rec_move_to_front(
+ lock_t *lock_to_move, /*!< in: lock to be moved */
+ ulint rec_fold) /*!< in: rec fold of the lock */
+{
+ hash_table_t* lock_hash;
+ hash_cell_t* cell;
+ lock_t* next;
+
+ if (lock_to_move != NULL)
+ {
+ lock_hash = lock_hash_get(lock_to_move->type_mode);
+ // Move the target lock to the head of the list
+ cell = hash_get_nth_cell(lock_hash,
+ hash_calc_hash(rec_fold, lock_hash));
+ if (lock_to_move != cell->node) {
+ next = (lock_t *) cell->node;
+ cell->node = lock_to_move;
+ lock_to_move->hash = next;
+ }
+ }
+}
+
+/*************************************************************//**
Removes a record lock request, waiting or granted, from the queue and
grants locks to other transactions in the queue if they now are entitled
to a lock. NOTE: all record locks contained in in_lock are removed. */
@@ -2875,6 +3001,8 @@ lock_rec_dequeue_from_page(
{
ulint space;
ulint page_no;
+ ulint rec_fold;
+ lock_t* previous = NULL;
lock_t* lock;
trx_lock_t* trx_lock;
@@ -2886,6 +3014,7 @@ lock_rec_dequeue_from_page(
space = in_lock->un_member.rec_lock.space;
page_no = in_lock->un_member.rec_lock.page_no;
+ rec_fold = lock_rec_fold(space, page_no);
in_lock->index->table->n_rec_locks--;
@@ -2898,20 +3027,58 @@ lock_rec_dequeue_from_page(
MONITOR_INC(MONITOR_RECLOCK_REMOVED);
MONITOR_DEC(MONITOR_NUM_RECLOCK);
- /* Check if waiting locks in the queue can now be granted: grant
- locks if there are no conflicting locks ahead. Stop at the first
- X lock that is waiting or has been granted. */
+ if (innodb_lock_schedule_algorithm
+ == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) {
- for (lock = lock_rec_get_first_on_page_addr(space, page_no);
- lock != NULL;
- lock = lock_rec_get_next_on_page(lock)) {
+ /* Check if waiting locks in the queue can now be granted:
+ grant locks if there are no conflicting locks ahead. Stop at
+ the first X lock that is waiting or has been granted. */
- if (lock_get_wait(lock)
- && !lock_rec_has_to_wait_in_queue(lock)) {
+ for (lock = lock_rec_get_first_on_page_addr(lock_hash, space,
+ page_no);
+ lock != NULL;
+ lock = lock_rec_get_next_on_page(lock)) {
- /* Grant the lock */
- ut_ad(lock->trx != in_lock->trx);
- lock_grant(lock);
+ if (lock_get_wait(lock)
+ && !lock_rec_has_to_wait_in_queue(lock)) {
+
+ /* Grant the lock */
+ ut_ad(lock->trx != in_lock->trx);
+
+ lock_grant(lock);
+ }
+ }
+ } else {
+ /* Grant locks if there are no conflicting locks ahead.
+ Move granted locks to the head of the list. */
+ for (lock = lock_rec_get_first_on_page_addr(lock_hash, space, page_no);
+ lock != NULL;) {
+
+ /* If the lock is a wait lock on this page and it no longer needs to wait, grant it. */
+ if ((lock->un_member.rec_lock.space == space)
+ && (lock->un_member.rec_lock.page_no == page_no)
+ && lock_get_wait(lock)
+ && !lock_rec_has_to_wait_in_queue(lock)) {
+
+ /* Grant the lock */
+ ut_ad(lock->trx != in_lock->trx);
+
+ lock_grant(lock);
+
+ if (previous != NULL) {
+ /* Move the lock to the head of the list. */
+ HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
+ lock_rec_move_to_front(lock, rec_fold);
+ } else {
+ /* Already at the head of the list. */
+ previous = lock;
+ }
+ /* Move on to the next lock. */
+ lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
+ } else {
+ previous = lock;
+ lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
+ }
}
}
}
@@ -4118,7 +4285,8 @@ lock_get_first_lock(
}
ut_a(lock != NULL);
- ut_a(lock != ctx->wait_lock);
+ ut_a(lock != ctx->wait_lock ||
+ innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS);
ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
return(lock);