summaryrefslogtreecommitdiff
path: root/storage/innobase/sync/sync0arr.cc
diff options
context:
space:
mode:
authorSergei Golubchik <vuvova@gmail.com>2015-05-04 19:17:21 +0200
committerSergei Golubchik <vuvova@gmail.com>2015-05-04 19:17:21 +0200
commit6d06fbbd1dc25b3c12568f9038060dfdb69f9683 (patch)
tree21e27f3fddc89f9dda6b337091464ba10c490123 /storage/innobase/sync/sync0arr.cc
parent1645930d0bd02f79df3ebff412b90acdc15bd9a0 (diff)
downloadmariadb-git-6d06fbbd1dc25b3c12568f9038060dfdb69f9683.tar.gz
move to storage/innobase
Diffstat (limited to 'storage/innobase/sync/sync0arr.cc')
-rw-r--r--storage/innobase/sync/sync0arr.cc1156
1 files changed, 1156 insertions, 0 deletions
diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc
new file mode 100644
index 00000000000..d56d328d8c3
--- /dev/null
+++ b/storage/innobase/sync/sync0arr.cc
@@ -0,0 +1,1156 @@
+/*****************************************************************************
+
+Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2008, Google Inc.
+
+Portions of this file contain modifications contributed and copyrighted by
+Google, Inc. Those modifications are gratefully acknowledged and are described
+briefly in the InnoDB documentation. The contributions by Google are
+incorporated with their permission, and subject to the conditions contained in
+the file COPYING.Google.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
+
+*****************************************************************************/
+
+/**************************************************//**
+@file sync/sync0arr.cc
+The wait array used in synchronization primitives
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0arr.h"
+#ifdef UNIV_NONINL
+#include "sync0arr.ic"
+#endif
+
+#include "sync0sync.h"
+#include "sync0rw.h"
+#include "os0sync.h"
+#include "os0file.h"
+#include "lock0lock.h"
+#include "srv0srv.h"
+#include "ha_prototypes.h"
+
+/*
+ WAIT ARRAY
+ ==========
+
+The wait array consists of cells each of which has an
+an operating system event object created for it. The threads
+waiting for a mutex, for example, can reserve a cell
+in the array and suspend themselves to wait for the event
+to become signaled. When using the wait array, remember to make
+sure that some thread holding the synchronization object
+will eventually know that there is a waiter in the array and
+signal the object, to prevent infinite wait.
+Why we chose to implement a wait array? First, to make
+mutexes fast, we had to code our own implementation of them,
+which only in usually uncommon cases resorts to using
+slow operating system primitives. Then we had the choice of
+assigning a unique OS event for each mutex, which would
+be simpler, or using a global wait array. In some operating systems,
+the global wait array solution is more efficient and flexible,
+because we can do with a very small number of OS events,
+say 200. In NT 3.51, allocating events seems to be a quadratic
+algorithm, because 10 000 events are created fast, but
+100 000 events takes a couple of minutes to create.
+
+As of 5.0.30 the above mentioned design is changed. Since now
+OS can handle millions of wait events efficiently, we no longer
+have this concept of each cell of wait array having one event.
+Instead, now the event that a thread wants to wait on is embedded
+in the wait object (mutex or rw_lock). We still keep the global
+wait array for the sake of diagnostics and also to avoid infinite
+wait The error_monitor thread scans the global wait array to signal
+any waiting threads who have missed the signal. */
+
+/** A cell where an individual thread may wait suspended
+until a resource is released. The suspending is implemented
+using an operating system event semaphore. */
+struct sync_cell_t {
+ void* wait_object; /*!< pointer to the object the
+ thread is waiting for; if NULL
+ the cell is free for use */
+ ib_mutex_t* old_wait_mutex; /*!< the latest wait mutex in cell */
+ rw_lock_t* old_wait_rw_lock;
+ /*!< the latest wait rw-lock
+ in cell */
+ ulint request_type; /*!< lock type requested on the
+ object */
+ const char* file; /*!< in debug version file where
+ requested */
+ ulint line; /*!< in debug version line where
+ requested */
+ os_thread_id_t thread; /*!< thread id of this waiting
+ thread */
+ ibool waiting; /*!< TRUE if the thread has already
+ called sync_array_event_wait
+ on this cell */
+ ib_int64_t signal_count; /*!< We capture the signal_count
+ of the wait_object when we
+ reset the event. This value is
+ then passed on to os_event_wait
+ and we wait only if the event
+ has not been signalled in the
+ period between the reset and
+ wait call. */
+ time_t reservation_time;/*!< time when the thread reserved
+ the wait cell */
+};
+
+/* NOTE: It is allowed for a thread to wait
+for an event allocated for the array without owning the
+protecting mutex (depending on the case: OS or database mutex), but
+all changes (set or reset) to the state of the event must be made
+while owning the mutex. */
+
+/** Synchronization array */
+struct sync_array_t {
+ ulint n_reserved; /*!< number of currently reserved
+ cells in the wait array */
+ ulint n_cells; /*!< number of cells in the
+ wait array */
+ sync_cell_t* array; /*!< pointer to wait array */
+ ib_mutex_t mutex; /*!< possible database mutex
+ protecting this data structure */
+ os_ib_mutex_t os_mutex; /*!< Possible operating system mutex
+ protecting the data structure.
+ As this data structure is used in
+ constructing the database mutex,
+ to prevent infinite recursion
+ in implementation, we fall back to
+ an OS mutex. */
+ ulint res_count; /*!< count of cell reservations
+ since creation of the array */
+};
+
+/** User configured sync array size */
+UNIV_INTERN ulong srv_sync_array_size = 32;
+
+/** Locally stored copy of srv_sync_array_size */
+static ulint sync_array_size;
+
+/** The global array of wait cells for implementation of the database's own
+mutexes and read-write locks */
+static sync_array_t** sync_wait_array;
+
+/** count of how many times an object has been signalled */
+static ulint sg_count;
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+This function is called only in the debug version. Detects a deadlock
+of one or more threads because of waits of semaphores.
+@return TRUE if deadlock detected */
+static
+ibool
+sync_array_detect_deadlock(
+/*=======================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search started */
+ sync_cell_t* cell, /*!< in: cell to search */
+ ulint depth); /*!< in: recursion depth */
+#endif /* UNIV_SYNC_DEBUG */
+
+/*****************************************************************//**
+Gets the nth cell in array.
+@return cell */
+static
+sync_cell_t*
+sync_array_get_nth_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: sync array */
+ ulint n) /*!< in: index */
+{
+ ut_a(arr);
+ ut_a(n < arr->n_cells);
+
+ return(arr->array + n);
+}
+
+/******************************************************************//**
+Reserves the mutex semaphore protecting a sync array. */
+static
+void
+sync_array_enter(
+/*=============*/
+ sync_array_t* arr) /*!< in: sync wait array */
+{
+ os_mutex_enter(arr->os_mutex);
+}
+
+/******************************************************************//**
+Releases the mutex semaphore protecting a sync array. */
+static
+void
+sync_array_exit(
+/*============*/
+ sync_array_t* arr) /*!< in: sync wait array */
+{
+ os_mutex_exit(arr->os_mutex);
+}
+
+/*******************************************************************//**
+Creates a synchronization wait array. It is protected by a mutex
+which is automatically reserved when the functions operating on it
+are called.
+@return own: created wait array */
+static
+sync_array_t*
+sync_array_create(
+/*==============*/
+ ulint n_cells) /*!< in: number of cells in the array
+ to create */
+{
+ ulint sz;
+ sync_array_t* arr;
+
+ ut_a(n_cells > 0);
+
+ /* Allocate memory for the data structures */
+ arr = static_cast<sync_array_t*>(ut_malloc(sizeof(*arr)));
+ memset(arr, 0x0, sizeof(*arr));
+
+ sz = sizeof(sync_cell_t) * n_cells;
+ arr->array = static_cast<sync_cell_t*>(ut_malloc(sz));
+ memset(arr->array, 0x0, sz);
+
+ arr->n_cells = n_cells;
+
+ /* Then create the mutex to protect the wait array complex */
+ arr->os_mutex = os_mutex_create();
+
+ return(arr);
+}
+
+/******************************************************************//**
+Frees the resources in a wait array. */
+static
+void
+sync_array_free(
+/*============*/
+ sync_array_t* arr) /*!< in, own: sync wait array */
+{
+ ut_a(arr->n_reserved == 0);
+
+ sync_array_validate(arr);
+
+ /* Release the mutex protecting the wait array complex */
+
+ os_mutex_free(arr->os_mutex);
+
+ ut_free(arr->array);
+ ut_free(arr);
+}
+
+/********************************************************************//**
+Validates the integrity of the wait array. Checks
+that the number of reserved cells equals the count variable. */
+UNIV_INTERN
+void
+sync_array_validate(
+/*================*/
+ sync_array_t* arr) /*!< in: sync wait array */
+{
+ ulint i;
+ sync_cell_t* cell;
+ ulint count = 0;
+
+ sync_array_enter(arr);
+
+ for (i = 0; i < arr->n_cells; i++) {
+ cell = sync_array_get_nth_cell(arr, i);
+ if (cell->wait_object != NULL) {
+ count++;
+ }
+ }
+
+ ut_a(count == arr->n_reserved);
+
+ sync_array_exit(arr);
+}
+
+/*******************************************************************//**
+Returns the event that the thread owning the cell waits for. */
+static
+os_event_t
+sync_cell_get_event(
+/*================*/
+ sync_cell_t* cell) /*!< in: non-empty sync array cell */
+{
+ ulint type = cell->request_type;
+
+ if (type == SYNC_MUTEX) {
+ return(((ib_mutex_t*) cell->wait_object)->event);
+ } else if (type == RW_LOCK_WAIT_EX) {
+ return(((rw_lock_t*) cell->wait_object)->wait_ex_event);
+ } else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */
+ return(((rw_lock_t*) cell->wait_object)->event);
+ }
+}
+
+/******************************************************************//**
+Reserves a wait array cell for waiting for an object.
+The event of the cell is reset to nonsignalled state.
+@return true if free cell is found, otherwise false */
+UNIV_INTERN
+bool
+sync_array_reserve_cell(
+/*====================*/
+ sync_array_t* arr, /*!< in: wait array */
+ void* object, /*!< in: pointer to the object to wait for */
+ ulint type, /*!< in: lock request type */
+ const char* file, /*!< in: file where requested */
+ ulint line, /*!< in: line where requested */
+ ulint* index) /*!< out: index of the reserved cell */
+{
+ sync_cell_t* cell;
+ os_event_t event;
+ ulint i;
+
+ ut_a(object);
+ ut_a(index);
+
+ sync_array_enter(arr);
+
+ arr->res_count++;
+
+ /* Reserve a new cell. */
+ for (i = 0; i < arr->n_cells; i++) {
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object == NULL) {
+
+ cell->waiting = FALSE;
+ cell->wait_object = object;
+
+ if (type == SYNC_MUTEX) {
+ cell->old_wait_mutex =
+ static_cast<ib_mutex_t*>(object);
+ } else {
+ cell->old_wait_rw_lock =
+ static_cast<rw_lock_t*>(object);
+ }
+
+ cell->request_type = type;
+
+ cell->file = file;
+ cell->line = line;
+
+ arr->n_reserved++;
+
+ *index = i;
+
+ sync_array_exit(arr);
+
+ /* Make sure the event is reset and also store
+ the value of signal_count at which the event
+ was reset. */
+ event = sync_cell_get_event(cell);
+ cell->signal_count = os_event_reset(event);
+
+ cell->reservation_time = ut_time();
+
+ cell->thread = os_thread_get_curr_id();
+
+ return(true);
+ }
+ }
+
+ /* No free cell found */
+ return false;
+}
+
+/******************************************************************//**
+This function should be called when a thread starts to wait on
+a wait array cell. In the debug version this function checks
+if the wait for a semaphore will result in a deadlock, in which
+case prints info and asserts. */
+UNIV_INTERN
+void
+sync_array_wait_event(
+/*==================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index) /*!< in: index of the reserved cell */
+{
+ sync_cell_t* cell;
+ os_event_t event;
+
+ ut_a(arr);
+
+ sync_array_enter(arr);
+
+ cell = sync_array_get_nth_cell(arr, index);
+
+ ut_a(cell->wait_object);
+ ut_a(!cell->waiting);
+ ut_ad(os_thread_get_curr_id() == cell->thread);
+
+ event = sync_cell_get_event(cell);
+ cell->waiting = TRUE;
+
+#ifdef UNIV_SYNC_DEBUG
+
+ /* We use simple enter to the mutex below, because if
+ we cannot acquire it at once, mutex_enter would call
+ recursively sync_array routines, leading to trouble.
+ rw_lock_debug_mutex freezes the debug lists. */
+
+ rw_lock_debug_mutex_enter();
+
+ if (TRUE == sync_array_detect_deadlock(arr, cell, cell, 0)) {
+
+ fputs("########################################\n", stderr);
+ ut_error;
+ }
+
+ rw_lock_debug_mutex_exit();
+#endif
+ sync_array_exit(arr);
+
+ os_event_wait_low(event, cell->signal_count);
+
+ sync_array_free_cell(arr, index);
+}
+
+/******************************************************************//**
+Reports info of a wait array cell. */
+static
+void
+sync_array_cell_print(
+/*==================*/
+ FILE* file, /*!< in: file where to print */
+ sync_cell_t* cell) /*!< in: sync cell */
+{
+ ib_mutex_t* mutex;
+ rw_lock_t* rwlock;
+ ulint type;
+ ulint writer;
+
+ type = cell->request_type;
+
+ fprintf(file,
+ "--Thread %lu has waited at %s line %lu"
+ " for %.2f seconds the semaphore:\n",
+ (ulong) os_thread_pf(cell->thread),
+ innobase_basename(cell->file), (ulong) cell->line,
+ difftime(time(NULL), cell->reservation_time));
+
+ if (type == SYNC_MUTEX) {
+ /* We use old_wait_mutex in case the cell has already
+ been freed meanwhile */
+ mutex = cell->old_wait_mutex;
+
+ fprintf(file,
+ "Mutex at %p created file %s line %lu, lock var %lu\n"
+#ifdef UNIV_SYNC_DEBUG
+ "Last time reserved in file %s line %lu, "
+#endif /* UNIV_SYNC_DEBUG */
+ "waiters flag %lu\n",
+ (void*) mutex, innobase_basename(mutex->cfile_name),
+ (ulong) mutex->cline,
+ (ulong) mutex->lock_word,
+#ifdef UNIV_SYNC_DEBUG
+ mutex->file_name, (ulong) mutex->line,
+#endif /* UNIV_SYNC_DEBUG */
+ (ulong) mutex->waiters);
+
+ } else if (type == RW_LOCK_EX
+ || type == RW_LOCK_WAIT_EX
+ || type == RW_LOCK_SHARED) {
+
+ fputs(type == RW_LOCK_EX ? "X-lock on"
+ : type == RW_LOCK_WAIT_EX ? "X-lock (wait_ex) on"
+ : "S-lock on", file);
+
+ rwlock = cell->old_wait_rw_lock;
+
+ fprintf(file,
+ " RW-latch at %p created in file %s line %lu\n",
+ (void*) rwlock, innobase_basename(rwlock->cfile_name),
+ (ulong) rwlock->cline);
+ writer = rw_lock_get_writer(rwlock);
+ if (writer != RW_LOCK_NOT_LOCKED) {
+ fprintf(file,
+ "a writer (thread id %lu) has"
+ " reserved it in mode %s",
+ (ulong) os_thread_pf(rwlock->writer_thread),
+ writer == RW_LOCK_EX
+ ? " exclusive\n"
+ : " wait exclusive\n");
+ }
+
+ fprintf(file,
+ "number of readers %lu, waiters flag %lu, "
+ "lock_word: %lx\n"
+ "Last time read locked in file %s line %lu\n"
+ "Last time write locked in file %s line %lu\n",
+ (ulong) rw_lock_get_reader_count(rwlock),
+ (ulong) rwlock->waiters,
+ rwlock->lock_word,
+ innobase_basename(rwlock->last_s_file_name),
+ (ulong) rwlock->last_s_line,
+ rwlock->last_x_file_name,
+ (ulong) rwlock->last_x_line);
+ } else {
+ ut_error;
+ }
+
+ if (!cell->waiting) {
+ fputs("wait has ended\n", file);
+ }
+}
+
+#ifdef UNIV_SYNC_DEBUG
+/******************************************************************//**
+Looks for a cell with the given thread id.
+@return pointer to cell or NULL if not found */
+static
+sync_cell_t*
+sync_array_find_thread(
+/*===================*/
+ sync_array_t* arr, /*!< in: wait array */
+ os_thread_id_t thread) /*!< in: thread id */
+{
+ ulint i;
+ sync_cell_t* cell;
+
+ for (i = 0; i < arr->n_cells; i++) {
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object != NULL
+ && os_thread_eq(cell->thread, thread)) {
+
+ return(cell); /* Found */
+ }
+ }
+
+ return(NULL); /* Not found */
+}
+
+/******************************************************************//**
+Recursion step for deadlock detection.
+@return TRUE if deadlock detected */
+static
+ibool
+sync_array_deadlock_step(
+/*=====================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search
+ started */
+ os_thread_id_t thread, /*!< in: thread to look at */
+ ulint pass, /*!< in: pass value */
+ ulint depth) /*!< in: recursion depth */
+{
+ sync_cell_t* new_cell;
+
+ if (pass != 0) {
+ /* If pass != 0, then we do not know which threads are
+ responsible of releasing the lock, and no deadlock can
+ be detected. */
+
+ return(FALSE);
+ }
+
+ new_cell = sync_array_find_thread(arr, thread);
+
+ if (new_cell == start) {
+ /* Deadlock */
+ fputs("########################################\n"
+ "DEADLOCK of threads detected!\n", stderr);
+
+ return(TRUE);
+
+ } else if (new_cell) {
+ return(sync_array_detect_deadlock(
+ arr, start, new_cell, depth + 1));
+ }
+ return(FALSE);
+}
+
+/******************************************************************//**
+This function is called only in the debug version. Detects a deadlock
+of one or more threads because of waits of semaphores.
+@return TRUE if deadlock detected */
+static
+ibool
+sync_array_detect_deadlock(
+/*=======================*/
+ sync_array_t* arr, /*!< in: wait array; NOTE! the caller must
+ own the mutex to array */
+ sync_cell_t* start, /*!< in: cell where recursive search started */
+ sync_cell_t* cell, /*!< in: cell to search */
+ ulint depth) /*!< in: recursion depth */
+{
+ ib_mutex_t* mutex;
+ rw_lock_t* lock;
+ os_thread_id_t thread;
+ ibool ret;
+ rw_lock_debug_t*debug;
+
+ ut_a(arr);
+ ut_a(start);
+ ut_a(cell);
+ ut_ad(cell->wait_object);
+ ut_ad(os_thread_get_curr_id() == start->thread);
+ ut_ad(depth < 100);
+
+ depth++;
+
+ if (!cell->waiting) {
+
+ return(FALSE); /* No deadlock here */
+ }
+
+ if (cell->request_type == SYNC_MUTEX) {
+
+ mutex = static_cast<ib_mutex_t*>(cell->wait_object);
+
+ if (mutex_get_lock_word(mutex) != 0) {
+
+ thread = mutex->thread_id;
+
+ /* Note that mutex->thread_id above may be
+ also OS_THREAD_ID_UNDEFINED, because the
+ thread which held the mutex maybe has not
+ yet updated the value, or it has already
+ released the mutex: in this case no deadlock
+ can occur, as the wait array cannot contain
+ a thread with ID_UNDEFINED value. */
+
+ ret = sync_array_deadlock_step(arr, start, thread, 0,
+ depth);
+ if (ret) {
+ fprintf(stderr,
+ "Mutex %p owned by thread %lu file %s line %lu\n",
+ mutex, (ulong) os_thread_pf(mutex->thread_id),
+ mutex->file_name, (ulong) mutex->line);
+ sync_array_cell_print(stderr, cell);
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE); /* No deadlock */
+
+ } else if (cell->request_type == RW_LOCK_EX
+ || cell->request_type == RW_LOCK_WAIT_EX) {
+
+ lock = static_cast<rw_lock_t*>(cell->wait_object);
+
+ for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+ debug != 0;
+ debug = UT_LIST_GET_NEXT(list, debug)) {
+
+ thread = debug->thread_id;
+
+ if (((debug->lock_type == RW_LOCK_EX)
+ && !os_thread_eq(thread, cell->thread))
+ || ((debug->lock_type == RW_LOCK_WAIT_EX)
+ && !os_thread_eq(thread, cell->thread))
+ || (debug->lock_type == RW_LOCK_SHARED)) {
+
+ /* The (wait) x-lock request can block
+ infinitely only if someone (can be also cell
+ thread) is holding s-lock, or someone
+ (cannot be cell thread) (wait) x-lock, and
+ he is blocked by start thread */
+
+ ret = sync_array_deadlock_step(
+ arr, start, thread, debug->pass,
+ depth);
+ if (ret) {
+print:
+ fprintf(stderr, "rw-lock %p ",
+ (void*) lock);
+ sync_array_cell_print(stderr, cell);
+ rw_lock_debug_print(stderr, debug);
+ return(TRUE);
+ }
+ }
+ }
+
+ return(FALSE);
+
+ } else if (cell->request_type == RW_LOCK_SHARED) {
+
+ lock = static_cast<rw_lock_t*>(cell->wait_object);
+
+ for (debug = UT_LIST_GET_FIRST(lock->debug_list);
+ debug != 0;
+ debug = UT_LIST_GET_NEXT(list, debug)) {
+
+ thread = debug->thread_id;
+
+ if ((debug->lock_type == RW_LOCK_EX)
+ || (debug->lock_type == RW_LOCK_WAIT_EX)) {
+
+ /* The s-lock request can block infinitely
+ only if someone (can also be cell thread) is
+ holding (wait) x-lock, and he is blocked by
+ start thread */
+
+ ret = sync_array_deadlock_step(
+ arr, start, thread, debug->pass,
+ depth);
+ if (ret) {
+ goto print;
+ }
+ }
+ }
+
+ return(FALSE);
+
+ } else {
+ ut_error;
+ }
+
+ return(TRUE); /* Execution never reaches this line: for compiler
+ fooling only */
+}
+#endif /* UNIV_SYNC_DEBUG */
+
+/******************************************************************//**
+Determines if we can wake up the thread waiting for a sempahore. */
+static
+ibool
+sync_arr_cell_can_wake_up(
+/*======================*/
+ sync_cell_t* cell) /*!< in: cell to search */
+{
+ ib_mutex_t* mutex;
+ rw_lock_t* lock;
+
+ if (cell->request_type == SYNC_MUTEX) {
+
+ mutex = static_cast<ib_mutex_t*>(cell->wait_object);
+
+ os_rmb;
+ if (mutex_get_lock_word(mutex) == 0) {
+
+ return(TRUE);
+ }
+
+ } else if (cell->request_type == RW_LOCK_EX) {
+
+ lock = static_cast<rw_lock_t*>(cell->wait_object);
+
+ os_rmb;
+ if (lock->lock_word > 0) {
+ /* Either unlocked or only read locked. */
+
+ return(TRUE);
+ }
+
+ } else if (cell->request_type == RW_LOCK_WAIT_EX) {
+
+ lock = static_cast<rw_lock_t*>(cell->wait_object);
+
+ /* lock_word == 0 means all readers have left */
+ os_rmb;
+ if (lock->lock_word == 0) {
+
+ return(TRUE);
+ }
+ } else if (cell->request_type == RW_LOCK_SHARED) {
+ lock = static_cast<rw_lock_t*>(cell->wait_object);
+
+ /* lock_word > 0 means no writer or reserved writer */
+ os_rmb;
+ if (lock->lock_word > 0) {
+
+ return(TRUE);
+ }
+ }
+
+ return(FALSE);
+}
+
+/******************************************************************//**
+Frees the cell. NOTE! sync_array_wait_event frees the cell
+automatically! */
+UNIV_INTERN
+void
+sync_array_free_cell(
+/*=================*/
+ sync_array_t* arr, /*!< in: wait array */
+ ulint index) /*!< in: index of the cell in array */
+{
+ sync_cell_t* cell;
+
+ sync_array_enter(arr);
+
+ cell = sync_array_get_nth_cell(arr, index);
+
+ ut_a(cell->wait_object != NULL);
+
+ cell->waiting = FALSE;
+ cell->wait_object = NULL;
+ cell->signal_count = 0;
+
+ ut_a(arr->n_reserved > 0);
+ arr->n_reserved--;
+
+ sync_array_exit(arr);
+}
+
+/**********************************************************************//**
+Increments the signalled count. */
+UNIV_INTERN
+void
+sync_array_object_signalled(void)
+/*=============================*/
+{
+#ifdef HAVE_ATOMIC_BUILTINS
+ (void) os_atomic_increment_ulint(&sg_count, 1);
+#else
+ ++sg_count;
+#endif /* HAVE_ATOMIC_BUILTINS */
+}
+
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore relases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about every 1 second in the server.
+
+Note that there's a race condition between this thread and mutex_exit
+changing the lock_word and calling signal_object, so sometimes this finds
+threads to wake up even when nothing has gone wrong. */
+static
+void
+sync_array_wake_threads_if_sema_free_low(
+/*=====================================*/
+ sync_array_t* arr) /* in/out: wait array */
+{
+ ulint i = 0;
+ ulint count;
+
+ sync_array_enter(arr);
+
+ for (count = 0; count < arr->n_reserved; ++i) {
+ sync_cell_t* cell;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object != NULL) {
+
+ count++;
+
+ if (sync_arr_cell_can_wake_up(cell)) {
+ os_event_t event;
+
+ event = sync_cell_get_event(cell);
+
+ os_event_set(event);
+ }
+ }
+ }
+
+ sync_array_exit(arr);
+}
+
+/**********************************************************************//**
+If the wakeup algorithm does not work perfectly at semaphore relases,
+this function will do the waking (see the comment in mutex_exit). This
+function should be called about every 1 second in the server.
+
+Note that there's a race condition between this thread and mutex_exit
+changing the lock_word and calling signal_object, so sometimes this finds
+threads to wake up even when nothing has gone wrong. */
+UNIV_INTERN
+void
+sync_arr_wake_threads_if_sema_free(void)
+/*====================================*/
+{
+ ulint i;
+
+ for (i = 0; i < sync_array_size; ++i) {
+
+ sync_array_wake_threads_if_sema_free_low(
+ sync_wait_array[i]);
+ }
+}
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+static
+ibool
+sync_array_print_long_waits_low(
+/*============================*/
+ sync_array_t* arr, /*!< in: sync array instance */
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema, /*!< out: longest-waited-for semaphore */
+ ibool* noticed)/*!< out: TRUE if long wait noticed */
+{
+ ulint i;
+ ulint fatal_timeout = srv_fatal_semaphore_wait_threshold;
+ ibool fatal = FALSE;
+ double longest_diff = 0;
+
+ /* For huge tables, skip the check during CHECK TABLE etc... */
+ if (fatal_timeout > SRV_SEMAPHORE_WAIT_EXTENSION) {
+ return(FALSE);
+ }
+
+#ifdef UNIV_DEBUG_VALGRIND
+ /* Increase the timeouts if running under valgrind because it executes
+ extremely slowly. UNIV_DEBUG_VALGRIND does not necessary mean that
+ we are running under valgrind but we have no better way to tell.
+ See Bug#58432 innodb.innodb_bug56143 fails under valgrind
+ for an example */
+# define SYNC_ARRAY_TIMEOUT 2400
+ fatal_timeout *= 10;
+#else
+# define SYNC_ARRAY_TIMEOUT 240
+#endif
+
+ for (i = 0; i < arr->n_cells; i++) {
+
+ double diff;
+ sync_cell_t* cell;
+ void* wait_object;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ wait_object = cell->wait_object;
+
+ if (wait_object == NULL || !cell->waiting) {
+
+ continue;
+ }
+
+ diff = difftime(time(NULL), cell->reservation_time);
+
+ if (diff > SYNC_ARRAY_TIMEOUT) {
+ fputs("InnoDB: Warning: a long semaphore wait:\n",
+ stderr);
+ sync_array_cell_print(stderr, cell);
+ *noticed = TRUE;
+ }
+
+ if (diff > fatal_timeout) {
+ fatal = TRUE;
+ }
+
+ if (diff > longest_diff) {
+ longest_diff = diff;
+ *sema = wait_object;
+ *waiter = cell->thread;
+ }
+ }
+
+#undef SYNC_ARRAY_TIMEOUT
+
+ return(fatal);
+}
+
+/**********************************************************************//**
+Prints warnings of long semaphore waits to stderr.
+@return TRUE if fatal semaphore wait threshold was exceeded */
+UNIV_INTERN
+ibool
+sync_array_print_long_waits(
+/*========================*/
+ os_thread_id_t* waiter, /*!< out: longest waiting thread */
+ const void** sema) /*!< out: longest-waited-for semaphore */
+{
+ ulint i;
+ ibool fatal = FALSE;
+ ibool noticed = FALSE;
+
+ for (i = 0; i < sync_array_size; ++i) {
+
+ sync_array_t* arr = sync_wait_array[i];
+
+ sync_array_enter(arr);
+
+ if (sync_array_print_long_waits_low(
+ arr, waiter, sema, &noticed)) {
+
+ fatal = TRUE;
+ }
+
+ sync_array_exit(arr);
+ }
+
+ if (noticed) {
+ ibool old_val;
+
+ fprintf(stderr,
+ "InnoDB: ###### Starts InnoDB Monitor"
+ " for 30 secs to print diagnostic info:\n");
+
+ old_val = srv_print_innodb_monitor;
+
+ /* If some crucial semaphore is reserved, then also the InnoDB
+ Monitor can hang, and we do not get diagnostics. Since in
+ many cases an InnoDB hang is caused by a pwrite() or a pread()
+ call hanging inside the operating system, let us print right
+ now the values of pending calls of these. */
+
+ fprintf(stderr,
+ "InnoDB: Pending preads %lu, pwrites %lu\n",
+ (ulong) os_file_n_pending_preads,
+ (ulong) os_file_n_pending_pwrites);
+
+ srv_print_innodb_monitor = TRUE;
+ os_event_set(lock_sys->timeout_event);
+
+ os_thread_sleep(30000000);
+
+ srv_print_innodb_monitor = static_cast<my_bool>(old_val);
+ fprintf(stderr,
+ "InnoDB: ###### Diagnostic info printed"
+ " to the standard error stream\n");
+ }
+
+ return(fatal);
+}
+
+/**********************************************************************//**
+Prints info of the wait array. */
+static
+void
+sync_array_print_info_low(
+/*======================*/
+ FILE* file, /*!< in: file where to print */
+ sync_array_t* arr) /*!< in: wait array */
+{
+ ulint i;
+ ulint count = 0;
+
+ fprintf(file,
+ "OS WAIT ARRAY INFO: reservation count %ld\n",
+ (long) arr->res_count);
+
+ for (i = 0; count < arr->n_reserved; ++i) {
+ sync_cell_t* cell;
+
+ cell = sync_array_get_nth_cell(arr, i);
+
+ if (cell->wait_object != NULL) {
+ count++;
+ sync_array_cell_print(file, cell);
+ }
+ }
+}
+
+/**********************************************************************//**
+Prints info of the wait array. */
+static
+void
+sync_array_print_info(
+/*==================*/
+ FILE* file, /*!< in: file where to print */
+ sync_array_t* arr) /*!< in: wait array */
+{
+ sync_array_enter(arr);
+
+ sync_array_print_info_low(file, arr);
+
+ sync_array_exit(arr);
+}
+
+/**********************************************************************//**
+Create the primary system wait array(s), they are protected by an OS mutex */
+UNIV_INTERN
+void
+sync_array_init(
+/*============*/
+ ulint n_threads) /*!< in: Number of slots to
+ create in all arrays */
+{
+ ulint i;
+ ulint n_slots;
+
+ ut_a(sync_wait_array == NULL);
+ ut_a(srv_sync_array_size > 0);
+ ut_a(n_threads > 0);
+
+ sync_array_size = srv_sync_array_size;
+
+ /* We have to use ut_malloc() because the mutex infrastructure
+ hasn't been initialised yet. It is required by mem_alloc() and
+ the heap functions. */
+
+ sync_wait_array = static_cast<sync_array_t**>(
+ ut_malloc(sizeof(*sync_wait_array) * sync_array_size));
+
+ n_slots = 1 + (n_threads - 1) / sync_array_size;
+
+ for (i = 0; i < sync_array_size; ++i) {
+
+ sync_wait_array[i] = sync_array_create(n_slots);
+ }
+}
+
+/**********************************************************************//**
+Close sync array wait sub-system. */
+UNIV_INTERN
+void
+sync_array_close(void)
+/*==================*/
+{
+ ulint i;
+
+ for (i = 0; i < sync_array_size; ++i) {
+ sync_array_free(sync_wait_array[i]);
+ }
+
+ ut_free(sync_wait_array);
+ sync_wait_array = NULL;
+}
+
+/**********************************************************************//**
+Print info about the sync array(s). */
+UNIV_INTERN
+void
+sync_array_print(
+/*=============*/
+ FILE* file) /*!< in/out: Print to this stream */
+{
+ ulint i;
+
+ for (i = 0; i < sync_array_size; ++i) {
+ sync_array_print_info(file, sync_wait_array[i]);
+ }
+
+ fprintf(file,
+ "OS WAIT ARRAY INFO: signal count %ld\n", (long) sg_count);
+
+}
+
+/**********************************************************************//**
+Get an instance of the sync wait array. */
+UNIV_INTERN
+sync_array_t*
+sync_array_get(void)
+/*================*/
+{
+ ulint i;
+ static ulint count;
+
+#ifdef HAVE_ATOMIC_BUILTINS
+ i = os_atomic_increment_ulint(&count, 1);
+#else
+ i = count++;
+#endif /* HAVE_ATOMIC_BUILTINS */
+
+ return(sync_wait_array[i % sync_array_size]);
+}