author | unknown <knielsen@knielsen-hq.org> | 2009-06-09 13:16:11 +0200
committer | unknown <knielsen@knielsen-hq.org> | 2009-06-09 13:16:11 +0200
commit | a6b7f71329ceb7d0188572f494b5d1a1f0461fc5 (patch)
tree | d7e62c1af5118cd3ec9346de436569e907fcc51d /storage/xtradb/sync
parent | b125770aaadd09e839ad9211047e88095984308b (diff)
parent | 107072563d771422c9bbb9aeeedce8ae19c5b838 (diff)
download | mariadb-git-a6b7f71329ceb7d0188572f494b5d1a1f0461fc5.tar.gz
Import Percona XtraDB into the MariaDB source tree.
Diffstat (limited to 'storage/xtradb/sync')
-rw-r--r-- | storage/xtradb/sync/sync0arr.c | 1045
-rw-r--r-- | storage/xtradb/sync/sync0rw.c | 1285
-rw-r--r-- | storage/xtradb/sync/sync0sync.c | 1411 |
3 files changed, 3741 insertions, 0 deletions
diff --git a/storage/xtradb/sync/sync0arr.c b/storage/xtradb/sync/sync0arr.c new file mode 100644 index 00000000000..7edbbda5b55 --- /dev/null +++ b/storage/xtradb/sync/sync0arr.c @@ -0,0 +1,1045 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The wait array used in synchronization primitives + +Created 9/5/1995 Heikki Tuuri +*******************************************************/ + +#include "sync0arr.h" +#ifdef UNIV_NONINL +#include "sync0arr.ic" +#endif + +#include "sync0sync.h" +#include "sync0rw.h" +#include "os0sync.h" +#include "os0file.h" +#include "srv0srv.h" + +/* + WAIT ARRAY + ========== + +The wait array consists of cells, each of which has an +operating system event object created for it. The threads +waiting for a mutex, for example, can reserve a cell +in the array and suspend themselves to wait for the event +to become signaled. When using the wait array, remember to make +sure that some thread holding the synchronization object +will eventually know that there is a waiter in the array and +signal the object, to prevent infinite wait. +Why did we choose to implement a wait array? First, to make +mutexes fast, we had to code our own implementation of them, +which only in uncommon cases resorts to using +slow operating system primitives. Then we had the choice of +assigning a unique OS event for each mutex, which would +be simpler, or using a global wait array. In some operating systems, +the global wait array solution is more efficient and flexible, +because we can do with a very small number of OS events, +say 200. In NT 3.51, allocating events seems to be a quadratic +algorithm, because 10 000 events are created fast, but +100 000 events take a couple of minutes to create. + +As of 5.0.30 the above-mentioned design was changed. Since the +OS can now handle millions of wait events efficiently, we no longer +have the concept of each cell of the wait array having one event. +Instead, the event that a thread wants to wait on is now embedded +in the wait object (mutex or rw_lock). We still keep the global +wait array for the sake of diagnostics and also to avoid infinite +wait. The error_monitor thread scans the global wait array to signal +any waiting threads that have missed the signal. 
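To make the protocol just described concrete, here is a minimal sketch of the slow path of a mutex acquisition using this wait array, modeled on how mutex_spin_wait() in sync0sync.c uses it. The wrapper name is hypothetical, mutex_set_waiters() and mutex_test_and_set() are assumed helpers from sync0sync (outside this hunk), and the spin phase that precedes all of this is omitted:

```c
#include "sync0arr.h"
#include "sync0sync.h"

/* Hypothetical sketch: slow path of a mutex acquisition using the
global wait array; modeled on mutex_spin_wait(), illustrative only. */
static void
mutex_slow_path_sketch(mutex_t* mutex)
{
	ulint	index;

	/* Reserve a cell. This resets the event embedded in the mutex
	and records its signal_count, so a signal arriving between this
	point and the wait below cannot be lost. */
	sync_array_reserve_cell(sync_primary_wait_array, mutex,
				SYNC_MUTEX, __FILE__, __LINE__, &index);

	/* Announce the waiter, then try once more in case the mutex
	was released while the cell was being reserved. */
	mutex_set_waiters(mutex, 1);		/* assumed sync0sync helper */

	if (mutex_test_and_set(mutex) == 0) {	/* assumed sync0sync helper */
		sync_array_free_cell(sync_primary_wait_array, index);

		return;	/* acquired after all; no need to sleep */
	}

	/* Suspend until os_event_set() is called on the mutex's event,
	either by the releasing thread or by the error monitor scanning
	this array. sync_array_wait_event() frees the cell on wake-up. */
	sync_array_wait_event(sync_primary_wait_array, index);
}
```

The ordering is the whole trick: capture signal_count inside the reservation, set the waiters flag, re-check, and only then sleep, so a release racing with the reservation still wakes the waiter.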
*/ + +/* A cell where an individual thread may wait suspended +until a resource is released. The suspending is implemented +using an operating system event semaphore. */ +struct sync_cell_struct { + void* wait_object; /* pointer to the object the + thread is waiting for; if NULL + the cell is free for use */ + mutex_t* old_wait_mutex; /* the latest wait mutex in cell */ + rw_lock_t* old_wait_rw_lock;/* the latest wait rw-lock in cell */ + ulint request_type; /* lock type requested on the + object */ + const char* file; /* in debug version file where + requested */ + ulint line; /* in debug version line where + requested */ + os_thread_id_t thread; /* thread id of this waiting + thread */ + ibool waiting; /* TRUE if the thread has already + called sync_array_event_wait + on this cell */ + ib_int64_t signal_count; /* We capture the signal_count + of the wait_object when we + reset the event. This value is + then passed on to os_event_wait + and we wait only if the event + has not been signalled in the + period between the reset and + wait call. */ + time_t reservation_time;/* time when the thread reserved + the wait cell */ +}; + +/* NOTE: It is allowed for a thread to wait +for an event allocated for the array without owning the +protecting mutex (depending on the case: OS or database mutex), but +all changes (set or reset) to the state of the event must be made +while owning the mutex. */ +struct sync_array_struct { + ulint n_reserved; /* number of currently reserved + cells in the wait array */ + ulint n_cells; /* number of cells in the + wait array */ + sync_cell_t* array; /* pointer to wait array */ + ulint protection; /* this flag tells which + mutex protects the data */ + mutex_t mutex; /* possible database mutex + protecting this data structure */ + os_mutex_t os_mutex; /* Possible operating system mutex + protecting the data structure. + As this data structure is used in + constructing the database mutex, + to prevent infinite recursion + in implementation, we fall back to + an OS mutex. */ + ulint sg_count; /* count of how many times an + object has been signalled */ + ulint res_count; /* count of cell reservations + since creation of the array */ +}; + +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +This function is called only in the debug version. Detects a deadlock +of one or more threads because of waits of semaphores. */ +static +ibool +sync_array_detect_deadlock( +/*=======================*/ + /* out: TRUE if deadlock detected */ + sync_array_t* arr, /* in: wait array; NOTE! the caller must + own the mutex to array */ + sync_cell_t* start, /* in: cell where recursive search started */ + sync_cell_t* cell, /* in: cell to search */ + ulint depth); /* in: recursion depth */ +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************* +Gets the nth cell in array. */ +static +sync_cell_t* +sync_array_get_nth_cell( +/*====================*/ + /* out: cell */ + sync_array_t* arr, /* in: sync array */ + ulint n) /* in: index */ +{ + ut_a(arr); + ut_a(n < arr->n_cells); + + return(arr->array + n); +} + +/********************************************************************** +Reserves the mutex semaphore protecting a sync array. 
*/ +static +void +sync_array_enter( +/*=============*/ + sync_array_t* arr) /* in: sync wait array */ +{ + ulint protection; + + protection = arr->protection; + + if (protection == SYNC_ARRAY_OS_MUTEX) { + os_mutex_enter(arr->os_mutex); + } else if (protection == SYNC_ARRAY_MUTEX) { + mutex_enter(&(arr->mutex)); + } else { + ut_error; + } +} + +/********************************************************************** +Releases the mutex semaphore protecting a sync array. */ +static +void +sync_array_exit( +/*============*/ + sync_array_t* arr) /* in: sync wait array */ +{ + ulint protection; + + protection = arr->protection; + + if (protection == SYNC_ARRAY_OS_MUTEX) { + os_mutex_exit(arr->os_mutex); + } else if (protection == SYNC_ARRAY_MUTEX) { + mutex_exit(&(arr->mutex)); + } else { + ut_error; + } +} + +/*********************************************************************** +Creates a synchronization wait array. It is protected by a mutex +which is automatically reserved when the functions operating on it +are called. */ +UNIV_INTERN +sync_array_t* +sync_array_create( +/*==============*/ + /* out, own: created wait array */ + ulint n_cells, /* in: number of cells in the array + to create */ + ulint protection) /* in: either SYNC_ARRAY_OS_MUTEX or + SYNC_ARRAY_MUTEX: determines the type + of mutex protecting the data structure */ +{ + sync_array_t* arr; + sync_cell_t* cell_array; + sync_cell_t* cell; + ulint i; + + ut_a(n_cells > 0); + + /* Allocate memory for the data structures */ + arr = ut_malloc(sizeof(sync_array_t)); + + cell_array = ut_malloc(sizeof(sync_cell_t) * n_cells); + + arr->n_cells = n_cells; + arr->n_reserved = 0; + arr->array = cell_array; + arr->protection = protection; + arr->sg_count = 0; + arr->res_count = 0; + + /* Then create the mutex to protect the wait array complex */ + if (protection == SYNC_ARRAY_OS_MUTEX) { + arr->os_mutex = os_mutex_create(NULL); + } else if (protection == SYNC_ARRAY_MUTEX) { + mutex_create(&arr->mutex, SYNC_NO_ORDER_CHECK); + } else { + ut_error; + } + + for (i = 0; i < n_cells; i++) { + cell = sync_array_get_nth_cell(arr, i); + cell->wait_object = NULL; + cell->waiting = FALSE; + cell->signal_count = 0; + } + + return(arr); +} + +/********************************************************************** +Frees the resources in a wait array. */ +UNIV_INTERN +void +sync_array_free( +/*============*/ + sync_array_t* arr) /* in, own: sync wait array */ +{ + ulint protection; + + ut_a(arr->n_reserved == 0); + + sync_array_validate(arr); + + protection = arr->protection; + + /* Release the mutex protecting the wait array complex */ + + if (protection == SYNC_ARRAY_OS_MUTEX) { + os_mutex_free(arr->os_mutex); + } else if (protection == SYNC_ARRAY_MUTEX) { + mutex_free(&(arr->mutex)); + } else { + ut_error; + } + + ut_free(arr->array); + ut_free(arr); +} + +/************************************************************************ +Validates the integrity of the wait array. Checks +that the number of reserved cells equals the count variable. 
*/ +UNIV_INTERN +void +sync_array_validate( +/*================*/ + sync_array_t* arr) /* in: sync wait array */ +{ + ulint i; + sync_cell_t* cell; + ulint count = 0; + + sync_array_enter(arr); + + for (i = 0; i < arr->n_cells; i++) { + cell = sync_array_get_nth_cell(arr, i); + if (cell->wait_object != NULL) { + count++; + } + } + + ut_a(count == arr->n_reserved); + + sync_array_exit(arr); +} + +/*********************************************************************** +Returns the event that the thread owning the cell waits for. */ +static +os_event_t +sync_cell_get_event( +/*================*/ + sync_cell_t* cell) /* in: non-empty sync array cell */ +{ + ulint type = cell->request_type; + + if (type == SYNC_MUTEX) { + return(((mutex_t *) cell->wait_object)->event); + } else if (type == RW_LOCK_WAIT_EX) { + return(((rw_lock_t *) cell->wait_object)->wait_ex_event); +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + } else if (type == RW_LOCK_SHARED) { + return(((rw_lock_t *) cell->wait_object)->s_event); + } else { /* RW_LOCK_EX */ + return(((rw_lock_t *) cell->wait_object)->x_event); +#else + } else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */ + return(((rw_lock_t *) cell->wait_object)->event); +#endif + } +} + +/********************************************************************** +Reserves a wait array cell for waiting for an object. +The event of the cell is reset to nonsignalled state. */ +UNIV_INTERN +void +sync_array_reserve_cell( +/*====================*/ + sync_array_t* arr, /* in: wait array */ + void* object, /* in: pointer to the object to wait for */ + ulint type, /* in: lock request type */ + const char* file, /* in: file where requested */ + ulint line, /* in: line where requested */ + ulint* index) /* out: index of the reserved cell */ +{ + sync_cell_t* cell; + os_event_t event; + ulint i; + + ut_a(object); + ut_a(index); + + sync_array_enter(arr); + + arr->res_count++; + + /* Reserve a new cell. */ + for (i = 0; i < arr->n_cells; i++) { + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object == NULL) { + + cell->waiting = FALSE; + cell->wait_object = object; + + if (type == SYNC_MUTEX) { + cell->old_wait_mutex = object; + } else { + cell->old_wait_rw_lock = object; + } + + cell->request_type = type; + + cell->file = file; + cell->line = line; + + arr->n_reserved++; + + *index = i; + + sync_array_exit(arr); + + /* Make sure the event is reset and also store + the value of signal_count at which the event + was reset. */ + event = sync_cell_get_event(cell); + cell->signal_count = os_event_reset(event); + + cell->reservation_time = time(NULL); + + cell->thread = os_thread_get_curr_id(); + + return; + } + } + + ut_error; /* No free cell found */ + + return; +} + +/********************************************************************** +This function should be called when a thread starts to wait on +a wait array cell. In the debug version this function checks +if the wait for a semaphore will result in a deadlock, in which +case prints info and asserts. 
*/ +UNIV_INTERN +void +sync_array_wait_event( +/*==================*/ + sync_array_t* arr, /* in: wait array */ + ulint index) /* in: index of the reserved cell */ +{ + sync_cell_t* cell; + os_event_t event; + + ut_a(arr); + + sync_array_enter(arr); + + cell = sync_array_get_nth_cell(arr, index); + + ut_a(cell->wait_object); + ut_a(!cell->waiting); + ut_ad(os_thread_get_curr_id() == cell->thread); + + event = sync_cell_get_event(cell); + cell->waiting = TRUE; + +#ifdef UNIV_SYNC_DEBUG + + /* We use simple enter to the mutex below, because if + we cannot acquire it at once, mutex_enter would call + recursively sync_array routines, leading to trouble. + rw_lock_debug_mutex freezes the debug lists. */ + + rw_lock_debug_mutex_enter(); + + if (TRUE == sync_array_detect_deadlock(arr, cell, cell, 0)) { + + fputs("########################################\n", stderr); + ut_error; + } + + rw_lock_debug_mutex_exit(); +#endif + sync_array_exit(arr); + + os_event_wait_low(event, cell->signal_count); + + sync_array_free_cell(arr, index); +} + +/********************************************************************** +Reports info of a wait array cell. */ +static +void +sync_array_cell_print( +/*==================*/ + FILE* file, /* in: file where to print */ + sync_cell_t* cell) /* in: sync cell */ +{ + mutex_t* mutex; + rw_lock_t* rwlock; + ulint type; + ulint writer; + + type = cell->request_type; + + fprintf(file, + "--Thread %lu has waited at %s line %lu" + " for %.2f seconds the semaphore:\n", + (ulong) os_thread_pf(cell->thread), cell->file, + (ulong) cell->line, + difftime(time(NULL), cell->reservation_time)); + + if (type == SYNC_MUTEX) { + /* We use old_wait_mutex in case the cell has already + been freed meanwhile */ + mutex = cell->old_wait_mutex; + + fprintf(file, + "Mutex at %p created file %s line %lu, lock var %lu\n" +#ifdef UNIV_SYNC_DEBUG + "Last time reserved in file %s line %lu, " +#endif /* UNIV_SYNC_DEBUG */ + "waiters flag %lu\n", + (void*) mutex, mutex->cfile_name, (ulong) mutex->cline, + (ulong) mutex->lock_word, +#ifdef UNIV_SYNC_DEBUG + mutex->file_name, (ulong) mutex->line, +#endif /* UNIV_SYNC_DEBUG */ + (ulong) mutex->waiters); + + } else if (type == RW_LOCK_EX + || type == RW_LOCK_WAIT_EX + || type == RW_LOCK_SHARED) { + + fputs(type == RW_LOCK_SHARED ? "S-lock on" : "X-lock on", file); + + rwlock = cell->old_wait_rw_lock; + + fprintf(file, + " RW-latch at %p created in file %s line %lu\n", + (void*) rwlock, rwlock->cfile_name, + (ulong) rwlock->cline); + writer = rw_lock_get_writer(rwlock); + if (writer != RW_LOCK_NOT_LOCKED) { + fprintf(file, + "a writer (thread id %lu) has" + " reserved it in mode %s", + (ulong) os_thread_pf(rwlock->writer_thread), + writer == RW_LOCK_EX + ? 
" exclusive\n" + : " wait exclusive\n"); + } + + fprintf(file, +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + "number of readers %lu, s_waiters flag %lu, x_waiters flag %lu, " +#else + "number of readers %lu, waiters flag %lu, " +#endif + "lock_word: %lx\n" + "Last time read locked in file %s line %lu\n" + "Last time write locked in file %s line %lu\n", + (ulong) rw_lock_get_reader_count(rwlock), +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + (ulong) rwlock->s_waiters, + (ulong) (rwlock->x_waiters || rwlock->wait_ex_waiters), +#else + (ulong) rwlock->waiters, +#endif + rwlock->lock_word, + rwlock->last_s_file_name, + (ulong) rwlock->last_s_line, + rwlock->last_x_file_name, + (ulong) rwlock->last_x_line); + } else { + ut_error; + } + + if (!cell->waiting) { + fputs("wait has ended\n", file); + } +} + +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Looks for a cell with the given thread id. */ +static +sync_cell_t* +sync_array_find_thread( +/*===================*/ + /* out: pointer to cell or NULL + if not found */ + sync_array_t* arr, /* in: wait array */ + os_thread_id_t thread) /* in: thread id */ +{ + ulint i; + sync_cell_t* cell; + + for (i = 0; i < arr->n_cells; i++) { + + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object != NULL + && os_thread_eq(cell->thread, thread)) { + + return(cell); /* Found */ + } + } + + return(NULL); /* Not found */ +} + +/********************************************************************** +Recursion step for deadlock detection. */ +static +ibool +sync_array_deadlock_step( +/*=====================*/ + /* out: TRUE if deadlock detected */ + sync_array_t* arr, /* in: wait array; NOTE! the caller must + own the mutex to array */ + sync_cell_t* start, /* in: cell where recursive search + started */ + os_thread_id_t thread, /* in: thread to look at */ + ulint pass, /* in: pass value */ + ulint depth) /* in: recursion depth */ +{ + sync_cell_t* new; + ibool ret; + + depth++; + + if (pass != 0) { + /* If pass != 0, then we do not know which threads are + responsible of releasing the lock, and no deadlock can + be detected. */ + + return(FALSE); + } + + new = sync_array_find_thread(arr, thread); + + if (new == start) { + /* Stop running of other threads */ + + ut_dbg_stop_threads = TRUE; + + /* Deadlock */ + fputs("########################################\n" + "DEADLOCK of threads detected!\n", stderr); + + return(TRUE); + + } else if (new) { + ret = sync_array_detect_deadlock(arr, start, new, depth); + + if (ret) { + return(TRUE); + } + } + return(FALSE); +} + +/********************************************************************** +This function is called only in the debug version. Detects a deadlock +of one or more threads because of waits of semaphores. */ +static +ibool +sync_array_detect_deadlock( +/*=======================*/ + /* out: TRUE if deadlock detected */ + sync_array_t* arr, /* in: wait array; NOTE! 
the caller must + own the mutex to array */ + sync_cell_t* start, /* in: cell where recursive search started */ + sync_cell_t* cell, /* in: cell to search */ + ulint depth) /* in: recursion depth */ +{ + mutex_t* mutex; + rw_lock_t* lock; + os_thread_id_t thread; + ibool ret; + rw_lock_debug_t*debug; + + ut_a(arr); + ut_a(start); + ut_a(cell); + ut_ad(cell->wait_object); + ut_ad(os_thread_get_curr_id() == start->thread); + ut_ad(depth < 100); + + depth++; + + if (!cell->waiting) { + + return(FALSE); /* No deadlock here */ + } + + if (cell->request_type == SYNC_MUTEX) { + + mutex = cell->wait_object; + + if (mutex_get_lock_word(mutex) != 0) { + + thread = mutex->thread_id; + + /* Note that mutex->thread_id above may be + also OS_THREAD_ID_UNDEFINED, because the + thread which held the mutex maybe has not + yet updated the value, or it has already + released the mutex: in this case no deadlock + can occur, as the wait array cannot contain + a thread with ID_UNDEFINED value. */ + + ret = sync_array_deadlock_step(arr, start, thread, 0, + depth); + if (ret) { + fprintf(stderr, + "Mutex %p owned by thread %lu file %s line %lu\n", + mutex, (ulong) os_thread_pf(mutex->thread_id), + mutex->file_name, (ulong) mutex->line); + sync_array_cell_print(stderr, cell); + + return(TRUE); + } + } + + return(FALSE); /* No deadlock */ + + } else if (cell->request_type == RW_LOCK_EX + || cell->request_type == RW_LOCK_WAIT_EX) { + + lock = cell->wait_object; + + debug = UT_LIST_GET_FIRST(lock->debug_list); + + while (debug != NULL) { + + thread = debug->thread_id; + + if (((debug->lock_type == RW_LOCK_EX) + && !os_thread_eq(thread, cell->thread)) + || ((debug->lock_type == RW_LOCK_WAIT_EX) + && !os_thread_eq(thread, cell->thread)) + || (debug->lock_type == RW_LOCK_SHARED)) { + + /* The (wait) x-lock request can block + infinitely only if someone (can be also cell + thread) is holding s-lock, or someone + (cannot be cell thread) (wait) x-lock, and + he is blocked by start thread */ + + ret = sync_array_deadlock_step( + arr, start, thread, debug->pass, + depth); + if (ret) { +print: + fprintf(stderr, "rw-lock %p ", + (void*) lock); + sync_array_cell_print(stderr, cell); + rw_lock_debug_print(debug); + return(TRUE); + } + } + + debug = UT_LIST_GET_NEXT(list, debug); + } + + return(FALSE); + + } else if (cell->request_type == RW_LOCK_SHARED) { + + lock = cell->wait_object; + debug = UT_LIST_GET_FIRST(lock->debug_list); + + while (debug != NULL) { + + thread = debug->thread_id; + + if ((debug->lock_type == RW_LOCK_EX) + || (debug->lock_type == RW_LOCK_WAIT_EX)) { + + /* The s-lock request can block infinitely + only if someone (can also be cell thread) is + holding (wait) x-lock, and he is blocked by + start thread */ + + ret = sync_array_deadlock_step( + arr, start, thread, debug->pass, + depth); + if (ret) { + goto print; + } + } + + debug = UT_LIST_GET_NEXT(list, debug); + } + + return(FALSE); + + } else { + ut_error; + } + + return(TRUE); /* Execution never reaches this line: for compiler + fooling only */ +} +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************** +Determines if we can wake up the thread waiting for a sempahore. 
*/ +static +ibool +sync_arr_cell_can_wake_up( +/*======================*/ + sync_cell_t* cell) /* in: cell to search */ +{ + mutex_t* mutex; + rw_lock_t* lock; + + if (cell->request_type == SYNC_MUTEX) { + + mutex = cell->wait_object; + + if (mutex_get_lock_word(mutex) == 0) { + + return(TRUE); + } + + } else if (cell->request_type == RW_LOCK_EX) { + + lock = cell->wait_object; + + if (lock->lock_word > 0) { + /* Either unlocked or only read locked. */ + + return(TRUE); + } + + } else if (cell->request_type == RW_LOCK_WAIT_EX) { + + lock = cell->wait_object; + + /* lock_word == 0 means all readers have left */ + if (lock->lock_word == 0) { + + return(TRUE); + } + } else if (cell->request_type == RW_LOCK_SHARED) { + lock = cell->wait_object; + + /* lock_word > 0 means no writer or reserved writer */ + if (lock->lock_word > 0) { + + return(TRUE); + } + } + + return(FALSE); +} + +/********************************************************************** +Frees the cell. NOTE! sync_array_wait_event frees the cell +automatically! */ +UNIV_INTERN +void +sync_array_free_cell( +/*=================*/ + sync_array_t* arr, /* in: wait array */ + ulint index) /* in: index of the cell in array */ +{ + sync_cell_t* cell; + + sync_array_enter(arr); + + cell = sync_array_get_nth_cell(arr, index); + + ut_a(cell->wait_object != NULL); + + cell->waiting = FALSE; + cell->wait_object = NULL; + cell->signal_count = 0; + + ut_a(arr->n_reserved > 0); + arr->n_reserved--; + + sync_array_exit(arr); +} + +/************************************************************************** +Increments the signalled count. */ +UNIV_INTERN +void +sync_array_object_signalled( +/*========================*/ + sync_array_t* arr) /* in: wait array */ +{ +#ifdef HAVE_GCC_ATOMIC_BUILTINS + (void) os_atomic_increment(&arr->sg_count, 1); +#else + sync_array_enter(arr); + + arr->sg_count++; + + sync_array_exit(arr); +#endif +} + +/************************************************************************** +If the wakeup algorithm does not work perfectly at semaphore relases, +this function will do the waking (see the comment in mutex_exit). This +function should be called about every 1 second in the server. + +Note that there's a race condition between this thread and mutex_exit +changing the lock_word and calling signal_object, so sometimes this finds +threads to wake up even when nothing has gone wrong. */ +UNIV_INTERN +void +sync_arr_wake_threads_if_sema_free(void) +/*====================================*/ +{ + sync_array_t* arr = sync_primary_wait_array; + sync_cell_t* cell; + ulint count; + ulint i; + os_event_t event; + + sync_array_enter(arr); + + i = 0; + count = 0; + + while (count < arr->n_reserved) { + + cell = sync_array_get_nth_cell(arr, i); + i++; + + if (cell->wait_object == NULL) { + continue; + } + count++; + + if (sync_arr_cell_can_wake_up(cell)) { + + event = sync_cell_get_event(cell); + + os_event_set(event); + } + + } + + sync_array_exit(arr); +} + +/************************************************************************** +Prints warnings of long semaphore waits to stderr. 
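The comment above asks for this to run about once per second; here is a minimal sketch of such a monitor loop, pairing the function with sync_array_print_long_waits() defined just below. The real caller is presumably srv_error_monitor_thread in srv0srv.c, which additionally requires the fatal threshold to be exceeded on consecutive rounds before asserting, so treat this loop as illustrative:

```c
#include "sync0arr.h"
#include "os0thread.h"

/* Hypothetical sketch of the error-monitor loop; illustrative only. */
static void
sync_monitor_loop_sketch(void)
{
	for (;;) {
		/* Wake any waiter whose semaphore was in fact released
		but whose wake-up signal was missed (the race described
		in the comment above). */
		sync_arr_wake_threads_if_sema_free();

		/* Warn about long waits; TRUE means the fatal semaphore
		wait threshold was exceeded. */
		if (sync_array_print_long_waits()) {

			ut_error;
		}

		os_thread_sleep(1000000);	/* about one second */
	}
}
```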
*/ +UNIV_INTERN +ibool +sync_array_print_long_waits(void) +/*=============================*/ + /* out: TRUE if fatal semaphore wait threshold + was exceeded */ +{ + sync_cell_t* cell; + ibool old_val; + ibool noticed = FALSE; + ulint i; + ulint fatal_timeout = srv_fatal_semaphore_wait_threshold; + ibool fatal = FALSE; + + for (i = 0; i < sync_primary_wait_array->n_cells; i++) { + + cell = sync_array_get_nth_cell(sync_primary_wait_array, i); + + if (cell->wait_object != NULL && cell->waiting + && difftime(time(NULL), cell->reservation_time) > 240) { + fputs("InnoDB: Warning: a long semaphore wait:\n", + stderr); + sync_array_cell_print(stderr, cell); + noticed = TRUE; + } + + if (cell->wait_object != NULL && cell->waiting + && difftime(time(NULL), cell->reservation_time) + > fatal_timeout) { + fatal = TRUE; + } + } + + if (noticed) { + fprintf(stderr, + "InnoDB: ###### Starts InnoDB Monitor" + " for 30 secs to print diagnostic info:\n"); + old_val = srv_print_innodb_monitor; + + /* If some crucial semaphore is reserved, then also the InnoDB + Monitor can hang, and we do not get diagnostics. Since in + many cases an InnoDB hang is caused by a pwrite() or a pread() + call hanging inside the operating system, let us print right + now the values of pending calls of these. */ + + fprintf(stderr, + "InnoDB: Pending preads %lu, pwrites %lu\n", + (ulong)os_file_n_pending_preads, + (ulong)os_file_n_pending_pwrites); + + srv_print_innodb_monitor = TRUE; + os_event_set(srv_lock_timeout_thread_event); + + os_thread_sleep(30000000); + + srv_print_innodb_monitor = old_val; + fprintf(stderr, + "InnoDB: ###### Diagnostic info printed" + " to the standard error stream\n"); + } + + return(fatal); +} + +/************************************************************************** +Prints info of the wait array. */ +static +void +sync_array_output_info( +/*===================*/ + FILE* file, /* in: file where to print */ + sync_array_t* arr) /* in: wait array; NOTE! caller must own the + mutex */ +{ + sync_cell_t* cell; + ulint count; + ulint i; + + fprintf(file, + "OS WAIT ARRAY INFO: reservation count %ld, signal count %ld\n", + (long) arr->res_count, (long) arr->sg_count); + i = 0; + count = 0; + + while (count < arr->n_reserved) { + + cell = sync_array_get_nth_cell(arr, i); + + if (cell->wait_object != NULL) { + count++; + sync_array_cell_print(file, cell); + } + + i++; + } +} + +/************************************************************************** +Prints info of the wait array. */ +UNIV_INTERN +void +sync_array_print_info( +/*==================*/ + FILE* file, /* in: file where to print */ + sync_array_t* arr) /* in: wait array */ +{ + sync_array_enter(arr); + + sync_array_output_info(file, arr); + + sync_array_exit(arr); +} diff --git a/storage/xtradb/sync/sync0rw.c b/storage/xtradb/sync/sync0rw.c new file mode 100644 index 00000000000..556e46a2ca1 --- /dev/null +++ b/storage/xtradb/sync/sync0rw.c @@ -0,0 +1,1285 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., 59 Temple +Place, Suite 330, Boston, MA 02111-1307 USA + +*****************************************************************************/ + +/****************************************************** +The read-write lock (for thread synchronization) + +Created 9/11/1995 Heikki Tuuri +*******************************************************/ + +#include "sync0rw.h" +#ifdef UNIV_NONINL +#include "sync0rw.ic" +#endif + +#include "os0thread.h" +#include "mem0mem.h" +#include "srv0srv.h" + +/* + IMPLEMENTATION OF THE RW_LOCK + ============================= +The status of a rw_lock is held in lock_word. The initial value of lock_word is +X_LOCK_DECR. lock_word is decremented by 1 for each s-lock and by X_LOCK_DECR +for each x-lock. This describes the lock state for each value of lock_word: + +lock_word == X_LOCK_DECR: Unlocked. +0 < lock_word < X_LOCK_DECR: Read locked, no waiting writers. + (X_LOCK_DECR - lock_word) is the + number of readers that hold the lock. +lock_word == 0: Write locked +-X_LOCK_DECR < lock_word < 0: Read locked, with a waiting writer. + (-lock_word) is the number of readers + that hold the lock. +lock_word <= -X_LOCK_DECR: Recursively write locked. lock_word has been + decremented by X_LOCK_DECR once for each lock, + so the number of locks is: + ((-lock_word) / X_LOCK_DECR) + 1 +When lock_word <= -X_LOCK_DECR, we also know that lock_word % X_LOCK_DECR == 0: +other values of lock_word are invalid. + +The lock_word is always read and updated atomically and consistently, so that +it always represents the state of the lock, and the state of the lock changes +with a single atomic operation. This lock_word holds all of the information +that a thread needs in order to determine if it is eligible to gain the lock +or if it must spin or sleep. The one exception to this is that writer_thread +must be verified before recursive write locks: to solve this scenario, we make +writer_thread readable by all threads, but only writeable by the x-lock holder. + +The other members of the lock obey the following rules to remain consistent: + +recursive: This and the writer_thread field together control the + behaviour of recursive x-locking. + lock->recursive must be FALSE in following states: + 1) The writer_thread contains garbage i.e.: the + lock has just been initialized. + 2) The lock is not x-held and there is no + x-waiter waiting on WAIT_EX event. + 3) The lock is x-held or there is an x-waiter + waiting on WAIT_EX event but the 'pass' value + is non-zero. + lock->recursive is TRUE iff: + 1) The lock is x-held or there is an x-waiter + waiting on WAIT_EX event and the 'pass' value + is zero. + This flag must be set after the writer_thread field + has been updated with a memory ordering barrier. + It is unset before the lock_word has been incremented. +writer_thread: Is used only in recursive x-locking. Can only be safely + read iff lock->recursive flag is TRUE. 
+ This field is uninitialized at lock creation time and + is updated atomically when x-lock is acquired or when + move_ownership is called. A thread is only allowed to + set the value of this field to its own thread_id, i.e. a + thread cannot set writer_thread to some other thread's + id. +waiters: May be set to 1 anytime, but to avoid unnecessary wake-up + signals, it should only be set to 1 when there are threads + waiting on event. Must be 1 when a writer starts waiting to + ensure the current x-locking thread sends a wake-up signal + during unlock. May only be reset to 0 immediately before + a wake-up signal is sent to event. On most platforms, a + memory barrier is required after waiters is set, and before + verifying lock_word is still held, to ensure some unlocker + really does see the flag's new value. +event: Threads wait on event for a read or a write lock when another + thread has an x-lock or an x-lock reservation (wait_ex). A + thread may only wait on event after performing the following + actions in order: + (1) Record the counter value of event (with os_event_reset). + (2) Set waiters to 1. + (3) Verify lock_word <= 0. + (1) must come before (2) to ensure a signal is not missed. + (2) must come before (3) to ensure a signal is sent. + These restrictions force the above ordering. + Immediately before sending the wake-up signal, we should: + (1) Verify lock_word == X_LOCK_DECR (unlocked) + (2) Reset waiters to 0. +wait_ex_event: A thread may only wait on the wait_ex_event after it has + performed the following actions in order: + (1) Decrement lock_word by X_LOCK_DECR. + (2) Record the counter value of wait_ex_event (os_event_reset, + called from sync_array_reserve_cell). + (3) Verify that lock_word < 0. + (1) must come first to ensure that no other threads become readers + or the next writer, and to notify the unlocker that a signal must be sent. + (2) must come before (3) to ensure the signal is not missed. + These restrictions force the above ordering. + Immediately before sending the wake-up signal, we should: + Verify lock_word == 0 (waiting thread holds x_lock) +*/ + + +/* number of spin waits on rw-latches, +incurred during shared (read) locks */ +UNIV_INTERN ib_int64_t rw_s_spin_wait_count = 0; +UNIV_INTERN ib_int64_t rw_s_spin_round_count = 0; + +/* number of OS waits on rw-latches, +incurred during shared (read) locks */ +UNIV_INTERN ib_int64_t rw_s_os_wait_count = 0; + +/* number of unlocks (that unlock shared locks), +set only when UNIV_SYNC_PERF_STAT is defined */ +UNIV_INTERN ib_int64_t rw_s_exit_count = 0; + +/* number of spin waits on rw-latches, +incurred during exclusive (write) locks */ +UNIV_INTERN ib_int64_t rw_x_spin_wait_count = 0; +UNIV_INTERN ib_int64_t rw_x_spin_round_count = 0; + +/* number of OS waits on rw-latches, +incurred during exclusive (write) locks */ +UNIV_INTERN ib_int64_t rw_x_os_wait_count = 0; + +/* number of unlocks (that unlock exclusive locks), +set only when UNIV_SYNC_PERF_STAT is defined */ +UNIV_INTERN ib_int64_t rw_x_exit_count = 0; + +/* The global list of rw-locks */ +UNIV_INTERN rw_lock_list_t rw_lock_list; +UNIV_INTERN mutex_t rw_lock_list_mutex; + +#ifdef UNIV_SYNC_DEBUG +/* The global mutex which protects debug info lists of all rw-locks. +To modify the debug info list of an rw-lock, this mutex has to be +acquired in addition to the mutex protecting the lock. 
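To make the lock_word encoding documented above concrete, here is a small illustrative decoder that classifies a lock_word value into the five documented states. The enum and function are hypothetical, not part of the commit; the ranges and the recursion count come directly from the comment:

```c
#include "sync0rw.h"

/* Hypothetical helper: classify a lock_word value into the states
enumerated in the rw-lock comment above; illustrative only. */
typedef enum {
	RW_STATE_UNLOCKED,
	RW_STATE_READ_LOCKED,
	RW_STATE_WRITE_LOCKED,
	RW_STATE_READ_LOCKED_WAIT_EX,
	RW_STATE_WRITE_LOCKED_RECURSIVE
} rw_state_sketch_t;

static rw_state_sketch_t
rw_lock_word_state_sketch(lint lock_word)
{
	if (lock_word == X_LOCK_DECR) {
		return(RW_STATE_UNLOCKED);
	} else if (lock_word > 0) {
		/* (X_LOCK_DECR - lock_word) readers hold the lock;
		no writer is waiting */
		return(RW_STATE_READ_LOCKED);
	} else if (lock_word == 0) {
		return(RW_STATE_WRITE_LOCKED);
	} else if (lock_word > -X_LOCK_DECR) {
		/* (-lock_word) readers hold the lock and a writer
		is waiting on wait_ex_event */
		return(RW_STATE_READ_LOCKED_WAIT_EX);
	}
	/* lock_word <= -X_LOCK_DECR: x-locked recursively,
	((-lock_word) / X_LOCK_DECR) + 1 times; per the comment,
	lock_word % X_LOCK_DECR == 0 must hold here */
	return(RW_STATE_WRITE_LOCKED_RECURSIVE);
}
```

Packing all of this into one word is what lets a thread decide, from a single atomic read plus one atomic update, whether it may take the lock, must spin, or must sleep.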
*/ + +UNIV_INTERN mutex_t rw_lock_debug_mutex; +/* If deadlock detection does not get immediately the mutex, +it may wait for this event */ +UNIV_INTERN os_event_t rw_lock_debug_event; +/* This is set to TRUE, if there may be waiters for the event */ +UNIV_INTERN ibool rw_lock_debug_waiters; + +/********************************************************************** +Creates a debug info struct. */ +static +rw_lock_debug_t* +rw_lock_debug_create(void); +/*======================*/ +/********************************************************************** +Frees a debug info struct. */ +static +void +rw_lock_debug_free( +/*===============*/ + rw_lock_debug_t* info); + +/********************************************************************** +Creates a debug info struct. */ +static +rw_lock_debug_t* +rw_lock_debug_create(void) +/*======================*/ +{ + return((rw_lock_debug_t*) mem_alloc(sizeof(rw_lock_debug_t))); +} + +/********************************************************************** +Frees a debug info struct. */ +static +void +rw_lock_debug_free( +/*===============*/ + rw_lock_debug_t* info) +{ + mem_free(info); +} +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************** +Creates, or rather, initializes an rw-lock object in a specified memory +location (which must be appropriately aligned). The rw-lock is initialized +to the non-locked state. Explicit freeing of the rw-lock with rw_lock_free +is necessary only if the memory block containing it is freed. */ +UNIV_INTERN +void +rw_lock_create_func( +/*================*/ + rw_lock_t* lock, /* in: pointer to memory */ +#ifdef UNIV_DEBUG +# ifdef UNIV_SYNC_DEBUG + ulint level, /* in: level */ +# endif /* UNIV_SYNC_DEBUG */ + const char* cmutex_name, /* in: mutex name */ +#endif /* UNIV_DEBUG */ + const char* cfile_name, /* in: file name where created */ + ulint cline) /* in: file line where created */ +{ + /* If this is the very first time a synchronization object is + created, then the following call initializes the sync system. */ + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_create(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK); + + lock->mutex.cfile_name = cfile_name; + lock->mutex.cline = cline; + +# if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + lock->mutex.cmutex_name = cmutex_name; + lock->mutex.mutex_type = 1; +# endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + +#else /* INNODB_RW_LOCKS_USE_ATOMICS */ +# ifdef UNIV_DEBUG + UT_NOT_USED(cmutex_name); +# endif +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ + + lock->lock_word = X_LOCK_DECR; +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + lock->s_waiters = 0; + lock->x_waiters = 0; + lock->wait_ex_waiters = 0; + lock->writer = RW_LOCK_NOT_LOCKED; + lock->writer_count = 0; + lock->reader_count = 0; + lock->writer_is_wait_ex = FALSE; +#else + lock->waiters = 0; +#endif + + /* We set this value to signify that lock->writer_thread + contains garbage at initialization and cannot be used for + recursive x-locking. 
*/ + lock->recursive = FALSE; + +#ifdef UNIV_SYNC_DEBUG + UT_LIST_INIT(lock->debug_list); + + lock->level = level; +#endif /* UNIV_SYNC_DEBUG */ + + lock->magic_n = RW_LOCK_MAGIC_N; + + lock->cfile_name = cfile_name; + lock->cline = (unsigned int) cline; + + lock->count_os_wait = 0; + lock->last_s_file_name = "not yet reserved"; + lock->last_x_file_name = "not yet reserved"; + lock->last_s_line = 0; + lock->last_x_line = 0; +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + lock->s_event = os_event_create(NULL); + lock->x_event = os_event_create(NULL); +#else + lock->event = os_event_create(NULL); +#endif + lock->wait_ex_event = os_event_create(NULL); + + mutex_enter(&rw_lock_list_mutex); + + if (UT_LIST_GET_LEN(rw_lock_list) > 0) { + ut_a(UT_LIST_GET_FIRST(rw_lock_list)->magic_n + == RW_LOCK_MAGIC_N); + } + + UT_LIST_ADD_FIRST(list, rw_lock_list, lock); + + mutex_exit(&rw_lock_list_mutex); +} + +/********************************************************************** +Calling this function is obligatory only if the memory buffer containing +the rw-lock is freed. Removes an rw-lock object from the global list. The +rw-lock is checked to be in the non-locked state. */ +UNIV_INTERN +void +rw_lock_free( +/*=========*/ + rw_lock_t* lock) /* in: rw-lock */ +{ + ut_ad(rw_lock_validate(lock)); +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); + ut_a(rw_lock_get_s_waiters(lock) == 0); + ut_a(rw_lock_get_x_waiters(lock) == 0); + ut_a(rw_lock_get_wx_waiters(lock) == 0); + ut_a(rw_lock_get_reader_count(lock) == 0); +#else + ut_a(lock->lock_word == X_LOCK_DECR); +#endif + + lock->magic_n = 0; + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_free(rw_lock_get_mutex(lock)); +#endif /* INNODB_RW_LOCKS_USE_ATOMICS */ + + mutex_enter(&rw_lock_list_mutex); +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + os_event_free(lock->s_event); + os_event_free(lock->x_event); +#else + os_event_free(lock->event); +#endif + + os_event_free(lock->wait_ex_event); + + if (UT_LIST_GET_PREV(list, lock)) { + ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); + } + if (UT_LIST_GET_NEXT(list, lock)) { + ut_a(UT_LIST_GET_NEXT(list, lock)->magic_n == RW_LOCK_MAGIC_N); + } + + UT_LIST_REMOVE(list, rw_lock_list, lock); + + mutex_exit(&rw_lock_list_mutex); +} + +#ifdef UNIV_DEBUG +/********************************************************************** +Checks that the rw-lock has been initialized and that there are no +simultaneous shared and exclusive locks. */ +UNIV_INTERN +ibool +rw_lock_validate( +/*=============*/ + rw_lock_t* lock) +{ + ut_a(lock); + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + ut_a(lock->magic_n == RW_LOCK_MAGIC_N); + + ulint waiters = rw_lock_get_s_waiters(lock); + ut_a(waiters == 0 || waiters == 1); + waiters = rw_lock_get_x_waiters(lock); + ut_a(waiters == 0 || waiters == 1); + waiters = rw_lock_get_wx_waiters(lock); + ut_a(waiters == 0 || waiters == 1); +#else + ulint waiters = rw_lock_get_waiters(lock); + lint lock_word = lock->lock_word; + + ut_a(lock->magic_n == RW_LOCK_MAGIC_N); + ut_a(waiters == 0 || waiters == 1); + ut_a(lock_word > -X_LOCK_DECR ||(-lock_word) % X_LOCK_DECR == 0); +#endif + + return(TRUE); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************** +Lock an rw-lock in shared mode for the current thread. If the rw-lock is +locked in exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting +for the lock, before suspending the thread. 
*/ +UNIV_INTERN +void +rw_lock_s_lock_spin( +/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass, /* in: pass value; != 0, if the lock + will be passed to another thread to unlock */ + const char* file_name, /* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + ulint index; /* index of the reserved wait cell */ + ulint i = 0; /* spin round count */ + + ut_ad(rw_lock_validate(lock)); + + rw_s_spin_wait_count++; /* Count calls to this function */ +lock_loop: + + /* Spin waiting for the writer field to become free */ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + while (i < SYNC_SPIN_ROUNDS + && rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) { +#else + while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) { +#endif + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } + + i++; + } + + if (i == SYNC_SPIN_ROUNDS) { + os_thread_yield(); + } + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread %lu spin wait rw-s-lock at %p" + " cfile %s cline %lu rnds %lu\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), + (void*) lock, + lock->cfile_name, (ulong) lock->cline, (ulong) i); + } + + /* We try once again to obtain the lock */ + if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { + rw_s_spin_round_count += i; + + return; /* Success */ + } else { + + if (i < SYNC_SPIN_ROUNDS) { + goto lock_loop; + } + + rw_s_spin_round_count += i; + + sync_array_reserve_cell(sync_primary_wait_array, + lock, RW_LOCK_SHARED, + file_name, line, + &index); + + /* Set waiters before checking lock_word to ensure wake-up + signal is sent. This may lead to some unnecessary signals. */ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + rw_lock_set_s_waiter_flag(lock); +#else + rw_lock_set_waiter_flag(lock); +#endif + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + for (i = 0; i < 4; i++) { +#endif + if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) { + sync_array_free_cell(sync_primary_wait_array, index); + return; /* Success */ + } +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + } + + /* If wait_ex_waiter stalls, wakes it. */ + if (lock->reader_count == 0 + && __sync_lock_test_and_set(&lock->wait_ex_waiters, 0)) { + os_event_set(lock->wait_ex_event); + sync_array_object_signalled(sync_primary_wait_array); + } +#endif + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread %lu OS wait rw-s-lock at %p" + " cfile %s cline %lu\n", + os_thread_pf(os_thread_get_curr_id()), + (void*) lock, lock->cfile_name, + (ulong) lock->cline); + } + + /* these stats may not be accurate */ + lock->count_os_wait++; + rw_s_os_wait_count++; + + sync_array_wait_event(sync_primary_wait_array, index); + + i = 0; + goto lock_loop; + } +} + +/********************************************************************** +This function is used in the insert buffer to move the ownership of an +x-latch on a buffer frame to the current thread. The x-latch was set by +the buffer read operation and it protected the buffer frame while the +read was done. The ownership is moved because we want that the current +thread is able to acquire a second x-latch which is stored in an mtr. +This, in turn, is needed to pass the debug checks of index page +operations. 
*/ +UNIV_INTERN +void +rw_lock_x_lock_move_ownership( +/*==========================*/ + rw_lock_t* lock) /* in: lock which was x-locked in the + buffer read */ +{ + ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX)); + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + lock->writer_thread = os_thread_get_curr_id(); + lock->recursive = TRUE; +#else + rw_lock_set_writer_id_and_recursion_flag(lock, TRUE); +#endif +} + +/********************************************************************** +Function for the next writer to call. Waits for readers to exit. +The caller must have already decremented lock_word by X_LOCK_DECR.*/ +UNIV_INLINE +void +rw_lock_x_lock_wait( +/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ +#ifdef UNIV_SYNC_DEBUG + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ +#endif + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + ulint index; + ulint i = 0; + + ut_ad(lock->lock_word <= 0); + + while (lock->lock_word < 0) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } + if(i < SYNC_SPIN_ROUNDS) { + i++; + continue; + } + + /* If there is still a reader, then go to sleep.*/ + rw_x_spin_round_count += i; + i = 0; + sync_array_reserve_cell(sync_primary_wait_array, + lock, + RW_LOCK_WAIT_EX, + file_name, line, + &index); + /* Check lock_word to ensure wake-up isn't missed.*/ + if(lock->lock_word < 0) { + + /* these stats may not be accurate */ + lock->count_os_wait++; + rw_x_os_wait_count++; + + /* Add debug info as it is needed to detect possible + deadlock. We must add info for WAIT_EX thread for + deadlock detection to work properly. */ +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, + file_name, line); +#endif + + sync_array_wait_event(sync_primary_wait_array, + index); +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, + RW_LOCK_WAIT_EX); +#endif + /* It is possible to wake when lock_word < 0. + We must pass the while-loop check to proceed.*/ + } else { + sync_array_free_cell(sync_primary_wait_array, + index); + } + } + rw_x_spin_round_count += i; +} + +/********************************************************************** +Low-level function for acquiring an exclusive lock. */ +UNIV_INLINE +#ifdef INNODB_RW_LOCKS_USE_ATOMICS +ulint +#else +ibool +#endif +rw_lock_x_lock_low( +/*===============*/ + /* out: RW_LOCK_NOT_LOCKED if did + not succeed, RW_LOCK_EX if success. */ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + os_thread_id_t curr_thread = os_thread_get_curr_id(); + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS +retry_writer: + /* try to lock writer */ + if(__sync_lock_test_and_set(&(lock->writer),RW_LOCK_EX) + == RW_LOCK_NOT_LOCKED) { + /* success */ + /* obtain RW_LOCK_WAIT_EX right */ + lock->writer_thread = curr_thread; + lock->recursive = pass ? FALSE : TRUE; + lock->writer_is_wait_ex = TRUE; + /* atomic operation may be safer about memory order. 
*/ + __sync_synchronize(); +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX, + file_name, line); +#endif + } + + if (!os_thread_eq(lock->writer_thread, curr_thread)) { + return(RW_LOCK_NOT_LOCKED); + } + + switch(rw_lock_get_writer(lock)) { + case RW_LOCK_WAIT_EX: + /* have right to try x-lock */ +retry_x_lock: + /* try x-lock */ + if(__sync_sub_and_fetch(&(lock->lock_word), + X_LOCK_DECR) == 0) { + /* success */ + lock->recursive = pass ? FALSE : TRUE; + lock->writer_is_wait_ex = FALSE; + __sync_fetch_and_add(&(lock->writer_count),1); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_remove_debug_info(lock, pass, RW_LOCK_WAIT_EX); + rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, + file_name, line); +#endif + + lock->last_x_file_name = file_name; + lock->last_x_line = line; + + /* Locking succeeded, we may return */ + return(RW_LOCK_EX); + } else if(__sync_fetch_and_add(&(lock->lock_word), + X_LOCK_DECR) == 0) { + /* retry x-lock */ + goto retry_x_lock; + } + + /* There are readers, we have to wait */ + return(RW_LOCK_WAIT_EX); + + break; + + case RW_LOCK_EX: + /* already have x-lock */ + if (lock->recursive && (pass == 0)) { + __sync_fetch_and_add(&(lock->writer_count),1); + +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name, + line); +#endif + + lock->last_x_file_name = file_name; + lock->last_x_line = line; + + /* Locking succeeded, we may return */ + return(RW_LOCK_EX); + } + + return(RW_LOCK_NOT_LOCKED); + + break; + + default: /* RW_LOCK_NOT_LOCKED? maybe impossible */ + goto retry_writer; + } + + /* Locking did not succeed */ + return(RW_LOCK_NOT_LOCKED); +#else + if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) { + + /* lock->recursive also tells us if the writer_thread + field is stale or active. As we are going to write + our own thread id in that field it must be that the + current writer_thread value is not active. */ + ut_a(!lock->recursive); + + /* Decrement occurred: we are writer or next-writer. */ + rw_lock_set_writer_id_and_recursion_flag(lock, + pass ? FALSE : TRUE); + + rw_lock_x_lock_wait(lock, +#ifdef UNIV_SYNC_DEBUG + pass, +#endif + file_name, line); + + } else { + /* Decrement failed: relock or failed lock */ + if (!pass && lock->recursive + && os_thread_eq(lock->writer_thread, curr_thread)) { + /* Relock */ + lock->lock_word -= X_LOCK_DECR; + } else { + /* Another thread locked before us */ + return(FALSE); + } + } +#ifdef UNIV_SYNC_DEBUG + rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, + file_name, line); +#endif + lock->last_x_file_name = file_name; + lock->last_x_line = (unsigned int) line; + + return(TRUE); +#endif +} + +/********************************************************************** +NOTE! Use the corresponding macro, not directly this function! Lock an +rw-lock in exclusive mode for the current thread. If the rw-lock is locked +in shared or exclusive mode, or there is an exclusive lock request waiting, +the function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting +for the lock before suspending the thread. If the same thread has an x-lock +on the rw-lock, locking succeed, with the following exception: if pass != 0, +only a single x-lock may be taken on the lock. NOTE: If the same thread has +an s-lock, locking does not succeed! 
*/ +UNIV_INTERN +void +rw_lock_x_lock_func( +/*================*/ + rw_lock_t* lock, /* in: pointer to rw-lock */ + ulint pass, /* in: pass value; != 0, if the lock will + be passed to another thread to unlock */ + const char* file_name,/* in: file name where lock requested */ + ulint line) /* in: line where requested */ +{ + ulint index; /* index of the reserved wait cell */ + ulint i; /* spin round count */ + ibool spinning = FALSE; +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + ulint state = RW_LOCK_NOT_LOCKED; /* lock state acquired */ + ulint prev_state = RW_LOCK_NOT_LOCKED; +#endif + + ut_ad(rw_lock_validate(lock)); + + i = 0; + +lock_loop: +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + prev_state = state; + state = rw_lock_x_lock_low(lock, pass, file_name, line); + +lock_loop_2: + if (state != prev_state) i=0; /* if progress, reset counter. */ + + if (state == RW_LOCK_EX) { +#else + if (rw_lock_x_lock_low(lock, pass, file_name, line)) { +#endif + rw_x_spin_round_count += i; + + return; /* Locking succeeded */ + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + } else if (state == RW_LOCK_WAIT_EX) { + + if (!spinning) { + spinning = TRUE; + rw_x_spin_wait_count++; + } + + /* Spin waiting for the reader count field to become zero */ + while (i < SYNC_SPIN_ROUNDS + && lock->lock_word != X_LOCK_DECR) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, + srv_spin_wait_delay)); + } + + i++; + } + if (i == SYNC_SPIN_ROUNDS) { + os_thread_yield(); + } else { + goto lock_loop; + } +#endif + } else { + + if (!spinning) { + spinning = TRUE; + rw_x_spin_wait_count++; + } + + /* Spin waiting for the lock_word to become free */ + while (i < SYNC_SPIN_ROUNDS +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + && rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) { +#else + && lock->lock_word <= 0) { +#endif + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, + srv_spin_wait_delay)); + } + + i++; + } + if (i == SYNC_SPIN_ROUNDS) { + os_thread_yield(); + } else { + goto lock_loop; + } + } + + rw_x_spin_round_count += i; + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread %lu spin wait rw-x-lock at %p" + " cfile %s cline %lu rnds %lu\n", + os_thread_pf(os_thread_get_curr_id()), (void*) lock, + lock->cfile_name, (ulong) lock->cline, (ulong) i); + } + + sync_array_reserve_cell(sync_primary_wait_array, + lock, +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + (state == RW_LOCK_WAIT_EX) + ? RW_LOCK_WAIT_EX : RW_LOCK_EX, +#else + RW_LOCK_EX, +#endif + file_name, line, + &index); + + /* Waiters must be set before checking lock_word, to ensure signal + is sent. This could lead to a few unnecessary wake-up signals. */ +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + if (state == RW_LOCK_WAIT_EX) { + rw_lock_set_wx_waiter_flag(lock); + } else { + rw_lock_set_x_waiter_flag(lock); + } +#else + rw_lock_set_waiter_flag(lock); +#endif + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + for (i = 0; i < 4; i++) { + prev_state = state; + state = rw_lock_x_lock_low(lock, pass, file_name, line); + if (state == RW_LOCK_EX) { + sync_array_free_cell(sync_primary_wait_array, index); + return; /* Locking succeeded */ + } else if (state != prev_state) { + /* retry! 
*/ + sync_array_free_cell(sync_primary_wait_array, index); + goto lock_loop_2; + } + } +#else + if (rw_lock_x_lock_low(lock, pass, file_name, line)) { + sync_array_free_cell(sync_primary_wait_array, index); + return; /* Locking succeeded */ + } +#endif + + if (srv_print_latch_waits) { + fprintf(stderr, + "Thread %lu OS wait for rw-x-lock at %p" + " cfile %s cline %lu\n", + os_thread_pf(os_thread_get_curr_id()), (void*) lock, + lock->cfile_name, (ulong) lock->cline); + } + + /* these stats may not be accurate */ + lock->count_os_wait++; + rw_x_os_wait_count++; + + sync_array_wait_event(sync_primary_wait_array, index); + + i = 0; + goto lock_loop; +} + +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Acquires the debug mutex. We cannot use the mutex defined in sync0sync, +because the debug mutex is also acquired in sync0arr while holding the OS +mutex protecting the sync array, and the ordinary mutex_enter might +recursively call routines in sync0arr, leading to a deadlock on the OS +mutex. */ +UNIV_INTERN +void +rw_lock_debug_mutex_enter(void) +/*==========================*/ +{ +loop: + if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) { + return; + } + + os_event_reset(rw_lock_debug_event); + + rw_lock_debug_waiters = TRUE; + + if (0 == mutex_enter_nowait(&rw_lock_debug_mutex)) { + return; + } + + os_event_wait(rw_lock_debug_event); + + goto loop; +} + +/********************************************************************** +Releases the debug mutex. */ +UNIV_INTERN +void +rw_lock_debug_mutex_exit(void) +/*==========================*/ +{ + mutex_exit(&rw_lock_debug_mutex); + + if (rw_lock_debug_waiters) { + rw_lock_debug_waiters = FALSE; + os_event_set(rw_lock_debug_event); + } +} + +/********************************************************************** +Inserts the debug information for an rw-lock. */ +UNIV_INTERN +void +rw_lock_add_debug_info( +/*===================*/ + rw_lock_t* lock, /* in: rw-lock */ + ulint pass, /* in: pass value */ + ulint lock_type, /* in: lock type */ + const char* file_name, /* in: file where requested */ + ulint line) /* in: line where requested */ +{ + rw_lock_debug_t* info; + + ut_ad(lock); + ut_ad(file_name); + + info = rw_lock_debug_create(); + + rw_lock_debug_mutex_enter(); + + info->file_name = file_name; + info->line = line; + info->lock_type = lock_type; + info->thread_id = os_thread_get_curr_id(); + info->pass = pass; + + UT_LIST_ADD_FIRST(list, lock->debug_list, info); + + rw_lock_debug_mutex_exit(); + + if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) { + sync_thread_add_level(lock, lock->level); + } +} + +/********************************************************************** +Removes a debug information struct for an rw-lock. */ +UNIV_INTERN +void +rw_lock_remove_debug_info( +/*======================*/ + rw_lock_t* lock, /* in: rw-lock */ + ulint pass, /* in: pass value */ + ulint lock_type) /* in: lock type */ +{ + rw_lock_debug_t* info; + + ut_ad(lock); + + if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) { + sync_thread_reset_level(lock); + } + + rw_lock_debug_mutex_enter(); + + info = UT_LIST_GET_FIRST(lock->debug_list); + + while (info != NULL) { + if ((pass == info->pass) + && ((pass != 0) + || os_thread_eq(info->thread_id, + os_thread_get_curr_id())) + && (info->lock_type == lock_type)) { + + /* Found! 
*/ + UT_LIST_REMOVE(list, lock->debug_list, info); + rw_lock_debug_mutex_exit(); + + rw_lock_debug_free(info); + + return; + } + + info = UT_LIST_GET_NEXT(list, info); + } + + ut_error; +} +#endif /* UNIV_SYNC_DEBUG */ + +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Checks if the thread has locked the rw-lock in the specified mode, with +the pass value == 0. */ +UNIV_INTERN +ibool +rw_lock_own( +/*========*/ + /* out: TRUE if locked */ + rw_lock_t* lock, /* in: rw-lock */ + ulint lock_type) /* in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ +{ + rw_lock_debug_t* info; + + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + + rw_lock_debug_mutex_enter(); + + info = UT_LIST_GET_FIRST(lock->debug_list); + + while (info != NULL) { + + if (os_thread_eq(info->thread_id, os_thread_get_curr_id()) + && (info->pass == 0) + && (info->lock_type == lock_type)) { + + rw_lock_debug_mutex_exit(); + /* Found! */ + + return(TRUE); + } + + info = UT_LIST_GET_NEXT(list, info); + } + rw_lock_debug_mutex_exit(); + + return(FALSE); +} +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************** +Checks if somebody has locked the rw-lock in the specified mode. */ +UNIV_INTERN +ibool +rw_lock_is_locked( +/*==============*/ + /* out: TRUE if locked */ + rw_lock_t* lock, /* in: rw-lock */ + ulint lock_type) /* in: lock type: RW_LOCK_SHARED, + RW_LOCK_EX */ +{ + ibool ret = FALSE; + + ut_ad(lock); + ut_ad(rw_lock_validate(lock)); + + if (lock_type == RW_LOCK_SHARED) { + if (rw_lock_get_reader_count(lock) > 0) { + ret = TRUE; + } + } else if (lock_type == RW_LOCK_EX) { + if (rw_lock_get_writer(lock) == RW_LOCK_EX) { + ret = TRUE; + } + } else { + ut_error; + } + + return(ret); +} + +#ifdef UNIV_SYNC_DEBUG +/******************************************************************* +Prints debug info of currently locked rw-locks. */ +UNIV_INTERN +void +rw_lock_list_print_info( +/*====================*/ + FILE* file) /* in: file where to print */ +{ + rw_lock_t* lock; + ulint count = 0; + rw_lock_debug_t* info; + + mutex_enter(&rw_lock_list_mutex); + + fputs("-------------\n" + "RW-LATCH INFO\n" + "-------------\n", file); + + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { + + count++; + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_enter(&(lock->mutex)); +#endif + if (lock->lock_word != X_LOCK_DECR) { + + fprintf(file, "RW-LOCK: %p ", (void*) lock); + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + if (rw_lock_get_s_waiters(lock)) { + fputs(" s_waiters for the lock exist", file); + } + if (rw_lock_get_x_waiters(lock)) { + fputs(" x_waiters for the lock exist", file); + } + if (rw_lock_get_wx_waiters(lock)) { + fputs(" wait_ex_waiters for the lock exist", file); + } + putc('\n', file); +#else + if (rw_lock_get_waiters(lock)) { + fputs(" Waiters for the lock exist\n", file); + } else { + putc('\n', file); + } +#endif + + info = UT_LIST_GET_FIRST(lock->debug_list); + while (info != NULL) { + rw_lock_debug_print(info); + info = UT_LIST_GET_NEXT(list, info); + } + } +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_exit(&(lock->mutex)); +#endif + + lock = UT_LIST_GET_NEXT(list, lock); + } + + fprintf(file, "Total number of rw-locks %ld\n", count); + mutex_exit(&rw_lock_list_mutex); +} + +/******************************************************************* +Prints debug info of an rw-lock. 
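+
+For illustration, a debugging session might dump a suspect latch with a
+call such as (the latch pointer here is hypothetical):
+
+	rw_lock_print(latch);
+
+The information is written to stderr; like the other routines in this
+section, the function is compiled only under UNIV_SYNC_DEBUG.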
*/ +UNIV_INTERN +void +rw_lock_print( +/*==========*/ + rw_lock_t* lock) /* in: rw-lock */ +{ + rw_lock_debug_t* info; + + fprintf(stderr, + "-------------\n" + "RW-LATCH INFO\n" + "RW-LATCH: %p ", (void*) lock); + +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_enter(&(lock->mutex)); +#endif + if (lock->lock_word != X_LOCK_DECR) { + +#ifdef INNODB_RW_LOCKS_USE_ATOMICS + if (rw_lock_get_s_waiters(lock)) { + fputs(" s_waiters for the lock exist", stderr); + } + if (rw_lock_get_x_waiters(lock)) { + fputs(" x_waiters for the lock exist", stderr); + } + if (rw_lock_get_wx_waiters(lock)) { + fputs(" wait_ex_waiters for the lock exist", stderr); + } + putc('\n', stderr); +#else + if (rw_lock_get_waiters(lock)) { + fputs(" Waiters for the lock exist\n", stderr); + } else { + putc('\n', stderr); + } +#endif + + info = UT_LIST_GET_FIRST(lock->debug_list); + while (info != NULL) { + rw_lock_debug_print(info); + info = UT_LIST_GET_NEXT(list, info); + } + } +#ifndef INNODB_RW_LOCKS_USE_ATOMICS + mutex_exit(&(lock->mutex)); +#endif +} + +/************************************************************************* +Prints info of a debug struct. */ +UNIV_INTERN +void +rw_lock_debug_print( +/*================*/ + rw_lock_debug_t* info) /* in: debug struct */ +{ + ulint rwt; + + rwt = info->lock_type; + + fprintf(stderr, "Locked: thread %ld file %s line %ld ", + (ulong) os_thread_pf(info->thread_id), info->file_name, + (ulong) info->line); + if (rwt == RW_LOCK_SHARED) { + fputs("S-LOCK", stderr); + } else if (rwt == RW_LOCK_EX) { + fputs("X-LOCK", stderr); + } else if (rwt == RW_LOCK_WAIT_EX) { + fputs("WAIT X-LOCK", stderr); + } else { + ut_error; + } + if (info->pass != 0) { + fprintf(stderr, " pass value %lu", (ulong) info->pass); + } + putc('\n', stderr); +} + +/******************************************************************* +Returns the number of currently locked rw-locks. Works only in the debug +version. */ +UNIV_INTERN +ulint +rw_lock_n_locked(void) +/*==================*/ +{ + rw_lock_t* lock; + ulint count = 0; + + mutex_enter(&rw_lock_list_mutex); + + lock = UT_LIST_GET_FIRST(rw_lock_list); + + while (lock != NULL) { + + if (lock->lock_word != X_LOCK_DECR) { + count++; + } + + lock = UT_LIST_GET_NEXT(list, lock); + } + + mutex_exit(&rw_lock_list_mutex); + + return(count); +} +#endif /* UNIV_SYNC_DEBUG */ diff --git a/storage/xtradb/sync/sync0sync.c b/storage/xtradb/sync/sync0sync.c new file mode 100644 index 00000000000..3b2d033aae5 --- /dev/null +++ b/storage/xtradb/sync/sync0sync.c @@ -0,0 +1,1411 @@ +/***************************************************************************** + +Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved. +Copyright (c) 2008, Google Inc. + +Portions of this file contain modifications contributed and copyrighted by +Google, Inc. Those modifications are gratefully acknowledged and are described +briefly in the InnoDB documentation. The contributions by Google are +incorporated with their permission, and subject to the conditions contained in +the file COPYING.Google. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+Place, Suite 330, Boston, MA 02111-1307 USA
+
+*****************************************************************************/
+
+/******************************************************
+Mutex, the basic synchronization primitive
+
+Created 9/5/1995 Heikki Tuuri
+*******************************************************/
+
+#include "sync0sync.h"
+#ifdef UNIV_NONINL
+#include "sync0sync.ic"
+#endif
+
+#include "sync0rw.h"
+#include "buf0buf.h"
+#include "srv0srv.h"
+#include "buf0types.h"
+
+/*
+	REASONS FOR IMPLEMENTING THE SPIN LOCK MUTEX
+	============================================
+
+Semaphore operations in operating systems are slow: Solaris on a 1993 Sparc
+takes 3 microseconds (us) for a lock-unlock pair and Windows NT on a 1995
+Pentium takes 20 microseconds for a lock-unlock pair. Therefore, we have to
+implement our own efficient spin lock mutex. Future operating systems may
+provide efficient spin locks, but we cannot count on that.
+
+Another reason for implementing a spin lock is that on multiprocessor systems
+it can be more efficient for a processor to run a loop waiting for the
+semaphore to be released than to switch to a different thread. A thread switch
+takes 25 us on both platforms mentioned above. See Gray and Reuter's book
+Transaction processing for background.
+
+How long should the spin loop last before suspending the thread? On a
+uniprocessor, spinning does not help at all, because if the thread owning the
+mutex is not executing, it cannot be released. Spinning actually wastes
+resources.
+
+On a multiprocessor, we do not know if the thread owning the mutex is
+executing or not. Thus it would make sense to spin as long as the operation
+guarded by the mutex would typically last, assuming that the thread is
+executing. If the mutex is not released by that time, we may assume that the
+thread owning the mutex is not executing and suspend the waiting thread.
+
+A typical operation (where no i/o is involved) guarded by a mutex or a
+read-write lock may last 1 - 20 us on the current Pentium platform. The
+longest operations are the binary searches on an index node.
+
+We conclude that the best choice is to set the spin time at 20 us. Then the
+system should work well on a multiprocessor. On a uniprocessor we have to
+make sure that thread switches due to mutex collisions are not frequent,
+i.e., that they do not happen every 100 us or so, because that wastes too
+many resources. If the thread switches are not frequent, the 20 us wasted in
+the spin loop is not too much.
+
+Empirical studies on the effect of spin time should be done for different
+platforms.
+
+
+	IMPLEMENTATION OF THE MUTEX
+	===========================
+
+For background, see Curt Schimmel's book on Unix implementation on modern
+architectures. The key points in the implementation are atomicity and
+serialization of memory accesses. The test-and-set instruction (XCHG in
+Pentium) must be atomic. As new processors may have weak memory models,
+serialization of memory references may also be necessary. The successor of
+Pentium, P6, has at least one mode where the memory model is weak. As far as
+we know, in Pentium all memory accesses are serialized in the program order
+and we do not have to worry about the memory model.
+On other processors there are special machine instructions called a fence,
+memory barrier, or storage barrier (STBAR in Sparc), which can be used to
+serialize the memory accesses to happen in program order relative to the
+fence instruction.
+
+Leslie Lamport has devised a "bakery algorithm" to implement a mutex without
+the atomic test-and-set, but his algorithm should be modified for weak memory
+models. We do not use Lamport's algorithm, because we guess it is slower than
+the atomic test-and-set.
+
+Our mutex implementation works as follows: first, we perform the atomic
+test-and-set instruction on the memory word. If the test returns zero, we
+know we got the lock first. If the test returns nonzero, some other thread
+was quicker and got the lock: then we spin in a loop reading the memory word,
+waiting for it to become zero. It is wise to just read the word in the loop,
+not perform numerous test-and-set instructions, because they generate memory
+traffic between the cache and the main memory. The read loop can just access
+the cache, saving bus bandwidth.
+
+If we cannot acquire the mutex lock in the specified time, we reserve a cell
+in the wait array and set the waiters byte in the mutex to 1. To avoid a race
+condition, after setting the waiters byte and before suspending the waiting
+thread, we still have to check that the mutex is reserved, because it may
+have happened that the thread which was holding the mutex has just released
+it and did not see the waiters byte set to 1, a case which would lead the
+other thread to an infinite wait.
+
+LEMMA 1: After a thread resets the event of a mutex (or rw_lock), some
+=======
+thread will eventually call os_event_set() on that particular event.
+Thus no infinite wait is possible in this case.
+
+Proof: After making the reservation the thread sets the waiters field in the
+mutex to 1. Then it checks that the mutex is still reserved by some thread,
+or it reserves the mutex for itself. In any case, some thread (which may also
+be some earlier thread, not necessarily the one currently holding the mutex)
+will set the waiters field to 0 in mutex_exit, and then call
+os_event_set() with the mutex as an argument.
+Q.E.D.
+
+LEMMA 2: If an os_event_set() call is made after some thread has called
+=======
+os_event_reset() and before it starts its wait on that event, the call
+will not be lost to the second thread. This is true even if there is an
+intervening call to os_event_reset() by another thread.
+Thus no infinite wait is possible in this case.
+
+Proof (non-windows platforms): os_event_reset() returns a monotonically
+increasing value of signal_count. This value is increased at every
+call of os_event_set(). If thread A has called os_event_reset() followed
+by thread B calling os_event_set() and then some other thread C calling
+os_event_reset(), the is_set flag of the event will be set to FALSE;
+but now if thread A calls os_event_wait_low() with the signal_count
+value returned from the earlier call of os_event_reset(), it will
+return immediately without waiting.
+Q.E.D.
+
+Proof (windows): If there is a writer thread which is forced to wait for
+the lock, it may be able to set the state of rw_lock to RW_LOCK_WAIT_EX.
+The design of rw_lock ensures that there is one and only one thread
+that is able to change the state to RW_LOCK_WAIT_EX and this thread is
+guaranteed to acquire the lock after it is released by the current
+holders and before any other waiter gets the lock.
+On Windows this thread waits on a separate event, i.e., wait_ex_event.
+Since only one thread can wait on this event, there is no chance of the
+event getting reset before the writer starts its wait on it. Therefore,
+this thread is guaranteed to catch the os_event_set() call made
+unconditionally at the release of the lock.
+Q.E.D. */
+
+/* Number of spin waits on mutexes: for performance monitoring */
+
+/* round=one iteration of a spin loop */
+UNIV_INTERN ib_int64_t	mutex_spin_round_count	= 0;
+UNIV_INTERN ib_int64_t	mutex_spin_wait_count	= 0;
+UNIV_INTERN ib_int64_t	mutex_os_wait_count	= 0;
+UNIV_INTERN ib_int64_t	mutex_exit_count	= 0;
+
+/* The global array of wait cells for implementation of the database's own
+mutexes and read-write locks */
+UNIV_INTERN sync_array_t*	sync_primary_wait_array;
+
+/* This variable is set to TRUE when sync_init is called */
+UNIV_INTERN ibool	sync_initialized	= FALSE;
+
+
+typedef struct sync_level_struct	sync_level_t;
+typedef struct sync_thread_struct	sync_thread_t;
+
+#ifdef UNIV_SYNC_DEBUG
+/* The latch levels currently owned by threads are stored in this data
+structure; the size of this array is OS_THREAD_MAX_N */
+
+UNIV_INTERN sync_thread_t*	sync_thread_level_arrays;
+
+/* Mutex protecting sync_thread_level_arrays */
+UNIV_INTERN mutex_t	sync_thread_mutex;
+#endif /* UNIV_SYNC_DEBUG */
+
+/* Global list of database mutexes (not OS mutexes) created. */
+UNIV_INTERN ut_list_base_node_t  mutex_list;
+
+/* Mutex protecting the mutex_list variable */
+UNIV_INTERN mutex_t mutex_list_mutex;
+
+#ifdef UNIV_SYNC_DEBUG
+/* Latching order checks start when this is set TRUE */
+UNIV_INTERN ibool	sync_order_checks_on	= FALSE;
+#endif /* UNIV_SYNC_DEBUG */
+
+struct sync_thread_struct{
+	os_thread_id_t	id;	/* OS thread id */
+	sync_level_t*	levels;	/* level array for this thread; if this is NULL
+				this slot is unused */
+};
+
+/* Number of slots reserved for each OS thread in the sync level array */
+#define SYNC_THREAD_N_LEVELS	10000
+
+struct sync_level_struct{
+	void*	latch;	/* pointer to a mutex or an rw-lock; NULL means that
+			the slot is empty */
+	ulint	level;	/* level of the latch in the latching order */
+};
+
+/**********************************************************************
+Creates, or rather, initializes a mutex object in a specified memory
+location (which must be appropriately aligned). The mutex is initialized
+in the reset state. Explicit freeing of the mutex with mutex_free is
+necessary only if the memory block containing it is freed.
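+
+For illustration, callers normally go through the mutex_create() macro;
+sync_init() below creates the mutex list mutex this way:
+
+	mutex_create(&mutex_list_mutex, SYNC_NO_ORDER_CHECK);
+
+The second argument is the latching-order level, which is checked only
+under UNIV_SYNC_DEBUG.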
*/ +UNIV_INTERN +void +mutex_create_func( +/*==============*/ + mutex_t* mutex, /* in: pointer to memory */ +#ifdef UNIV_DEBUG + const char* cmutex_name, /* in: mutex name */ +# ifdef UNIV_SYNC_DEBUG + ulint level, /* in: level */ +# endif /* UNIV_SYNC_DEBUG */ +#endif /* UNIV_DEBUG */ + const char* cfile_name, /* in: file name where created */ + ulint cline) /* in: file line where created */ +{ +#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) + mutex_reset_lock_word(mutex); +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) + mutex_reset_lock_word(mutex); +#else + os_fast_mutex_init(&(mutex->os_fast_mutex)); + mutex->lock_word = 0; +#endif + mutex->event = os_event_create(NULL); + mutex_set_waiters(mutex, 0); +#ifdef UNIV_DEBUG + mutex->magic_n = MUTEX_MAGIC_N; +#endif /* UNIV_DEBUG */ +#ifdef UNIV_SYNC_DEBUG + mutex->line = 0; + mutex->file_name = "not yet reserved"; + mutex->level = level; +#endif /* UNIV_SYNC_DEBUG */ + mutex->cfile_name = cfile_name; + mutex->cline = cline; +#ifndef UNIV_HOTBACKUP + mutex->count_os_wait = 0; +# ifdef UNIV_DEBUG + mutex->cmutex_name= cmutex_name; + mutex->count_using= 0; + mutex->mutex_type= 0; + mutex->lspent_time= 0; + mutex->lmax_spent_time= 0; + mutex->count_spin_loop= 0; + mutex->count_spin_rounds= 0; + mutex->count_os_yield= 0; +# endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + + /* Check that lock_word is aligned; this is important on Intel */ + ut_ad(((ulint)(&(mutex->lock_word))) % 4 == 0); + + /* NOTE! The very first mutexes are not put to the mutex list */ + + if ((mutex == &mutex_list_mutex) +#ifdef UNIV_SYNC_DEBUG + || (mutex == &sync_thread_mutex) +#endif /* UNIV_SYNC_DEBUG */ + ) { + + return; + } + + mutex_enter(&mutex_list_mutex); + + ut_ad(UT_LIST_GET_LEN(mutex_list) == 0 + || UT_LIST_GET_FIRST(mutex_list)->magic_n == MUTEX_MAGIC_N); + + UT_LIST_ADD_FIRST(list, mutex_list, mutex); + + mutex_exit(&mutex_list_mutex); +} + +/********************************************************************** +Calling this function is obligatory only if the memory buffer containing +the mutex is freed. Removes a mutex object from the mutex list. The mutex +is checked to be in the reset state. */ +UNIV_INTERN +void +mutex_free( +/*=======*/ + mutex_t* mutex) /* in: mutex */ +{ + ut_ad(mutex_validate(mutex)); + ut_a(mutex_get_lock_word(mutex) == 0); + ut_a(mutex_get_waiters(mutex) == 0); + + if (mutex != &mutex_list_mutex +#ifdef UNIV_SYNC_DEBUG + && mutex != &sync_thread_mutex +#endif /* UNIV_SYNC_DEBUG */ + ) { + + mutex_enter(&mutex_list_mutex); + + ut_ad(!UT_LIST_GET_PREV(list, mutex) + || UT_LIST_GET_PREV(list, mutex)->magic_n + == MUTEX_MAGIC_N); + ut_ad(!UT_LIST_GET_NEXT(list, mutex) + || UT_LIST_GET_NEXT(list, mutex)->magic_n + == MUTEX_MAGIC_N); + + UT_LIST_REMOVE(list, mutex_list, mutex); + + mutex_exit(&mutex_list_mutex); + } + + os_event_free(mutex->event); + +#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER) +#elif defined(HAVE_GCC_ATOMIC_BUILTINS) +#else + os_fast_mutex_free(&(mutex->os_fast_mutex)); +#endif + /* If we free the mutex protecting the mutex list (freeing is + not necessary), we have to reset the magic number AFTER removing + it from the list. */ +#ifdef UNIV_DEBUG + mutex->magic_n = 0; +#endif /* UNIV_DEBUG */ +} + +/************************************************************************ +NOTE! Use the corresponding macro in the header file, not this function +directly. Tries to lock the mutex for the current thread. If the lock is not +acquired immediately, returns with return value 1. 
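+
+For illustration, the usual try-lock pattern looks like this (some_mutex
+is a placeholder name):
+
+	if (0 == mutex_enter_nowait(&some_mutex)) {
+		... got it: do the work, then mutex_exit(&some_mutex);
+	} else {
+		... contended: back off or take a slower path
+	}
+
+rw_lock_debug_mutex_enter() in sync0rw.c above is a real caller, which
+combines this with an event wait to avoid looping forever.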
*/ +UNIV_INTERN +ulint +mutex_enter_nowait_func( +/*====================*/ + /* out: 0 if succeed, 1 if not */ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name __attribute__((unused)), + /* in: file name where mutex + requested */ + ulint line __attribute__((unused))) + /* in: line where requested */ +{ + ut_ad(mutex_validate(mutex)); + + if (!mutex_test_and_set(mutex)) { + + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + + return(0); /* Succeeded! */ + } + + return(1); +} + +#ifdef UNIV_DEBUG +/********************************************************************** +Checks that the mutex has been initialized. */ +UNIV_INTERN +ibool +mutex_validate( +/*===========*/ + const mutex_t* mutex) +{ + ut_a(mutex); + ut_a(mutex->magic_n == MUTEX_MAGIC_N); + + return(TRUE); +} + +/********************************************************************** +Checks that the current thread owns the mutex. Works only in the debug +version. */ +UNIV_INTERN +ibool +mutex_own( +/*======*/ + /* out: TRUE if owns */ + const mutex_t* mutex) /* in: mutex */ +{ + ut_ad(mutex_validate(mutex)); + + return(mutex_get_lock_word(mutex) == 1 + && os_thread_eq(mutex->thread_id, os_thread_get_curr_id())); +} +#endif /* UNIV_DEBUG */ + +/********************************************************************** +Sets the waiters field in a mutex. */ +UNIV_INTERN +void +mutex_set_waiters( +/*==============*/ + mutex_t* mutex, /* in: mutex */ + ulint n) /* in: value to set */ +{ + volatile ulint* ptr; /* declared volatile to ensure that + the value is stored to memory */ + ut_ad(mutex); + + ptr = &(mutex->waiters); + + *ptr = n; /* Here we assume that the write of a single + word in memory is atomic */ +} + +/********************************************************************** +Reserves a mutex for the current thread. If the mutex is reserved, the +function spins a preset time (controlled by SYNC_SPIN_ROUNDS), waiting +for the mutex before suspending the thread. */ +UNIV_INTERN +void +mutex_spin_wait( +/*============*/ + mutex_t* mutex, /* in: pointer to mutex */ + const char* file_name, /* in: file name where mutex + requested */ + ulint line) /* in: line where requested */ +{ + ulint index; /* index of the reserved wait cell */ + ulint i; /* spin round count */ +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + ib_int64_t lstart_time = 0, lfinish_time; /* for timing os_wait */ + ulint ltime_diff; + ulint sec; + ulint ms; + uint timer_started = 0; +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + ut_ad(mutex); + + /* This update is not thread safe, but we don't mind if the count + isn't exact. Moved out of ifdef that follows because we are willing + to sacrifice the cost of counting this as the data is valuable. + Count the number of calls to mutex_spin_wait. */ + mutex_spin_wait_count++; + +mutex_loop: + + i = 0; + + /* Spin waiting for the lock word to become zero. Note that we do + not have to assume that the read access to the lock word is atomic, + as the actual locking is always committed with atomic test-and-set. + In reality, however, all processors probably have an atomic read of + a memory word. 
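+
+In outline, the protocol implemented below is:
+
+	1. spin up to SYNC_SPIN_ROUNDS rounds reading the lock word;
+	2. try mutex_test_and_set(); if it returns 0 the mutex is ours;
+	3. reserve a cell in sync_primary_wait_array and set the waiters
+	   field to 1 (in this order; see the comments below);
+	4. retry the test-and-set a few times, which together with LEMMA 1
+	   rules out an infinite wait;
+	5. sync_array_wait_event(), then start again from step 1.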
*/ + +spin_loop: +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + mutex->count_spin_loop++; +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + + while (mutex_get_lock_word(mutex) != 0 && i < SYNC_SPIN_ROUNDS) { + if (srv_spin_wait_delay) { + ut_delay(ut_rnd_interval(0, srv_spin_wait_delay)); + } + + i++; + } + + if (i == SYNC_SPIN_ROUNDS) { +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + mutex->count_os_yield++; + if (timed_mutexes == 1 && timer_started==0) { + ut_usectime(&sec, &ms); + lstart_time= (ib_int64_t)sec * 1000000 + ms; + timer_started = 1; + } +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + os_thread_yield(); + } + +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, + "Thread %lu spin wait mutex at %p" + " cfile %s cline %lu rnds %lu\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex, + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); +#endif + + mutex_spin_round_count += i; + +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + mutex->count_spin_rounds += i; +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + + if (mutex_test_and_set(mutex) == 0) { + /* Succeeded! */ + + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + + goto finish_timing; + } + + /* We may end up with a situation where lock_word is 0 but the OS + fast mutex is still reserved. On FreeBSD the OS does not seem to + schedule a thread which is constantly calling pthread_mutex_trylock + (in mutex_test_and_set implementation). Then we could end up + spinning here indefinitely. The following 'i++' stops this infinite + spin. */ + + i++; + + if (i < SYNC_SPIN_ROUNDS) { + goto spin_loop; + } + + sync_array_reserve_cell(sync_primary_wait_array, mutex, + SYNC_MUTEX, file_name, line, &index); + + /* The memory order of the array reservation and the change in the + waiters field is important: when we suspend a thread, we first + reserve the cell and then set waiters field to 1. When threads are + released in mutex_exit, the waiters field is first set to zero and + then the event is set to the signaled state. */ + + mutex_set_waiters(mutex, 1); + + /* Try to reserve still a few times */ + for (i = 0; i < 4; i++) { + if (mutex_test_and_set(mutex) == 0) { + /* Succeeded! Free the reserved wait cell */ + + sync_array_free_cell(sync_primary_wait_array, index); + + ut_d(mutex->thread_id = os_thread_get_curr_id()); +#ifdef UNIV_SYNC_DEBUG + mutex_set_debug_info(mutex, file_name, line); +#endif + +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, "Thread %lu spin wait succeeds at 2:" + " mutex at %p\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), + (void*) mutex); +#endif + + goto finish_timing; + + /* Note that in this case we leave the waiters field + set to 1. We cannot reset it to zero, as we do not + know if there are other waiters. */ + } + } + + /* Now we know that there has been some thread holding the mutex + after the change in the wait array and the waiters field was made. + Now there is no risk of infinite wait on the event. */ + +#ifdef UNIV_SRV_PRINT_LATCH_WAITS + fprintf(stderr, + "Thread %lu OS wait mutex at %p cfile %s cline %lu rnds %lu\n", + (ulong) os_thread_pf(os_thread_get_curr_id()), (void*) mutex, + mutex->cfile_name, (ulong) mutex->cline, (ulong) i); +#endif + + mutex_os_wait_count++; + +#ifndef UNIV_HOTBACKUP + mutex->count_os_wait++; +# ifdef UNIV_DEBUG + /* !!!!! 
Sometimes os_wait can be called without os_thread_yield */ + + if (timed_mutexes == 1 && timer_started==0) { + ut_usectime(&sec, &ms); + lstart_time= (ib_int64_t)sec * 1000000 + ms; + timer_started = 1; + } +# endif /* UNIV_DEBUG */ +#endif /* !UNIV_HOTBACKUP */ + + sync_array_wait_event(sync_primary_wait_array, index); + goto mutex_loop; + +finish_timing: +#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP + if (timed_mutexes == 1 && timer_started==1) { + ut_usectime(&sec, &ms); + lfinish_time= (ib_int64_t)sec * 1000000 + ms; + + ltime_diff= (ulint) (lfinish_time - lstart_time); + mutex->lspent_time += ltime_diff; + + if (mutex->lmax_spent_time < ltime_diff) { + mutex->lmax_spent_time= ltime_diff; + } + } +#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */ + return; +} + +/********************************************************************** +Releases the threads waiting in the primary wait array for this mutex. */ +UNIV_INTERN +void +mutex_signal_object( +/*================*/ + mutex_t* mutex) /* in: mutex */ +{ + mutex_set_waiters(mutex, 0); + + /* The memory order of resetting the waiters field and + signaling the object is important. See LEMMA 1 above. */ + os_event_set(mutex->event); + sync_array_object_signalled(sync_primary_wait_array); +} + +#ifdef UNIV_SYNC_DEBUG +/********************************************************************** +Sets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_set_debug_info( +/*=================*/ + mutex_t* mutex, /* in: mutex */ + const char* file_name, /* in: file where requested */ + ulint line) /* in: line where requested */ +{ + ut_ad(mutex); + ut_ad(file_name); + + sync_thread_add_level(mutex, mutex->level); + + mutex->file_name = file_name; + mutex->line = line; +} + +/********************************************************************** +Gets the debug information for a reserved mutex. */ +UNIV_INTERN +void +mutex_get_debug_info( +/*=================*/ + mutex_t* mutex, /* in: mutex */ + const char** file_name, /* out: file where requested */ + ulint* line, /* out: line where requested */ + os_thread_id_t* thread_id) /* out: id of the thread which owns + the mutex */ +{ + ut_ad(mutex); + + *file_name = mutex->file_name; + *line = mutex->line; + *thread_id = mutex->thread_id; +} + +/********************************************************************** +Prints debug info of currently reserved mutexes. */ +static +void +mutex_list_print_info( +/*==================*/ + FILE* file) /* in: file where to print */ +{ + mutex_t* mutex; + const char* file_name; + ulint line; + os_thread_id_t thread_id; + ulint count = 0; + + fputs("----------\n" + "MUTEX INFO\n" + "----------\n", file); + + mutex_enter(&mutex_list_mutex); + + mutex = UT_LIST_GET_FIRST(mutex_list); + + while (mutex != NULL) { + count++; + + if (mutex_get_lock_word(mutex) != 0) { + mutex_get_debug_info(mutex, &file_name, &line, + &thread_id); + fprintf(file, + "Locked mutex: addr %p thread %ld" + " file %s line %ld\n", + (void*) mutex, os_thread_pf(thread_id), + file_name, line); + } + + mutex = UT_LIST_GET_NEXT(list, mutex); + } + + fprintf(file, "Total number of mutexes %ld\n", count); + + mutex_exit(&mutex_list_mutex); +} + +/********************************************************************** +Counts currently reserved mutexes. Works only in the debug version. 
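+The count excludes mutex_list_mutex itself, which this function holds
+while scanning the list. For illustration, a shutdown-time invariant
+might be asserted as
+
+	ut_a(sync_all_freed());
+
+which combines this count with rw_lock_n_locked() (see below).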
*/ +UNIV_INTERN +ulint +mutex_n_reserved(void) +/*==================*/ +{ + mutex_t* mutex; + ulint count = 0; + + mutex_enter(&mutex_list_mutex); + + mutex = UT_LIST_GET_FIRST(mutex_list); + + while (mutex != NULL) { + if (mutex_get_lock_word(mutex) != 0) { + + count++; + } + + mutex = UT_LIST_GET_NEXT(list, mutex); + } + + mutex_exit(&mutex_list_mutex); + + ut_a(count >= 1); + + return(count - 1); /* Subtract one, because this function itself + was holding one mutex (mutex_list_mutex) */ +} + +/********************************************************************** +Returns TRUE if no mutex or rw-lock is currently locked. Works only in +the debug version. */ +UNIV_INTERN +ibool +sync_all_freed(void) +/*================*/ +{ + return(mutex_n_reserved() + rw_lock_n_locked() == 0); +} + +/********************************************************************** +Gets the value in the nth slot in the thread level arrays. */ +static +sync_thread_t* +sync_thread_level_arrays_get_nth( +/*=============================*/ + /* out: pointer to thread slot */ + ulint n) /* in: slot number */ +{ + ut_ad(n < OS_THREAD_MAX_N); + + return(sync_thread_level_arrays + n); +} + +/********************************************************************** +Looks for the thread slot for the calling thread. */ +static +sync_thread_t* +sync_thread_level_arrays_find_slot(void) +/*====================================*/ + /* out: pointer to thread slot, NULL if not found */ + +{ + sync_thread_t* slot; + os_thread_id_t id; + ulint i; + + id = os_thread_get_curr_id(); + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + + slot = sync_thread_level_arrays_get_nth(i); + + if (slot->levels && os_thread_eq(slot->id, id)) { + + return(slot); + } + } + + return(NULL); +} + +/********************************************************************** +Looks for an unused thread slot. */ +static +sync_thread_t* +sync_thread_level_arrays_find_free(void) +/*====================================*/ + /* out: pointer to thread slot */ + +{ + sync_thread_t* slot; + ulint i; + + for (i = 0; i < OS_THREAD_MAX_N; i++) { + + slot = sync_thread_level_arrays_get_nth(i); + + if (slot->levels == NULL) { + + return(slot); + } + } + + return(NULL); +} + +/********************************************************************** +Gets the value in the nth slot in the thread level array. */ +static +sync_level_t* +sync_thread_levels_get_nth( +/*=======================*/ + /* out: pointer to level slot */ + sync_level_t* arr, /* in: pointer to level array for an OS + thread */ + ulint n) /* in: slot number */ +{ + ut_ad(n < SYNC_THREAD_N_LEVELS); + + return(arr + n); +} + +/********************************************************************** +Checks if all the level values stored in the level array are greater than +the given limit. 
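+
+For illustration, with made-up level values: if the thread currently
+holds a single latch at level 5, then
+
+	sync_thread_levels_g(arr, 3) == TRUE	(5 > 3, so a level-3 latch
+						may still be acquired)
+	sync_thread_levels_g(arr, 5) == FALSE	(another level-5 latch
+						would violate the order)
+
+Acquiring latches in strictly descending level order is what rules out
+latching cycles, and hence deadlocks.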
*/ +static +ibool +sync_thread_levels_g( +/*=================*/ + /* out: TRUE if all greater */ + sync_level_t* arr, /* in: pointer to level array for an OS + thread */ + ulint limit) /* in: level limit */ +{ + sync_level_t* slot; + rw_lock_t* lock; + mutex_t* mutex; + ulint i; + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(arr, i); + + if (slot->latch != NULL) { + if (slot->level <= limit) { + + lock = slot->latch; + mutex = slot->latch; + + fprintf(stderr, + "InnoDB: sync levels should be" + " > %lu but a level is %lu\n", + (ulong) limit, (ulong) slot->level); + + if (mutex->magic_n == MUTEX_MAGIC_N) { + fprintf(stderr, + "Mutex created at %s %lu\n", + mutex->cfile_name, + (ulong) mutex->cline); + + if (mutex_get_lock_word(mutex) != 0) { + const char* file_name; + ulint line; + os_thread_id_t thread_id; + + mutex_get_debug_info( + mutex, &file_name, + &line, &thread_id); + + fprintf(stderr, + "InnoDB: Locked mutex:" + " addr %p thread %ld" + " file %s line %ld\n", + (void*) mutex, + os_thread_pf( + thread_id), + file_name, + (ulong) line); + } else { + fputs("Not locked\n", stderr); + } + } else { + rw_lock_print(lock); + } + + return(FALSE); + } + } + } + + return(TRUE); +} + +/********************************************************************** +Checks if the level value is stored in the level array. */ +static +ibool +sync_thread_levels_contain( +/*=======================*/ + /* out: TRUE if stored */ + sync_level_t* arr, /* in: pointer to level array for an OS + thread */ + ulint level) /* in: level */ +{ + sync_level_t* slot; + ulint i; + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(arr, i); + + if (slot->latch != NULL) { + if (slot->level == level) { + + return(TRUE); + } + } + } + + return(FALSE); +} + +/********************************************************************** +Checks that the level array for the current thread is empty. */ +UNIV_INTERN +ibool +sync_thread_levels_empty_gen( +/*=========================*/ + /* out: TRUE if empty except the + exceptions specified below */ + ibool dict_mutex_allowed) /* in: TRUE if dictionary mutex is + allowed to be owned by the thread, + also purge_is_running mutex is + allowed */ +{ + sync_level_t* arr; + sync_thread_t* thread_slot; + sync_level_t* slot; + ulint i; + + if (!sync_order_checks_on) { + + return(TRUE); + } + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + + mutex_exit(&sync_thread_mutex); + + return(TRUE); + } + + arr = thread_slot->levels; + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(arr, i); + + if (slot->latch != NULL + && (!dict_mutex_allowed + || (slot->level != SYNC_DICT + && slot->level != SYNC_DICT_OPERATION))) { + + mutex_exit(&sync_thread_mutex); + ut_error; + + return(FALSE); + } + } + + mutex_exit(&sync_thread_mutex); + + return(TRUE); +} + +/********************************************************************** +Checks that the level array for the current thread is empty. */ +UNIV_INTERN +ibool +sync_thread_levels_empty(void) +/*==========================*/ + /* out: TRUE if empty */ +{ + return(sync_thread_levels_empty_gen(FALSE)); +} + +/********************************************************************** +Adds a latch and its level in the thread level array. Allocates the memory +for the array if called first time for this OS thread. 
Makes the checks +against other latch levels stored in the array for this thread. */ +UNIV_INTERN +void +sync_thread_add_level( +/*==================*/ + void* latch, /* in: pointer to a mutex or an rw-lock */ + ulint level) /* in: level in the latching order; if + SYNC_LEVEL_VARYING, nothing is done */ +{ + sync_level_t* array; + sync_level_t* slot; + sync_thread_t* thread_slot; + ulint i; + + if (!sync_order_checks_on) { + + return; + } + + if ((latch == (void*)&sync_thread_mutex) + || (latch == (void*)&mutex_list_mutex) + || (latch == (void*)&rw_lock_debug_mutex) + || (latch == (void*)&rw_lock_list_mutex)) { + + return; + } + + if (level == SYNC_LEVEL_VARYING) { + + return; + } + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + /* We have to allocate the level array for a new thread */ + array = ut_malloc(sizeof(sync_level_t) * SYNC_THREAD_N_LEVELS); + + thread_slot = sync_thread_level_arrays_find_free(); + + thread_slot->id = os_thread_get_curr_id(); + thread_slot->levels = array; + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(array, i); + + slot->latch = NULL; + } + } + + array = thread_slot->levels; + + /* NOTE that there is a problem with _NODE and _LEAF levels: if the + B-tree height changes, then a leaf can change to an internal node + or the other way around. We do not know at present if this can cause + unnecessary assertion failures below. */ + + switch (level) { + case SYNC_NO_ORDER_CHECK: + case SYNC_EXTERN_STORAGE: + case SYNC_TREE_NODE_FROM_HASH: + /* Do no order checking */ + break; + case SYNC_MEM_POOL: + case SYNC_MEM_HASH: + case SYNC_RECV: + case SYNC_WORK_QUEUE: + case SYNC_LOG: + case SYNC_THR_LOCAL: + case SYNC_ANY_LATCH: + case SYNC_TRX_SYS_HEADER: + case SYNC_FILE_FORMAT_TAG: + case SYNC_DOUBLEWRITE: + case SYNC_BUF_POOL: + case SYNC_SEARCH_SYS: + case SYNC_SEARCH_SYS_CONF: + case SYNC_TRX_LOCK_HEAP: + case SYNC_KERNEL: + case SYNC_IBUF_BITMAP_MUTEX: + case SYNC_RSEG: + case SYNC_TRX_UNDO: + case SYNC_PURGE_LATCH: + case SYNC_PURGE_SYS: + case SYNC_DICT_AUTOINC_MUTEX: + case SYNC_DICT_OPERATION: + case SYNC_DICT_HEADER: + case SYNC_TRX_I_S_RWLOCK: + case SYNC_TRX_I_S_LAST_READ: + if (!sync_thread_levels_g(array, level)) { + fprintf(stderr, + "InnoDB: sync_thread_levels_g(array, %lu)" + " does not hold!\n", level); + ut_error; + } + break; + case SYNC_BUF_BLOCK: + /* Either the thread must own the buffer pool mutex + (buf_pool_mutex), or it is allowed to latch only ONE + buffer block (block->mutex or buf_pool_zip_mutex). */ + ut_a((sync_thread_levels_contain(array, SYNC_BUF_POOL) + && sync_thread_levels_g(array, SYNC_BUF_BLOCK - 1)) + || sync_thread_levels_g(array, SYNC_BUF_BLOCK)); + break; + case SYNC_REC_LOCK: + ut_a((sync_thread_levels_contain(array, SYNC_KERNEL) + && sync_thread_levels_g(array, SYNC_REC_LOCK - 1)) + || sync_thread_levels_g(array, SYNC_REC_LOCK)); + break; + case SYNC_IBUF_BITMAP: + /* Either the thread must own the master mutex to all + the bitmap pages, or it is allowed to latch only ONE + bitmap page. 
*/ + ut_a((sync_thread_levels_contain(array, SYNC_IBUF_BITMAP_MUTEX) + && sync_thread_levels_g(array, SYNC_IBUF_BITMAP - 1)) + || sync_thread_levels_g(array, SYNC_IBUF_BITMAP)); + break; + case SYNC_FSP_PAGE: + ut_a(sync_thread_levels_contain(array, SYNC_FSP)); + break; + case SYNC_FSP: + ut_a(sync_thread_levels_contain(array, SYNC_FSP) + || sync_thread_levels_g(array, SYNC_FSP)); + break; + case SYNC_TRX_UNDO_PAGE: + ut_a(sync_thread_levels_contain(array, SYNC_TRX_UNDO) + || sync_thread_levels_contain(array, SYNC_RSEG) + || sync_thread_levels_contain(array, SYNC_PURGE_SYS) + || sync_thread_levels_g(array, SYNC_TRX_UNDO_PAGE)); + break; + case SYNC_RSEG_HEADER: + ut_a(sync_thread_levels_contain(array, SYNC_RSEG)); + break; + case SYNC_RSEG_HEADER_NEW: + ut_a(sync_thread_levels_contain(array, SYNC_KERNEL) + && sync_thread_levels_contain(array, SYNC_FSP_PAGE)); + break; + case SYNC_TREE_NODE: + ut_a(sync_thread_levels_contain(array, SYNC_INDEX_TREE) + || sync_thread_levels_contain(array, SYNC_DICT_OPERATION) + || sync_thread_levels_g(array, SYNC_TREE_NODE - 1)); + break; + case SYNC_TREE_NODE_NEW: + ut_a(sync_thread_levels_contain(array, SYNC_FSP_PAGE) + || sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)); + break; + case SYNC_INDEX_TREE: + ut_a((sync_thread_levels_contain(array, SYNC_IBUF_MUTEX) + && sync_thread_levels_contain(array, SYNC_FSP) + && sync_thread_levels_g(array, SYNC_FSP_PAGE - 1)) + || sync_thread_levels_g(array, SYNC_TREE_NODE - 1)); + break; + case SYNC_IBUF_MUTEX: + ut_a(sync_thread_levels_g(array, SYNC_FSP_PAGE - 1)); + break; + case SYNC_IBUF_PESS_INSERT_MUTEX: + ut_a(sync_thread_levels_g(array, SYNC_FSP - 1) + && !sync_thread_levels_contain(array, SYNC_IBUF_MUTEX)); + break; + case SYNC_IBUF_HEADER: + ut_a(sync_thread_levels_g(array, SYNC_FSP - 1) + && !sync_thread_levels_contain(array, SYNC_IBUF_MUTEX) + && !sync_thread_levels_contain( + array, SYNC_IBUF_PESS_INSERT_MUTEX)); + break; + case SYNC_DICT: +#ifdef UNIV_DEBUG + ut_a(buf_debug_prints + || sync_thread_levels_g(array, SYNC_DICT)); +#else /* UNIV_DEBUG */ + ut_a(sync_thread_levels_g(array, SYNC_DICT)); +#endif /* UNIV_DEBUG */ + break; + default: + ut_error; + } + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(array, i); + + if (slot->latch == NULL) { + slot->latch = latch; + slot->level = level; + + break; + } + } + + ut_a(i < SYNC_THREAD_N_LEVELS); + + mutex_exit(&sync_thread_mutex); +} + +/********************************************************************** +Removes a latch from the thread level array if it is found there. 
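+
+Every successful sync_thread_add_level() call must eventually be
+balanced by a call to this function; see rw_lock_remove_debug_info()
+in sync0rw.c above, which does
+
+	if ((pass == 0) && (lock_type != RW_LOCK_WAIT_EX)) {
+		sync_thread_reset_level(lock);
+	}
+
+before removing the debug record.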
*/ +UNIV_INTERN +ibool +sync_thread_reset_level( +/*====================*/ + /* out: TRUE if found from the array; it is an error + if the latch is not found */ + void* latch) /* in: pointer to a mutex or an rw-lock */ +{ + sync_level_t* array; + sync_level_t* slot; + sync_thread_t* thread_slot; + ulint i; + + if (!sync_order_checks_on) { + + return(FALSE); + } + + if ((latch == (void*)&sync_thread_mutex) + || (latch == (void*)&mutex_list_mutex) + || (latch == (void*)&rw_lock_debug_mutex) + || (latch == (void*)&rw_lock_list_mutex)) { + + return(FALSE); + } + + mutex_enter(&sync_thread_mutex); + + thread_slot = sync_thread_level_arrays_find_slot(); + + if (thread_slot == NULL) { + + ut_error; + + mutex_exit(&sync_thread_mutex); + return(FALSE); + } + + array = thread_slot->levels; + + for (i = 0; i < SYNC_THREAD_N_LEVELS; i++) { + + slot = sync_thread_levels_get_nth(array, i); + + if (slot->latch == latch) { + slot->latch = NULL; + + mutex_exit(&sync_thread_mutex); + + return(TRUE); + } + } + + if (((mutex_t*) latch)->magic_n != MUTEX_MAGIC_N) { + rw_lock_t* rw_lock; + + rw_lock = (rw_lock_t*) latch; + + if (rw_lock->level == SYNC_LEVEL_VARYING) { + mutex_exit(&sync_thread_mutex); + + return(TRUE); + } + } + + ut_error; + + mutex_exit(&sync_thread_mutex); + + return(FALSE); +} +#endif /* UNIV_SYNC_DEBUG */ + +/********************************************************************** +Initializes the synchronization data structures. */ +UNIV_INTERN +void +sync_init(void) +/*===========*/ +{ +#ifdef UNIV_SYNC_DEBUG + sync_thread_t* thread_slot; + ulint i; +#endif /* UNIV_SYNC_DEBUG */ + + ut_a(sync_initialized == FALSE); + + sync_initialized = TRUE; + + /* Create the primary system wait array which is protected by an OS + mutex */ + + sync_primary_wait_array = sync_array_create(OS_THREAD_MAX_N, + SYNC_ARRAY_OS_MUTEX); +#ifdef UNIV_SYNC_DEBUG + /* Create the thread latch level array where the latch levels + are stored for each OS thread */ + + sync_thread_level_arrays = ut_malloc(OS_THREAD_MAX_N + * sizeof(sync_thread_t)); + for (i = 0; i < OS_THREAD_MAX_N; i++) { + + thread_slot = sync_thread_level_arrays_get_nth(i); + thread_slot->levels = NULL; + } +#endif /* UNIV_SYNC_DEBUG */ + /* Init the mutex list and create the mutex to protect it. */ + + UT_LIST_INIT(mutex_list); + mutex_create(&mutex_list_mutex, SYNC_NO_ORDER_CHECK); +#ifdef UNIV_SYNC_DEBUG + mutex_create(&sync_thread_mutex, SYNC_NO_ORDER_CHECK); +#endif /* UNIV_SYNC_DEBUG */ + + /* Init the rw-lock list and create the mutex to protect it. */ + + UT_LIST_INIT(rw_lock_list); + mutex_create(&rw_lock_list_mutex, SYNC_NO_ORDER_CHECK); + +#ifdef UNIV_SYNC_DEBUG + mutex_create(&rw_lock_debug_mutex, SYNC_NO_ORDER_CHECK); + + rw_lock_debug_event = os_event_create(NULL); + rw_lock_debug_waiters = FALSE; +#endif /* UNIV_SYNC_DEBUG */ +} + +/********************************************************************** +Frees the resources in InnoDB's own synchronization data structures. Use +os_sync_free() after calling this. */ +UNIV_INTERN +void +sync_close(void) +/*===========*/ +{ + mutex_t* mutex; + + sync_array_free(sync_primary_wait_array); + + mutex = UT_LIST_GET_FIRST(mutex_list); + + while (mutex) { + mutex_free(mutex); + mutex = UT_LIST_GET_FIRST(mutex_list); + } + + mutex_free(&mutex_list_mutex); +#ifdef UNIV_SYNC_DEBUG + mutex_free(&sync_thread_mutex); +#endif /* UNIV_SYNC_DEBUG */ +} + +/*********************************************************************** +Prints wait info of the sync system. 
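+The output has the following shape (the numbers are illustrative only):
+
+	Mutex spin waits 8527, rounds 125000, OS waits 312
+	RW-shared spins 604, OS waits 103; RW-excl spins 811, OS waits 205
+	Spin rounds per wait: 14.66 mutex, 3.21 RW-shared, 5.80 RW-excl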
*/
+UNIV_INTERN
+void
+sync_print_wait_info(
+/*=================*/
+	FILE*	file)	/* in: file where to print */
+{
+#ifdef UNIV_SYNC_DEBUG
+	fprintf(file, "Mutex exits %llu, rws exits %llu, rwx exits %llu\n",
+		mutex_exit_count, rw_s_exit_count, rw_x_exit_count);
+#endif
+
+	fprintf(file,
+		"Mutex spin waits %llu, rounds %llu, OS waits %llu\n"
+		"RW-shared spins %llu, OS waits %llu;"
+		" RW-excl spins %llu, OS waits %llu\n",
+		mutex_spin_wait_count,
+		mutex_spin_round_count,
+		mutex_os_wait_count,
+		rw_s_spin_wait_count,
+		rw_s_os_wait_count,
+		rw_x_spin_wait_count,
+		rw_x_os_wait_count);
+
+	fprintf(file,
+		"Spin rounds per wait: %.2f mutex, %.2f RW-shared, "
+		"%.2f RW-excl\n",
+		(double) mutex_spin_round_count /
+		(mutex_spin_wait_count ? mutex_spin_wait_count : 1),
+		(double) rw_s_spin_round_count /
+		(rw_s_spin_wait_count ? rw_s_spin_wait_count : 1),
+		(double) rw_x_spin_round_count /
+		(rw_x_spin_wait_count ? rw_x_spin_wait_count : 1));
+}
+
+/***********************************************************************
+Prints info of the sync system. */
+UNIV_INTERN
+void
+sync_print(
+/*=======*/
+	FILE*	file)	/* in: file where to print */
+{
+#ifdef UNIV_SYNC_DEBUG
+	mutex_list_print_info(file);
+
+	rw_lock_list_print_info(file);
+#endif /* UNIV_SYNC_DEBUG */
+
+	sync_array_print_info(file, sync_primary_wait_array);
+
+	sync_print_wait_info(file);
+}