summaryrefslogtreecommitdiff
path: root/src/mutex/mut_failchk.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/mutex/mut_failchk.c')
-rw-r--r--src/mutex/mut_failchk.c203
1 files changed, 164 insertions, 39 deletions
diff --git a/src/mutex/mut_failchk.c b/src/mutex/mut_failchk.c
index 1425389f..28e5d992 100644
--- a/src/mutex/mut_failchk.c
+++ b/src/mutex/mut_failchk.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2005, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -9,68 +9,193 @@
#include "db_config.h"
#include "db_int.h"
+#include "dbinc/lock.h"
+
+static int __mutex_failchk_single __P((ENV *, db_mutex_t, DB_THREAD_INFO *));
/*
- * __mut_failchk --
- * Check for mutexes held by dead processes.
+ * __mutex_failchk --
+ * Clean up after dead processes which left behind allocated per-process or
+ * locked mutexes.
*
- * PUBLIC: int __mut_failchk __P((ENV *));
+ * PUBLIC: int __mutex_failchk __P((ENV *));
*/
int
-__mut_failchk(env)
+__mutex_failchk(env)
ENV *env;
{
- DB_ENV *dbenv;
- DB_MUTEX *mutexp;
+ DB_HASHTAB *htab;
DB_MUTEXMGR *mtxmgr;
DB_MUTEXREGION *mtxregion;
- db_mutex_t i;
- int ret;
- char buf[DB_THREADID_STRLEN];
- db_threadid_t unused;
+ DB_THREAD_INFO *ip;
+ db_mutex_t mutex;
+ unsigned i;
+ int count;
- if (F_ISSET(env, ENV_PRIVATE))
+ if (F_ISSET(env, ENV_PRIVATE) || (htab = env->thr_hashtab) == NULL)
return (0);
- DB_THREADID_INIT(unused);
-
- dbenv = env->dbenv;
mtxmgr = env->mutex_handle;
mtxregion = mtxmgr->reginfo.primary;
- ret = 0;
+ count = 0;
+ DB_ASSERT(env, F_ISSET(env->dbenv, DB_ENV_FAILCHK));
MUTEX_SYSTEM_LOCK(env);
- for (i = 1; i <= mtxregion->stat.st_mutex_cnt; ++i, ++mutexp) {
- mutexp = MUTEXP_SET(env, i);
- /*
- * We're looking for per-process mutexes where the process
- * has died.
- */
- if (!F_ISSET(mutexp, DB_MUTEX_ALLOCATED) ||
- !F_ISSET(mutexp, DB_MUTEX_PROCESS_ONLY))
+ /*
+ * The first loop does each thread's read-locked latches; the second
+ * does all locked mutexes.
+ */
+ for (i = 0; i < env->thr_nbucket; i++)
+ SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) {
+ if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE)
+ continue;
+ count += __mutex_failchk_thread(env, ip);
+ }
+
+ for (mutex = 1; mutex <= mtxregion->stat.st_mutex_cnt; mutex++)
+ if (__mutex_failchk_single(env, mutex, NULL) != 0)
+ count++;
+
+ MUTEX_SYSTEM_UNLOCK(env);
+
+ if (count == 0)
+ return (count);
+ else
+ return (USR_ERR(env, DB_RUNRECOVERY));
+}
+
+/*
+ * __mutex_failchk_thread -
+ * Do the per-latch failchk work on each of this thread's shared latches.
+ *
+ * PUBLIC: int __mutex_failchk_thread __P((ENV *, DB_THREAD_INFO *));
+ */
+int
+__mutex_failchk_thread(env, ip)
+ ENV *env;
+ DB_THREAD_INFO *ip;
+{
+ db_mutex_t mutex;
+ int count, i;
+
+ count = 0;
+ for (i = 0; i != MUTEX_STATE_MAX; i++) {
+ if (ip->dbth_latches[i].action == MUTEX_ACTION_UNLOCKED ||
+ (mutex = ip->dbth_latches[i].mutex) == MUTEX_INVALID)
continue;
+ if (__mutex_failchk_single(env, mutex, ip) != 0)
+ count++;
+ }
+ return (count);
+}
+/*
+ * __mutex_failchk_single --
+ * Determine whether this mutex is locked or shared by a potentially
+ * dead thread. If so, and the call to is_alive() finds that it is dead,
+ * clean up if possible (a process-only mutex); else wake up any waiters.
+ */
+static int
+__mutex_failchk_single(env, mutex, ip)
+ ENV *env;
+ db_mutex_t mutex;
+ DB_THREAD_INFO *ip;
+{
+ DB_ENV *dbenv;
+ DB_MUTEX *mutexp;
+ db_threadid_t threadid;
+ pid_t pid;
+ int already_dead, ret;
+ u_int32_t flags;
+ char id_str[DB_THREADID_STRLEN];
+ char mtx_desc[DB_MUTEX_DESCRIBE_STRLEN];
+
+ dbenv = env->dbenv;
+ mutexp = MUTEXP_SET(env, mutex);
+ flags = mutexp->flags;
+ /*
+ * Filter out mutexes which couldn't possibly be "interesting", in order
+ * to reduce the number of possibly costly is_alive() calls. Check that:
+ * it is allocated
+	 * it is either locked, or a shared latch, or a per-process mutex
+	 * it is neither a logical lock, nor self-block, nor already dead.
+ * Self-blocking mutexes are skipped because it is expected that they
+ * can still be locked even though they are really 'idle', as with
+ * the wait case in __lock_get_internal(), LOG->free_commits, and
+ * __rep_waiter->mtx_repwait; or they were allocated by the application.
+ */
+ if (!LF_ISSET(DB_MUTEX_ALLOCATED))
+ return (0);
+ if (!LF_ISSET(
+ DB_MUTEX_SHARED | DB_MUTEX_LOCKED | DB_MUTEX_PROCESS_ONLY))
+ return (0);
+ if (LF_ISSET(
+ DB_MUTEX_SELF_BLOCK | DB_MUTEX_LOGICAL_LOCK | DB_MUTEX_OWNER_DEAD))
+ return (0);
+
+ already_dead = ip != NULL && timespecisset(&ip->dbth_failtime);
+ /*
+	 * The pid in the mutex is valid for locked or per-process mutexes.
+ * The tid is correct only when exclusively locked. It's okay to look at
+ * the tid of an unlocked per-process mutex, we won't use it in the
+ * is_alive() call.
+ */
+ if (LF_ISSET(DB_MUTEX_LOCKED | DB_MUTEX_PROCESS_ONLY)) {
+ pid = mutexp->pid;
+ threadid = mutexp->tid;
+ } else {
+ DB_ASSERT(env, LF_ISSET(DB_MUTEX_SHARED));
/*
- * The thread that allocated the mutex may have exited, but
- * we cannot reclaim the mutex if the process is still alive.
+	 * If we get here with no thread, then this is a shared latch
+	 * which is neither locked nor shared; we're done with it.
*/
- if (dbenv->is_alive(
- dbenv, mutexp->pid, unused, DB_MUTEX_PROCESS_ONLY))
- continue;
+ if (ip == NULL)
+ return (0);
+ pid = ip->dbth_pid;
+ threadid = ip->dbth_tid;
+ }
+ if (!already_dead && dbenv->is_alive(dbenv,
+ pid, threadid, LF_ISSET(DB_MUTEX_PROCESS_ONLY)))
+ return (0);
+
+ /* The thread is dead; the mutex type indicates the kind of cleanup. */
+ (void)dbenv->thread_id_string(dbenv, pid, threadid, id_str);
+ (void)__mutex_describe(env, mutex, mtx_desc);
- __db_msg(env, DB_STR_A("2017",
- "Freeing mutex for process: %s", "%s"),
- dbenv->thread_id_string(dbenv, mutexp->pid, unused, buf));
+ if (LF_ISSET(DB_MUTEX_PROCESS_ONLY)) {
+ if (already_dead)
+ return (0);
+
+ __db_errx(env, DB_STR_A("2065",
+ "Freeing %s for process: %s", "%s %s"), mtx_desc, id_str);
+
+ /* Clear the mutex id if it is in a cached locker. */
+ if ((ret = __lock_local_locker_invalidate(env, mutex)) != 0)
+ return (ret);
/* Unlock and free the mutex. */
- if (F_ISSET(mutexp, DB_MUTEX_LOCKED))
- MUTEX_UNLOCK(env, i);
+ if (LF_ISSET(DB_MUTEX_LOCKED))
+ MUTEX_UNLOCK(env, mutex);
- if ((ret = __mutex_free_int(env, 0, &i)) != 0)
- break;
+ return (__mutex_free_int(env, 0, &mutex));
}
- MUTEX_SYSTEM_UNLOCK(env);
-
- return (ret);
+#ifdef HAVE_FAILCHK_BROADCAST
+ else if (LF_ISSET(DB_MUTEX_LOCKED)) {
+ __db_errx(env, DB_STR_A("2066",
+ "Marking %s as owned by dead thread %s", "%lu %s"),
+ mtx_desc, id_str);
+ F_SET(mutexp, DB_MUTEX_OWNER_DEAD);
+ } else if (LF_ISSET(DB_MUTEX_SHARED)) {
+ __db_errx(env, DB_STR_A("2067",
+ "Marking %s as shared by dead thread %s", "%lu %s"),
+ mtx_desc, id_str);
+ F_SET(mutexp, DB_MUTEX_OWNER_DEAD);
+ } else {
+ __db_errx(env, DB_STR_A("2068",
+ "mutex_failchk: unknown state for %s with dead thread %s", "%lu %s"),
+ mtx_desc, id_str);
+ }
+#endif
+ return (USR_ERR(env, DB_RUNRECOVERY));
}