author     Sergei Golubchik <serg@mysql.com>  2008-08-28 14:43:44 +0200
committer  Sergei Golubchik <serg@mysql.com>  2008-08-28 14:43:44 +0200
commit     942651ea6cc2b7537aa45ff1d55d64be4e191a16
tree       32ccde19f332392c00a45b5bc98fd65e810999e0 /mysys/waiting_threads.c
parent     ca23272e1e53e195169bec0609eb0168722e1879
download   mariadb-git-942651ea6cc2b7537aa45ff1d55d64be4e191a16.tar.gz
wt: comments, OOM checks, test case for deadlock detection
include/waiting_threads.h:
make wt_thd_dontwait private
mysql-test/r/maria.result:
deadlock example
mysql-test/t/maria.test:
deadlock example
mysys/waiting_threads.c:
comments, OOM checks
sql/mysqld.cc:
fix variables
sql/sql_class.cc:
move wt_lazy_init to THD constructor
sql/sql_class.h:
move wt_lazy_init to THD constructor
storage/maria/ha_maria.cc:
backport from 6.0
storage/maria/ma_write.c:
post-review fixes, set thd->proc_info
storage/maria/trnman.c:
bugfixing
storage/myisam/mi_check.c:
warnings
storage/myisam/mi_page.c:
warnings
storage/myisam/mi_search.c:
warnings
storage/myisammrg/myrg_create.c:
warnings
unittest/mysys/waiting_threads-t.c:
fixes
Diffstat (limited to 'mysys/waiting_threads.c')
-rw-r--r--  mysys/waiting_threads.c  352
1 file changed, 293 insertions, 59 deletions
diff --git a/mysys/waiting_threads.c b/mysys/waiting_threads.c
index 61b89f7eb64..78cea6c9673 100644
--- a/mysys/waiting_threads.c
+++ b/mysys/waiting_threads.c
@@ -14,6 +14,77 @@
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
 
 /*
+  "waiting threads" subsystem - a unified interface for threads to wait
+  on each other, with built-in deadlock detection.
+
+  Main concepts
+  ^^^^^^^^^^^^^
+  a thread - is represented by a WT_THD structure. One physical thread
+  can have only one WT_THD descriptor.
+
+  a resource - a thread does not wait for other threads directly,
+  instead it waits for a "resource", which is "owned" by other threads.
+  It waits, precisely, for all "owners" to "release" the resource.
+  It does not have to correspond to a physical resource. For example, it
+  may be convenient in certain cases to force resource == thread.
+  A resource is represented by a WT_RESOURCE structure.
+
+  a resource identifier - a pair of {resource type, value}. A value is
+  either a ulonglong number or a pointer (it's a union).
+  See the WT_RESOURCE_ID structure.
+
+  a resource type - a pointer to a statically defined instance of a
+  WT_RESOURCE_TYPE structure. This structure contains a pointer to
+  a function that knows how to compare values of this resource type.
+  In the simplest case it could be wt_resource_id_memcmp().
+
+  Usage
+  ^^^^^
+  to use the interface, a thread takes its own WT_THD,
+  calls wt_thd_will_wait_for() for every thread it needs to wait on,
+  then calls wt_thd_cond_timedwait(). When a thread releases a resource
+  it should call wt_thd_release() (or wt_thd_release_all()) - it will
+  notify (send a signal to) the threads waiting in wt_thd_cond_timedwait(),
+  if appropriate.
+
+  Just like with pthread's cond_wait, there could be spurious
+  wake-ups from wt_thd_cond_timedwait(). A caller is expected to
+  handle that.
+
+  wt_thd_will_wait_for() and wt_thd_cond_timedwait() return either
+  WT_OK or WT_DEADLOCK. Additionally, wt_thd_cond_timedwait() can return
+  WT_TIMEOUT. Out of memory and other fatal errors are reported as
+  WT_DEADLOCK - and the transaction must be aborted just the same.
+
+  Configuration
+  ^^^^^^^^^^^^^
+  There are four config variables: two deadlock search depths - short and
+  long - and two timeouts. A deadlock search is performed with the short
+  depth on every wt_thd_will_wait_for() call. wt_thd_cond_timedwait()
+  waits with the short timeout, performs a deadlock search with the long
+  depth, and then waits with the long timeout. As most deadlock cycles
+  are expected to be short, most deadlocks will be detected at once, and
+  long waits will rarely be necessary.
+
+  These config variables are thread-local. Different threads may have
+  different search depth and timeout values.
+
+  The deadlock detector also supports different killing strategies: the
+  victim in a deadlock cycle is selected based on its "weight". See the
+  "weight" description in waiting_threads.h for details. It's up to the
+  caller to set the weights accordingly.
+
+  Status
+  ^^^^^^
+  We count the number of successful waits (WT_OK returned from
+  wt_thd_cond_timedwait()), the number of timeouts, a deadlock cycle
+  length distribution - the number of deadlocks with every length from
+  1 to WT_CYCLE_STATS - and a wait time distribution - the number
+  of waits taking from 1 us to 1 min, in WT_CYCLE_STATS
+  intervals on a log scale.
+*/
+
+/*
   Note that if your lock system satisfy the following condition:
 
     there exist four lock levels A, B, C, D, such as
@@ -114,8 +185,18 @@ static my_atomic_rwlock_t cycle_stats_lock, wait_stats_lock, success_stats_lock;
     pthread_rwlock_unlock(&R->lock);                            \
   } while (0)
 
+/*
+  All resources are stored in a lock-free hash. Different threads
+  may add new resources and perform deadlock detection concurrently.
+*/
 static LF_HASH reshash;
 
+/**
+  WT_RESOURCE constructor
+
+  It's called from lf_hash and takes an offset to an LF_SLIST instance.
+  WT_RESOURCE is located at arg+sizeof(LF_SLIST)
+*/
 static void wt_resource_init(uchar *arg)
 {
   WT_RESOURCE *rc=(WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
@@ -124,10 +205,16 @@ static void wt_resource_init(uchar *arg)
   bzero(rc, sizeof(*rc));
   pthread_rwlock_init(&rc->lock, 0);
   pthread_cond_init(&rc->cond, 0);
-  my_init_dynamic_array(&rc->owners, sizeof(WT_THD *), 5, 5);
+  my_init_dynamic_array(&rc->owners, sizeof(WT_THD *), 0, 5);
   DBUG_VOID_RETURN;
 }
 
+/**
+  WT_RESOURCE destructor
+
+  It's called from lf_hash and takes an offset to an LF_SLIST instance.
+  WT_RESOURCE is located at arg+sizeof(LF_SLIST)
+*/
 static void wt_resource_destroy(uchar *arg)
 {
   WT_RESOURCE *rc=(WT_RESOURCE*)(arg+LF_HASH_OVERHEAD);
@@ -159,7 +246,7 @@ void wt_init()
   bzero(wt_wait_stats, sizeof(wt_wait_stats));
   bzero(wt_cycle_stats, sizeof(wt_cycle_stats));
   wt_success_stats=0;
-  {
+  { /* initialize wt_wait_table[]. from 1 us to 1 min, log scale */
     int i;
     double from=log(1);   /* 1 us */
     double to=log(60e6);  /* 1 min */
@@ -187,17 +274,20 @@ void wt_end()
   DBUG_VOID_RETURN;
 }
 
-static void fix_thd_pins(WT_THD *thd)
-{
-  if (unlikely(thd->pins == 0))
-  {
-    thd->pins=lf_hash_get_pins(&reshash);
-#ifndef DBUG_OFF
-    thd->name=my_thread_name();
-#endif
-  }
-}
+/**
+  Lazy WT_THD initialization
+
+  Cheap initialization of WT_THD. It only initializes fields that don't
+  require memory allocations - basically, it only does assignments. The
+  rest of the WT_THD structure will be initialized on demand, on the first use.
+  This allows one to lazily initialize all WT_THD structures, even if some
+  (or even most) of them will never be used for deadlock detection.
+  @param ds  a pointer to the deadlock search depth short value
+  @param ts  a pointer to the deadlock timeout short value
+  @param dl  a pointer to the deadlock search depth long value
+  @param tl  a pointer to the deadlock timeout long value
+*/
 void wt_thd_lazy_init(WT_THD *thd, ulong *ds, ulong *ts, ulong *dl, ulong *tl)
 {
   DBUG_ENTER("wt_thd_lazy_init");
@@ -209,6 +299,7 @@ void wt_thd_lazy_init(WT_THD *thd, ulong *ds, ulong *ts, ulong *dl, ulong *tl)
   thd->timeout_short= ts;
   thd->deadlock_search_depth_long= dl;
   thd->timeout_long= tl;
+  /* the dynamic array is also initialized lazily - without memory allocations */
   my_init_dynamic_array(&thd->my_resources, sizeof(WT_RESOURCE *), 0, 5);
 #ifndef DBUG_OFF
   thd->name=my_thread_name();
@@ -216,6 +307,26 @@ void wt_thd_lazy_init(WT_THD *thd, ulong *ds, ulong *ts, ulong *dl, ulong *tl)
   DBUG_VOID_RETURN;
 }
 
+/**
+  Finalize WT_THD initialization
+
+  After lazy WT_THD initialization, parts of the structure are still
+  uninitialized. This function completes the initialization, allocating
+  memory, if necessary. It's called automatically on demand, when WT_THD
+  is about to be used.
+*/
+static int fix_thd_pins(WT_THD *thd)
+{
+  if (unlikely(thd->pins == 0))
+  {
+    thd->pins=lf_hash_get_pins(&reshash);
+#ifndef DBUG_OFF
+    thd->name=my_thread_name();
+#endif
+  }
+  return thd->pins == 0;
+}
+
 void wt_thd_destroy(WT_THD *thd)
 {
   DBUG_ENTER("wt_thd_destroy");
@@ -229,19 +340,30 @@ void wt_thd_destroy(WT_THD *thd)
   thd->waiting_for=0;
   DBUG_VOID_RETURN;
 }
 
+/**
+  Trivial resource id comparison function - bytewise memcmp.
+
+  It can be used in WT_RESOURCE_TYPE structures where bytewise
+  comparison of values is sufficient.
+*/
 int wt_resource_id_memcmp(void *a, void *b)
 {
   return memcmp(a, b, sizeof(WT_RESOURCE_ID));
 }
 
+/**
+  arguments for the recursive deadlock_search() function
+*/
 struct deadlock_arg {
-  WT_THD *thd;
-  uint max_depth;
-  WT_THD *victim;
-  WT_RESOURCE *rc;
+  WT_THD *thd;          /**< starting point of a search */
+  uint max_depth;       /**< search depth limit */
+  WT_THD *victim;       /**< a thread to be killed to resolve a deadlock */
+  WT_RESOURCE *rc;      /**< see the comment at the end of deadlock_search() */
 };
 
+/**
+  helper function to change the victim, according to the weight
+*/
 static void change_victim(WT_THD* found, struct deadlock_arg *arg)
 {
   if (found->weight < arg->victim->weight)
@@ -256,8 +378,8 @@ static void change_victim(WT_THD* found, struct deadlock_arg *arg)
   }
 }
 
-/*
-  loop detection in a wait-for graph with a limited search depth.
+/**
+  recursive loop detection in a wait-for graph with a limited search depth
 */
 static int deadlock_search(struct deadlock_arg *arg, WT_THD *blocker,
                            uint depth)
@@ -301,11 +423,41 @@ retry:
     lf_unpin(arg->thd->pins, 0);
     goto retry;
   }
+  /* as the state is locked, we can unpin now */
   lf_unpin(arg->thd->pins, 0);
 
+  /*
+    Below is not a pure depth-first search. It's depth-first with a
+    slight hint of breadth-first. Depth-first is:
+
+      check(element):
+        foreach current in element->nodes[] do:
+          if current == element return error;
+          check(current);
+
+    while we do
+
+      check(element):
+        foreach current in element->nodes[] do:
+          if current == element return error;
+        foreach current in element->nodes[] do:
+          check(current);
+  */
   for (i=0; i < rc->owners.elements; i++)
   {
     cursor= *dynamic_element(&rc->owners, i, WT_THD**);
+    /*
+      We're only looking for (and detecting) cycles that include 'arg->thd'.
+      That is, only deadlocks that *we* have created. For example,
+        thd->A->B->thd
+      (thd waits for A, A waits for B, while B is waiting for thd).
+      While walking the graph we can encounter other cycles, e.g.
+        thd->A->B->C->A
+      This one will not be detected. Instead we will walk it in circles until
+      the search depth limit is reached (the latter guarantees that an
+      infinite loop is impossible). We expect the thread that has created
+      the cycle (one of A, B, and C) to detect its own deadlock.
+    */
     if (cursor == arg->thd)
     {
       ret= WT_DEADLOCK;
@@ -319,16 +471,15 @@ retry:
     {
       cursor= *dynamic_element(&rc->owners, i, WT_THD**);
       switch (deadlock_search(arg, cursor, depth+1)) {
+      case WT_OK:
+        break;
       case WT_DEPTH_EXCEEDED:
         ret= WT_DEPTH_EXCEEDED;
         break;
       case WT_DEADLOCK:
         ret= WT_DEADLOCK;
-        change_victim(cursor, arg);
-        if (arg->rc)
-          rc_unlock(arg->rc);
-        goto end;
-      case WT_OK:
+        change_victim(cursor, arg);     /* also sets arg->rc to 0 */
+        i= rc->owners.elements;         /* jump out of the loop */
         break;
       default:
         DBUG_ASSERT(0);
@@ -337,6 +488,34 @@ retry:
       rc_unlock(arg->rc);
   }
 end:
+  /*
+    Note that 'rc' is locked in this function, but it's never unlocked here.
+    Instead it's saved in arg->rc and the *caller* is expected to unlock it.
+    It's done this way to support different killing strategies. This is how
+    it works, assuming a graph
+
+      thd->A->B->C->thd
+
+    deadlock_search() starts from thd and locks it (in fact it locks not
+    the thd, but the resource it is waiting on, but below, for simplicity, I'll
+    talk about "locking a thd"). Then it goes down recursively, locks A, and so
+    on. Goes down recursively, locks B. Goes down recursively, locks C.
+    Notices that C is waiting on thd. Deadlock detected. Sets arg->victim=thd.
+    Returns from the last deadlock_search() call. C stays locked!
+    Now it checks whether C is a more appropriate victim than 'thd'.
+    If yes - arg->victim=C, otherwise C is unlocked. Returns. B stays locked.
+    Now it checks whether B is a more appropriate victim than arg->victim.
+    If yes - the old arg->victim is unlocked and arg->victim=B,
+    otherwise B is unlocked. Return.
+    And so on.
+
+    In short, a resource is locked in a frame. But it's not unlocked in the
+    same frame, it's unlocked by the caller, and only after the caller checks
+    that it doesn't need to use the current WT_THD as a victim. If it does -
+    the lock is kept and the old victim's resource is unlocked. When the
+    recursion is unrolled and we are back in the deadlock() function, there
+    are only two locks left - on thd and on the victim.
+  */
   arg->rc= rc;
   DBUG_PRINT("wt", ("exit: %s",
                     ret == WT_DEPTH_EXCEEDED ? "WT_DEPTH_EXCEEDED" :
@@ -344,6 +523,25 @@ end:
   DBUG_RETURN(ret);
 }
 
+/**
+  Deadlock detection in a wait-for graph
+
+  A wrapper for the recursive deadlock_search() - prepares the deadlock_arg
+  structure, invokes deadlock_search(), increments statistics, notifies the victim.
+
+  @param thd       the thread that is going to wait. A deadlock is detected
+                   if, while walking the graph, we reach a thread that
+                   is waiting on thd
+  @param blocker   starting point of a search. In wt_thd_cond_timedwait()
+                   it's thd, in wt_thd_will_wait_for() it's the thread that
+                   thd is going to wait for
+  @param depth     starting search depth. In general it's the number of
+                   edges in the wait-for graph between thd and the
+                   blocker. Practically only two values are used (and
+                   supported) - when thd == blocker it's 0, when thd
+                   waits directly for the blocker, it's 1
+  @param max_depth search depth limit
+*/
 static int deadlock(WT_THD *thd, WT_THD *blocker, uint depth,
                     uint max_depth)
 {
@@ -357,10 +555,15 @@ static int deadlock(WT_THD *thd, WT_THD *blocker, uint depth,
                             *thd->deadlock_search_depth_long);
     ret= WT_OK;
   }
+  /*
+    if we started with depth==1, the blocker was never considered as a victim
+    in deadlock_search(). Do it here.
+  */
   if (ret == WT_DEADLOCK && depth)
     change_victim(blocker, &arg);
   if (arg.rc)
     rc_unlock(arg.rc);
+  /* notify the victim, if appropriate */
   if (ret == WT_DEADLOCK && arg.victim != thd)
   {
     DBUG_PRINT("wt", ("killing %s", arg.victim->name));
@@ -373,11 +576,12 @@ static int deadlock(WT_THD *thd, WT_THD *blocker, uint depth,
 }
 
-/*
-  Deletes an element from reshash.
+/**
+  Delete an element from reshash if it has no waiters or owners
+
+  rc->lock must be locked by the caller and it's unlocked on return.
 */
-static void unlock_lock_and_free_resource(WT_THD *thd, WT_RESOURCE *rc)
+static int unlock_lock_and_free_resource(WT_THD *thd, WT_RESOURCE *rc)
 {
   uint keylen;
   const void *key;
@@ -390,10 +594,14 @@
     DBUG_PRINT("wt", ("nothing to do, %d owners, %d waiters",
                       rc->owners.elements, rc->waiter_count));
     rc_unlock(rc);
-    DBUG_VOID_RETURN;
+    DBUG_RETURN(0);
   }
 
-  fix_thd_pins(thd);
+  if (fix_thd_pins(thd))
+  {
+    rc_unlock(rc);
+    DBUG_RETURN(1);
+  }
 
   /* XXX if (rc->id.type->make_key) key= rc->id.type->make_key(&rc->id, &keylen); else */
   {
@@ -414,29 +622,40 @@
   */
   rc->state=FREE;
   rc_unlock(rc);
-  lf_hash_delete(&reshash, thd->pins, key, keylen);
-  DBUG_VOID_RETURN;
+  DBUG_RETURN(lf_hash_delete(&reshash, thd->pins, key, keylen) == -1);
 }
 
-int wt_thd_dontwait_locked(WT_THD *thd)
+/**
+  register the fact that thd is not waiting anymore
+
+  decrease waiter_count, clear waiting_for, free the resource if appropriate.
+  thd->waiting_for must be locked!
+*/
+static int stop_waiting_locked(WT_THD *thd)
 {
+  int ret;
   WT_RESOURCE *rc= thd->waiting_for;
-  DBUG_ENTER("wt_thd_dontwait_locked");
+  DBUG_ENTER("stop_waiting_locked");
 
   DBUG_ASSERT(rc->waiter_count);
   DBUG_ASSERT(rc->state == ACTIVE);
   rc->waiter_count--;
   thd->waiting_for= 0;
-  unlock_lock_and_free_resource(thd, rc);
-  DBUG_RETURN(thd->killed ? WT_DEADLOCK : WT_OK);
+  ret= unlock_lock_and_free_resource(thd, rc);
+  DBUG_RETURN((thd->killed || ret) ? WT_DEADLOCK : WT_OK);
 }
 
-int wt_thd_dontwait(WT_THD *thd)
+/**
+  register the fact that thd is not waiting anymore
+
+  locks thd->waiting_for and calls stop_waiting_locked().
+*/
+static int stop_waiting(WT_THD *thd)
 {
   int ret;
   WT_RESOURCE *rc= thd->waiting_for;
-  DBUG_ENTER("wt_thd_dontwait");
+  DBUG_ENTER("stop_waiting");
 
   if (!rc)
     DBUG_RETURN(WT_OK);
@@ -445,15 +664,20 @@
     as its waiter_count is guaranteed to be non-zero
   */
   rc_wrlock(rc);
-  ret= wt_thd_dontwait_locked(thd);
+  ret= stop_waiting_locked(thd);
   DBUG_RETURN(ret);
 }
 
-/*
+/**
+  notify the system that a thread needs to wait for another thread
+
   called by a *waiter* to declare what resource it will wait for.
   can be called many times, if many blockers own a blocking resource.
   but must always be called with the same resource id - a thread cannot
   wait for more than one resource at a time.
+
+  As a new edge is added to the wait-for graph, a deadlock detection is
+  performed for this new edge.
 */
 int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker, WT_RESOURCE_ID *resid)
 {
@@ -466,7 +690,8 @@ int wt_thd_will_wait_for(WT_THD *thd, WT_THD *blocker, WT_RESOURCE_ID *resid)
   DBUG_PRINT("wt", ("enter: thd=%s, blocker=%s, resid=%llu",
                     thd->name, blocker->name, resid->value.num));
 
-  fix_thd_pins(thd);
+  if (fix_thd_pins(thd))
+    DBUG_RETURN(WT_DEADLOCK);
 
   if (thd->waiting_for == 0)
   {
@@ -487,14 +712,11 @@ retry:
       DBUG_PRINT("wt", ("failed to find rc in hash, inserting"));
       bzero(&tmp, sizeof(tmp));
-      tmp.waiter_count= 0;
       tmp.id= *resid;
       tmp.state= ACTIVE;
-#ifndef DBUG_OFF
-      tmp.mutex= 0;
-#endif
-      lf_hash_insert(&reshash, thd->pins, &tmp);
+      if (lf_hash_insert(&reshash, thd->pins, &tmp) == -1) /* if OOM */
+        DBUG_RETURN(WT_DEADLOCK);
       /*
         Two cases: either lf_hash_insert() failed - because another thread
         has just inserted a resource with the same id - and we need to retry.
        Or lf_hash_insert() succeeded, and then we need to repeat
        lf_hash_search() to find a real address of the newly inserted element.
        And we need to repeat the loop anyway.
      */
    }
+    if (rc == MY_ERRPTR)
+      DBUG_RETURN(WT_DEADLOCK);
+
     DBUG_PRINT("wt", ("found in hash rc=%p", rc));
 
     rc_wrlock(rc);
@@ -520,7 +745,6 @@ retry:
     thd->waiting_for= rc;
     rc->waiter_count++;
     thd->killed= 0;
-
   }
   else
   {
@@ -539,7 +763,7 @@ retry:
     if (thd->killed)
     {
-      wt_thd_dontwait_locked(thd);
+      stop_waiting_locked(thd);
       DBUG_RETURN(WT_DEADLOCK);
     }
   }
@@ -548,20 +772,29 @@ retry:
       break;
   if (i >= rc->owners.elements)
   {
-    push_dynamic(&blocker->my_resources, (void*)&rc);
-    push_dynamic(&rc->owners, (void*)&blocker);
+    if (push_dynamic(&blocker->my_resources, (void*)&rc))
+    {
+      stop_waiting_locked(thd);
+      DBUG_RETURN(WT_DEADLOCK); /* deadlock and OOM use the same error code */
+    }
+    if (push_dynamic(&rc->owners, (void*)&blocker))
+    {
+      pop_dynamic(&blocker->my_resources);
+      stop_waiting_locked(thd);
+      DBUG_RETURN(WT_DEADLOCK);
+    }
   }
   rc_unlock(rc);
 
   if (deadlock(thd, blocker, 1, *thd->deadlock_search_depth_short))
   {
-    wt_thd_dontwait(thd);
+    stop_waiting(thd);
     DBUG_RETURN(WT_DEADLOCK);
   }
   DBUG_RETURN(0);
 }
 
-/*
+/**
   called by a *waiter* to start waiting
 
   It's supposed to be a drop-in replacement for
@@ -595,7 +828,7 @@ int wt_thd_cond_timedwait(WT_THD *thd, pthread_mutex_t *mutex)
 #endif
 
   rc_wrlock(rc);
-  if (rc->owners.elements == 0 && thd->killed)
+  if (rc->owners.elements == 0 || thd->killed)
     ret= WT_OK;
   rc_unlock(rc);
 
@@ -614,7 +847,7 @@ int wt_thd_cond_timedwait(WT_THD *thd, pthread_mutex_t *mutex)
     }
   }
   after= my_getsystime();
-  if (wt_thd_dontwait(thd) == WT_DEADLOCK)
+  if (stop_waiting(thd) == WT_DEADLOCK) /* if we're killed */
     ret= WT_DEADLOCK;
   increment_wait_stats(after-before, ret);
   if (ret == WT_OK)
@@ -622,23 +855,24 @@ int wt_thd_cond_timedwait(WT_THD *thd, pthread_mutex_t *mutex)
   DBUG_RETURN(ret);
 }
 
-/*
+/**
   called by a *blocker* when it releases a resource
 
-  when resid==0 all resources will be freed
-
-  Note: it's conceptually similar to pthread_cond_broadcast, and must be done
+  it's conceptually similar to pthread_cond_broadcast, and must be done
   under the same mutex as wt_thd_cond_timedwait().
+
+  @param resid  a resource to release. 0 to release all resources
 */
+
 void wt_thd_release(WT_THD *thd, WT_RESOURCE_ID *resid)
 {
-  WT_RESOURCE *rc;
-  uint i, j;
+  uint i;
   DBUG_ENTER("wt_thd_release");
 
   for (i=0; i < thd->my_resources.elements; i++)
   {
-    rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**);
+    uint j;
+    WT_RESOURCE *rc= *dynamic_element(&thd->my_resources, i, WT_RESOURCE**);
     if (!resid || (resid->type->compare(&rc->id, resid) == 0))
    {
      rc_wrlock(rc);
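
The header comment added at the top of waiting_threads.c describes the caller-facing API. Below is a minimal usage sketch (not part of the commit) showing how a caller might drive it: one value-compared resource type built on wt_resource_id_memcmp(), a WT_THD prepared with wt_thd_lazy_init(), the waiter path (wt_thd_will_wait_for() followed by wt_thd_cond_timedwait() under a caller-owned mutex) and the blocker path (wt_thd_release() under the same mutex). The names trn_rc_type, wait_for_trn, release_trn and trn_mutex are hypothetical, and the WT_RESOURCE_TYPE initializer layout and the config-variable values/units are assumptions, not taken from this diff.

/* usage sketch only - field order in WT_RESOURCE_TYPE and config units are assumptions */
#include <my_global.h>
#include <waiting_threads.h>     /* WT_THD, WT_RESOURCE_ID, wt_* functions */

/* a resource type whose values are compared bytewise (see wt_resource_id_memcmp) */
static WT_RESOURCE_TYPE trn_rc_type= { wt_resource_id_memcmp, 0 };

/* thread-local config the WT_THD points at; values here are arbitrary */
static ulong depth_short= 4,   depth_long= 15;
static ulong timeout_short= 100, timeout_long= 10000;

static pthread_mutex_t trn_mutex= PTHREAD_MUTEX_INITIALIZER;

/* called once per thread; cheap, no allocations - see wt_thd_lazy_init() */
static void thd_wt_setup(WT_THD *me)
{
  wt_thd_lazy_init(me, &depth_short, &timeout_short,
                       &depth_long,  &timeout_long);
}

/* waiter side: declare the wait-for edge, then wait */
static int wait_for_trn(WT_THD *me, WT_THD *owner, ulonglong trn_id)
{
  WT_RESOURCE_ID resid;
  int res;

  resid.type= &trn_rc_type;
  resid.value.num= trn_id;

  /* adds an edge to the wait-for graph; runs the short-depth deadlock search */
  if (wt_thd_will_wait_for(me, owner, &resid) == WT_DEADLOCK)
    return 1;                         /* we are the victim (or OOM) - abort */

  pthread_mutex_lock(&trn_mutex);
  res= wt_thd_cond_timedwait(me, &trn_mutex);  /* WT_OK / WT_TIMEOUT / WT_DEADLOCK */
  pthread_mutex_unlock(&trn_mutex);

  return res == WT_DEADLOCK;          /* spurious wake-ups: upper layer re-checks */
}

/* blocker side: wake the waiters, under the same mutex as the wait */
static void release_trn(WT_THD *me, ulonglong trn_id)
{
  WT_RESOURCE_ID resid;
  resid.type= &trn_rc_type;
  resid.value.num= trn_id;

  pthread_mutex_lock(&trn_mutex);
  wt_thd_release(me, &resid);         /* wt_thd_release(me, 0) releases all */
  pthread_mutex_unlock(&trn_mutex);
}

In this scheme, wt_thd_will_wait_for() already runs the short-depth deadlock search before the caller ever blocks, so most deadlocks are reported immediately; the long-depth search and the two timeouts only come into play inside wt_thd_cond_timedwait(), as described in the Configuration section of the header comment.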