diff options
author | Lukasz Dorau <lukasz.dorau@intel.com> | 2011-09-01 15:10:34 +0200 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2011-09-06 15:19:58 +1000 |
commit | ba714450698a966d184f5337235b100cbfa8685e (patch) | |
tree | b6a1f482af99439ef2cbc5e2d4afbbfa71cd1ae7 | |
parent | 3960e579bfe77b3e16a4a6d8546019ff794eb515 (diff) | |
download | mdadm-ba714450698a966d184f5337235b100cbfa8685e.tar.gz |
FIX: Mdmon crashes after changing RAID level from 1 to 0
Description of the bug:
Sometimes mdmon crashes after changing RAID level from 1 to 0 (takeover).
Cause of the bug:
The managemon marks an active_array for removal from monitoring
by assigning NULL to a->container (in the "manage_member" function).
Sometimes (during stress test) it happens right when the monitor
is in the "read_and_act" function and a->container pointer is in use.
This causes the monitor to crash.
Solution:
The active array has to be marked for removal in some way other than
setting the pointer to NULL, because the pointer may still be in use.
A new field "to_remove" was added to the "active_array" structure.
It is used in the managemon to mark an array for removal
(instead of the old assignment: a->container = NULL)
and monitor checks it to determine if the array should be removed.
The field "to_remove" should also be checked in some other places
to avoid managing an array that is about to be removed.
Signed-off-by: Lukasz Dorau <lukasz.dorau@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r-- | managemon.c | 4 | ||||
-rw-r--r-- | mdmon.h | 1 | ||||
-rw-r--r-- | monitor.c | 8 |
3 files changed, 7 insertions, 6 deletions
diff --git a/managemon.c b/managemon.c index d020f82..9e0a34d 100644 --- a/managemon.c +++ b/managemon.c @@ -461,7 +461,7 @@ static void manage_member(struct mdstat_ent *mdstat, if (mdstat->level) { int level = map_name(pers, mdstat->level); if (level == 0 || level == LEVEL_LINEAR) { - a->container = NULL; + a->to_remove = 1; wakeup_monitor(); return; } @@ -739,7 +739,7 @@ void manage(struct mdstat_ent *mdstat, struct supertype *container) /* Looks like a member of this container */ for (a = container->arrays; a; a = a->next) { if (mdstat->devnum == a->devnum) { - if (a->container) + if (a->container && a->to_remove == 0) manage_member(mdstat, a); break; } @@ -28,6 +28,7 @@ struct active_array { struct mdinfo info; struct supertype *container; struct active_array *next, *replaces; + int to_remove; int action_fd; int resync_start_fd; @@ -479,7 +479,7 @@ static void reconcile_failed(struct active_array *aa, struct mdinfo *failed) struct mdinfo *victim; for (a = aa; a; a = a->next) { - if (!a->container) + if (!a->container || a->to_remove) continue; victim = find_device(a, failed->disk.major, failed->disk.minor); if (!victim) @@ -539,7 +539,7 @@ static int wait_and_act(struct supertype *container, int nowait) /* once an array has been deactivated we want to * ask the manager to discard it. 
*/ - if (!a->container) { + if (!a->container || a->to_remove) { if (discard_this) { ap = &(*ap)->next; continue; @@ -642,7 +642,7 @@ static int wait_and_act(struct supertype *container, int nowait) /* FIXME check if device->state_fd need to be cleared?*/ signal_manager(); } - if (a->container) { + if (a->container && !a->to_remove) { is_dirty = read_and_act(a); rv |= 1; dirty_arrays += is_dirty; @@ -657,7 +657,7 @@ static int wait_and_act(struct supertype *container, int nowait) /* propagate failures across container members */ for (a = *aap; a ; a = a->next) { - if (!a->container) + if (!a->container || a->to_remove) continue; for (mdi = a->info.devs ; mdi ; mdi = mdi->next) if (mdi->curr_state & DS_FAULTY) |