FIX: Mdmon crashes after changing RAID level from 1 to 0

Description of the bug: Sometimes mdmon crashes after changing the RAID level from 1 to 0 (takeover). Cause of the bug: The managemon marks an active_array for removal from monitoring by setting a->container to NULL (in the "manage_member" function). Sometimes (during stress tests) this happens right when the monitor is in the "read_and_act" function and the a->container pointer is in use, which causes the monitor to crash. Solution: The active array has to be marked for removal in some way other than setting to NULL a pointer that may still be in use. A new field "to_remove" was added to the "active_array" structure. It is used in the managemon to mark an array for removal (instead of the old assignment: a->container = NULL), and the monitor checks it to determine whether the array should be removed. The field "to_remove" should also be checked in some other places to avoid managing an array which is going to be removed. Signed-off-by: Lukasz Dorau <lukasz.dorau@intel.com> Signed-off-by: NeilBrown <neilb@suse.de>
author: Lukasz Dorau <lukasz.dorau@intel.com> 2011-09-01 15:10:34 +0200
committer: NeilBrown <neilb@suse.de> 2011-09-06 15:19:58 +1000
commit: ba714450698a966d184f5337235b100cbfa8685e (patch)
tree: b6a1f482af99439ef2cbc5e2d4afbbfa71cd1ae7
parent: 3960e579bfe77b3e16a4a6d8546019ff794eb515 (diff)
download: mdadm-ba714450698a966d184f5337235b100cbfa8685e.tar.gz
3 files changed, 7 insertions, 6 deletions
diff --git a/managemon.c b/managemon.c
index d020f82..9e0a34d 100644
--- a/managemon.c
+++ b/managemon.c
@@ -461,7 +461,7 @@ static void manage_member(struct mdstat_ent *mdstat,
 	if (mdstat->level) {
 		int level = map_name(pers, mdstat->level);
 		if (level == 0 || level == LEVEL_LINEAR) {
-			a->container = NULL;
+			a->to_remove = 1;
 			wakeup_monitor();
 			return;
 		}
@@ -739,7 +739,7 @@ void manage(struct mdstat_ent *mdstat, struct supertype *container)
 		/* Looks like a member of this container */
 		for (a = container->arrays; a; a = a->next) {
 			if (mdstat->devnum == a->devnum) {
-				if (a->container)
+				if (a->container && a->to_remove == 0)
 					manage_member(mdstat, a);
 				break;
 			}
diff --git a/mdmon.h b/mdmon.h
index 6d1776f..59e1b53 100644
--- a/mdmon.h
+++ b/mdmon.h
@@ -28,6 +28,7 @@ struct active_array {
 	struct mdinfo info;
 	struct supertype *container;
 	struct active_array *next, *replaces;
+	int to_remove;
 
 	int action_fd;
 	int resync_start_fd;
diff --git a/monitor.c b/monitor.c
index 7ac5907..b002e90 100644
--- a/monitor.c
+++ b/monitor.c
@@ -479,7 +479,7 @@ static void reconcile_failed(struct active_array *aa, struct mdinfo *failed)
 	struct mdinfo *victim;
 
 	for (a = aa; a; a = a->next) {
-		if (!a->container)
+		if (!a->container || a->to_remove)
 			continue;
 		victim = find_device(a, failed->disk.major, failed->disk.minor);
 		if (!victim)
@@ -539,7 +539,7 @@ static int wait_and_act(struct supertype *container, int nowait)
 		/* once an array has been deactivated we want to
 		 * ask the manager to discard it.
 		 */
-		if (!a->container) {
+		if (!a->container || a->to_remove) {
 			if (discard_this) {
 				ap = &(*ap)->next;
 				continue;
@@ -642,7 +642,7 @@ static int wait_and_act(struct supertype *container, int nowait)
 			/* FIXME check if device->state_fd need to be cleared?*/
 			signal_manager();
 		}
-		if (a->container) {
+		if (a->container && !a->to_remove) {
 			is_dirty = read_and_act(a);
 			rv |= 1;
 			dirty_arrays += is_dirty;
@@ -657,7 +657,7 @@ static int wait_and_act(struct supertype *container, int nowait)
 
 	/* propagate failures across container members */
 	for (a = *aap; a ; a = a->next) {
-		if (!a->container)
+		if (!a->container || a->to_remove)
 			continue;
 		for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
 			if (mdi->curr_state & DS_FAULTY)
author	Lukasz Dorau <lukasz.dorau@intel.com>	2011-09-01 15:10:34 +0200
committer	NeilBrown <neilb@suse.de>	2011-09-06 15:19:58 +1000
commit	ba714450698a966d184f5337235b100cbfa8685e (patch)
tree	b6a1f482af99439ef2cbc5e2d4afbbfa71cd1ae7
parent	3960e579bfe77b3e16a4a6d8546019ff794eb515 (diff)
download	mdadm-ba714450698a966d184f5337235b100cbfa8685e.tar.gz