From e0cc1c8d8b29e28f0977fc16db54a38a44274765 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 25 May 2015 16:33:45 +1000
Subject: Grow: another attempt to fix stop-during-reshape race.

When the array is stopped during a critical section, we sometimes
erase the backup, which is bad.

This happens when 'completed' is zero. This can happen easily
when 'stop' freezes reshape.
So try to be more careful and check 'reshape_position'.

Signed-off-by: NeilBrown
---
 Grow.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/Grow.c b/Grow.c
index f2cf46a..a20ff3e 100644
--- a/Grow.c
+++ b/Grow.c
@@ -3858,27 +3858,30 @@ int progress_reshape(struct mdinfo *info, struct reshape *reshape,
 	}
 	/* Some kernels reset 'sync_completed' to zero,
 	 * we need to have real point we are in md.
-	 * But only if array is actually still reshaping,
-	 * not stopped.
+	 * So in that case, read 'reshape_position' from sysfs.
 	 */
 	if (completed == 0) {
+		unsigned long long reshapep;
 		char action[20];
 		if (sysfs_get_str(info, NULL, "sync_action",
 				  action, 20) > 0 &&
-		    strncmp(action, "idle", 4) == 0)
-			completed = max_progress;
-	}
-
-	/* some kernels can give an incorrectly high 'completed' number */
-	completed /= (info->new_chunk/512);
-	completed *= (info->new_chunk/512);
-	/* Convert 'completed' back in to a 'progress' number */
-	completed *= reshape->after.data_disks;
-	if (!advancing) {
-		completed = info->component_size * reshape->after.data_disks
-			- completed;
+		    strncmp(action, "idle", 4) == 0 &&
+		    sysfs_get_ll(info, NULL,
+				 "reshape_position", &reshapep) == 0)
+			*reshape_completed = reshapep;
+	} else {
+		/* some kernels can give an incorrectly high
+		 * 'completed' number, so round down */
+		completed /= (info->new_chunk/512);
+		completed *= (info->new_chunk/512);
+		/* Convert 'completed' back in to a 'progress' number */
+		completed *= reshape->after.data_disks;
+		if (!advancing)
+			completed = (info->component_size
+				     * reshape->after.data_disks
+				     - completed);
+		*reshape_completed = completed;
 	}
-	*reshape_completed = completed;
 
 	close(fd);
 
@@ -3898,7 +3901,6 @@ check_progress:
 	 * it was just a device failure that leaves us degraded but
 	 * functioning.
 	 */
-	strcpy(buf, "hi");
 	if (sysfs_get_str(info, NULL, "reshape_position", buf, sizeof(buf)) < 0
 	    || strncmp(buf, "none", 4) != 0) {
 		/* The abort might only be temporary. Wait up to 10
-- 
cgit v1.2.1