From d47c8ad261f787af22a220ffcc2d07afba809223 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 5 Oct 2017 16:23:16 +1100 Subject: md: fix deadlock error in recent patch. A recent patch aimed to cause md_write_start() to fail (rather than block) when the mddev was suspending, so as to avoid deadlocks. Unfortunately the test in wait_event() was wrong, and it didn't change behaviour at all. We wait_event() must wait until the metadata is written OR the array is suspending. Fixes: cc27b0c78c79 ("md: fix deadlock between mddev_suspend() and md_write_start()") Cc: stable@vger.kernel.org Reported-by: Xiao Ni Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 0ff1bbf6c90e..8b2eb0f4122f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8039,7 +8039,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || + mddev->suspended); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { percpu_ref_put(&mddev->writes_pending); return false; -- cgit v1.2.1 From d1d90147c9680aaec4a5757932c2103c42c9c23b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Oct 2017 10:32:48 +0800 Subject: md: always set THREAD_WAKEUP and wake up wqueue if thread existed Since commit 4ad23a976413 ("MD: use per-cpu counter for writes_pending"), the wait_queue is only got invoked if THREAD_WAKEUP is not set previously. With above change, I can see process_metadata_update could always hang on the wait queue, because mddev->thread could stay on 'D' status and the THREAD_WAKEUP flag is not cleared since there are lots of place to wake up mddev->thread. Then deadlock happened as follows: linux175:~ # ps aux|grep md|grep D root 20117 0.0 0.0 0 0 ? D 03:45 0:00 [md0_raid1] root 20125 0.0 0.0 0 0 ? D 03:45 0:00 [md0_cluster_rec] linux175:~ # cat /proc/20117/stack [] dlm_lock_sync+0x94/0xd0 [md_cluster] [] lock_token+0x34/0xd0 [md_cluster] [] metadata_update_start+0x64/0x110 [md_cluster] [] md_update_sb.part.58+0x9b/0x860 [md_mod] [] md_update_sb+0x15/0x30 [md_mod] [] md_check_recovery+0x266/0x490 [md_mod] [] raid1d+0x42/0x810 [raid1] [] md_thread+0x122/0x150 [md_mod] [] kthread+0x101/0x140 linux175:~ # cat /proc/20125/stack [] recv_daemon+0x3f9/0x5c0 [md_cluster] [] md_thread+0x122/0x150 [md_mod] [] kthread+0x101/0x140 So let's revert the part of code in the commit to resovle the problem since we can't get lots of benefits of previous change. Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending") Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8b2eb0f4122f..707471e3cb01 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7468,8 +7468,8 @@ void md_wakeup_thread(struct md_thread *thread) { if (thread) { pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) - wake_up(&thread->wqueue); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); } } EXPORT_SYMBOL(md_wakeup_thread); -- cgit v1.2.1 From 935fe0983e09f4f7331ebf5ea4ae2124f6e9f9e8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 10 Oct 2017 17:02:41 -0400 Subject: md: rename some drivers/md/ files to have an "md-" prefix Motivated by the desire to illiminate the imprecise nature of DM-specific patches being unnecessarily sent to both the MD maintainer and mailing-list. Which is born out of the fact that DM files also reside in drivers/md/ Now all MD-specific files in drivers/md/ start with either "raid" or "md-" and the MAINTAINERS file has been updated accordingly. Shaohua: don't change module name Signed-off-by: Mike Snitzer Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 707471e3cb01..97afb28c6f51 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -69,7 +69,7 @@ #include #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #ifndef MODULE -- cgit v1.2.1 From 230b55fa8d64007339319539f8f8e68114d08529 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 14:24:09 +1100 Subject: md: forbid a RAID5 from having both a bitmap and a journal. Having both a bitmap and a journal is pointless. Attempting to do so can corrupt the bitmap if the journal replay happens before the bitmap is initialized. Rather than try to avoid this corruption, simply refuse to allow arrays with both a bitmap and a journal. So: - if raid5_run sees both are present, fail. - if adding a bitmap finds a journal is present, fail - if adding a journal finds a bitmap is present, fail. Cc: stable@vger.kernel.org (4.10+) Signed-off-by: NeilBrown Tested-by: Joshua Kinard Acked-by: Joshua Kinard Signed-off-by: Shaohua Li --- drivers/md/md.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 97afb28c6f51..6f25e3f1a1cf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6362,7 +6362,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) break; } } - if (has_journal) { + if (has_journal || mddev->bitmap) { export_rdev(rdev); return -EBUSY; } -- cgit v1.2.1 From 4d5324f760aacaefeb721b172aa14bf66045c332 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 19 Oct 2017 12:17:16 +1100 Subject: md: always hold reconfig_mutex when calling mddev_suspend() Most often mddev_suspend() is called with reconfig_mutex held. Make this a requirement in preparation a subsequent patch. Also require reconfig_mutex to be held for mddev_resume(), partly for symmetry and partly to guarantee no races with incr/decr of mddev->suspend. Taking the mutex in r5c_disable_writeback_async() is a little tricky as this is called from a work queue via log->disable_writeback_work, and flush_work() is called on that while holding ->reconfig_mutex. If the work item hasn't run before flush_work() is called, the work function will not be able to get the mutex. So we use mddev_trylock() inside the wait_event() call, and have that abort when conf->log is set to NULL, which happens before flush_work() is called. We wait in mddev->sb_wait and ensure this is woken when any of the conditions change. This requires waking mddev->sb_wait in mddev_unlock(). This is only like to trigger extra wake_ups of threads that needn't be woken when metadata is being written, and that doesn't happen often enough that the cost would be noticeable. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 6f25e3f1a1cf..9767bb33df56 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -344,6 +344,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) void mddev_suspend(struct mddev *mddev) { WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->suspended++) return; synchronize_rcu(); @@ -357,6 +358,7 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { + lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; wake_up(&mddev->sb_wait); @@ -663,6 +665,7 @@ void mddev_unlock(struct mddev *mddev) */ spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); -- cgit v1.2.1 From 52a0d49de3d592a3118e13f35985e3d99eaf43df Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: md: don't call bitmap_create() while array is quiesced. bitmap_create() allocates memory with GFP_KERNEL and so can wait for IO. If called while the array is quiesced, it could wait indefinitely for write out to the array - deadlock. So call bitmap_create() before quiescing the array. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9767bb33df56..2cb49f639809 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6621,22 +6621,26 @@ static int set_bitmap_file(struct mddev *mddev, int fd) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); if (fd >= 0) { struct bitmap *bitmap; bitmap = bitmap_create(mddev, -1); + mddev->pers->quiesce(mddev, 1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); } else err = PTR_ERR(bitmap); - } - if (fd < 0 || err) { + if (err) { + bitmap_destroy(mddev); + fd = -1; + } + mddev->pers->quiesce(mddev, 0); + } else if (fd < 0) { + mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); - fd = -1; /* make sure to put the file */ + mddev->pers->quiesce(mddev, 0); } - mddev->pers->quiesce(mddev, 0); } if (fd < 0) { struct file *f = mddev->bitmap_info.file; @@ -6920,8 +6924,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - mddev->pers->quiesce(mddev, 1); bitmap = bitmap_create(mddev, -1); + mddev->pers->quiesce(mddev, 1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev); -- cgit v1.2.1 From b3143b9a38d5039bcd1f2d1c94039651bfba8043 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: md: move suspend_hi/lo handling into core md code responding to ->suspend_lo and ->suspend_hi is similar to responding to ->suspended. It is best to wait in the common core code without incrementing ->active_io. This allows mddev_suspend()/mddev_resume() to work while requests are waiting for suspend_lo/hi to change. This is will be important after a subsequent patch which uses mddev_suspend() to synchronize updating for suspend_lo/hi. So move the code for testing suspend_lo/hi out of raid1.c and raid5.c, and place it in md.c Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 2cb49f639809..68de2a6ee29a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -266,16 +266,31 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ +static bool is_suspended(struct mddev *mddev, struct bio *bio) +{ + if (mddev->suspended) + return true; + if (bio_data_dir(bio) != WRITE) + return false; + if (mddev->suspend_lo >= mddev->suspend_hi) + return false; + if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + return false; + if (bio_end_sector(bio) < mddev->suspend_lo) + return false; + return true; +} + void md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: rcu_read_lock(); - if (mddev->suspended) { + if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) + if (!is_suspended(mddev, bio)) break; rcu_read_unlock(); schedule(); @@ -4845,10 +4860,11 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; old = mddev->suspend_lo; mddev->suspend_lo = new; - if (new >= old) + if (new >= old) { /* Shrinking suspended region */ + wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 2); - else { + } else { /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -4888,10 +4904,11 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; old = mddev->suspend_hi; mddev->suspend_hi = new; - if (new <= old) + if (new <= old) { /* Shrinking suspended region */ + wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 2); - else { + } else { /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); -- cgit v1.2.1 From 9e1cc0a54556a6c63dc0cfb7cd7d60d43337bba6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: md: use mddev_suspend/resume instead of ->quiesce() mddev_suspend() is a more general interface than calling ->quiesce() and is so more extensible. A future patch will make use of this. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 68de2a6ee29a..5bd4f18763bd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4866,8 +4866,8 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 2); } else { /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); + mddev_suspend(mddev); + mddev_resume(mddev); } err = 0; unlock: @@ -4910,8 +4910,8 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 2); } else { /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); + mddev_suspend(mddev); + mddev_resume(mddev); } err = 0; unlock: @@ -6642,7 +6642,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) struct bitmap *bitmap; bitmap = bitmap_create(mddev, -1); - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); @@ -6652,11 +6652,11 @@ static int set_bitmap_file(struct mddev *mddev, int fd) bitmap_destroy(mddev); fd = -1; } - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else if (fd < 0) { - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } } if (fd < 0) { @@ -6942,7 +6942,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.space = mddev->bitmap_info.default_space; bitmap = bitmap_create(mddev, -1); - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev); @@ -6950,7 +6950,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -6973,9 +6973,9 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); } - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); mddev->bitmap_info.offset = 0; } } -- cgit v1.2.1 From 35bfc52187f6df8779d0f1cebdb52b7f797baf4e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: md: allow metadata update while suspending. There are various deadlocks that can occur when a thread holds reconfig_mutex and calls ->quiesce(mddev, 1). As some write request block waiting for metadata to be updated (e.g. to record device failure), and as the md thread updates the metadata while the reconfig mutex is held, holding the mutex can stop write requests completing, and this prevents ->quiesce(mddev, 1) from completing. ->quiesce() is now usually called from mddev_suspend(), and it is always called with reconfig_mutex held. So at this time it is safe for the thread to update metadata without explicitly taking the lock. So add 2 new flags, one which says the unlocked updates is allowed, and one which ways it is happening. Then allow it while the quiesce completes, and then wait for it to finish. Reported-and-tested-by: Xiao Ni Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5bd4f18763bd..9155f00dca20 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -364,8 +364,12 @@ void mddev_suspend(struct mddev *mddev) return; synchronize_rcu(); wake_up(&mddev->sb_wait); + set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); + smp_mb__after_atomic(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); + clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); + wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); del_timer_sync(&mddev->safemode_timer); } @@ -8838,6 +8842,16 @@ void md_check_recovery(struct mddev *mddev) unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); + } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + * holds reconfig_mutex for us. + */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); } } EXPORT_SYMBOL(md_check_recovery); -- cgit v1.2.1 From b03e0ccb5ab9df3efbe51c87843a1ffbecbafa1f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 19 Oct 2017 12:49:15 +1100 Subject: md: remove special meaning of ->quiesce(.., 2) The '2' argument means "wake up anything that is waiting". This is an inelegant part of the design and was added to help support management of suspend_lo/suspend_hi setting. Now that suspend_lo/hi is managed in mddev_suspend/resume, that need is gone. These is still a couple of places where we call 'quiesce' with an argument of '2', but they can safely be changed to call ->quiesce(.., 1); ->quiesce(.., 0) which achieve the same result at the small cost of pausing IO briefly. This removes a small "optimization" from suspend_{hi,lo}_store, but it isn't clear that optimization served a useful purpose. The code now is a lot clearer. Suggested-by: Shaohua Li Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9155f00dca20..d441b1d9846c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4846,7 +4846,7 @@ suspend_lo_show(struct mddev *mddev, char *page) static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4862,17 +4862,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers == NULL || mddev->pers->quiesce == NULL) goto unlock; - old = mddev->suspend_lo; + mddev_suspend(mddev); mddev->suspend_lo = new; - if (new >= old) { - /* Shrinking suspended region */ - wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 2); - } else { - /* Expanding suspended region - need to wait */ - mddev_suspend(mddev); - mddev_resume(mddev); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); @@ -4890,7 +4883,7 @@ suspend_hi_show(struct mddev *mddev, char *page) static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4903,20 +4896,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; err = -EINVAL; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) + if (mddev->pers == NULL) goto unlock; - old = mddev->suspend_hi; + + mddev_suspend(mddev); mddev->suspend_hi = new; - if (new <= old) { - /* Shrinking suspended region */ - wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 2); - } else { - /* Expanding suspended region - need to wait */ - mddev_suspend(mddev); - mddev_resume(mddev); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); -- cgit v1.2.1 From efa4b77b00b56138fb7e68d2fe8fd1b3c15cd503 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 18 Oct 2017 22:08:13 -0700 Subject: md: use lockdep_assert_held lockdep_assert_held is a better way to assert lock held, and it works for UP. Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index d441b1d9846c..5a0ec1d1a6e8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2335,7 +2335,7 @@ static void export_array(struct mddev *mddev) static bool set_in_sync(struct mddev *mddev) { - WARN_ON_ONCE(NR_CPUS != 1 && !spin_is_locked(&mddev->lock)); + lockdep_assert_held(&mddev->lock); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); @@ -6749,7 +6749,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { - WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->external_size) return; -- cgit v1.2.1 From b90f6ff080c52e2f05364210733df120e3c4e597 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 26 Oct 2017 15:56:54 +0200 Subject: md: don't check MD_SB_CHANGE_CLEAN in md_allow_write Only MD_SB_CHANGE_PENDING should be used to wait for transition from clean to dirty. Checking also MD_SB_CHANGE_CLEAN is unnecessary and can race with e.g. md_do_sync(). This sporadically causes a hang when changing consistency policy during resync: INFO: task mdadm:6183 blocked for more than 30 seconds. Not tainted 4.14.0-rc3+ #391 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. mdadm D12752 6183 6022 0x00000000 Call Trace: __schedule+0x93f/0x990 schedule+0x6b/0x90 md_allow_write+0x100/0x130 [md_mod] ? do_wait_intr_irq+0x90/0x90 resize_stripes+0x3a/0x5b0 [raid456] ? kernfs_fop_write+0xbe/0x180 raid5_change_consistency_policy+0xa6/0x200 [raid456] consistency_policy_store+0x2e/0x70 [md_mod] md_attr_store+0x90/0xc0 [md_mod] sysfs_kf_write+0x42/0x50 kernfs_fop_write+0x119/0x180 __vfs_write+0x28/0x110 ? rcu_sync_lockdep_assert+0x12/0x60 ? __sb_start_write+0x15a/0x1c0 ? vfs_write+0xa3/0x1a0 vfs_write+0xb4/0x1a0 SyS_write+0x49/0xa0 entry_SYSCALL_64_fastpath+0x18/0xad Fixes: 2214c260c72b ("md: don't return -EAGAIN in md_allow_write for external metadata arrays") Cc: Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5a0ec1d1a6e8..2bf4cc41b4f8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8125,7 +8125,6 @@ void md_allow_write(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); /* wait for the dirty state to be recorded in the metadata */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); -- cgit v1.2.1 From db0505d320660b6ad92418847e7eca6b61b246ac Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 16:18:36 +1100 Subject: md: be cautious about using ->curr_resync_completed for ->recovery_offset The ->recovery_offset shows how much of a non-InSync device is actually in sync - how much has been recoveryed. When performing a recovery, ->curr_resync and ->curr_resync_completed follow the device address being recovered and so can be used to update ->recovery_offset. When performing a reshape, ->curr_resync* might follow the device addresses (raid5) or might follow array addresses (raid10), so cannot in general be used to set ->recovery_offset. When reshaping backwards, ->curre_resync* measures from the *end* of the array-or-device, so is particularly unhelpful. So change the common code in md.c to only use ->curr_resync_complete for the simple recovery case, and add code to raid5.c to update ->recovery_offset during a forwards reshape. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 2bf4cc41b4f8..15e4668f594c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2454,10 +2454,18 @@ repeat: } } - /* First make sure individual recovery_offsets are correct */ + /* + * First make sure individual recovery_offsets are correct + * curr_resync_completed can only be used during recovery. + * During reshape/resync it might use array-addresses rather + * that device addresses. + */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) @@ -8491,16 +8499,19 @@ void md_do_sync(struct md_thread *thread) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) - rdev->recovery_offset = mddev->curr_resync; - rcu_read_unlock(); + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < mddev->curr_resync) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + } } } skip: -- cgit v1.2.1 From 0202ce8a90efdc81600e7bf9712d8c324081a924 Mon Sep 17 00:00:00 2001 From: Zdenek Kabelac Date: Wed, 8 Nov 2017 13:44:55 +0100 Subject: md: release allocated bitset sync_set Patch fixes kmemleak on md_stop() path used likely only by dm-raid wrapper. Code of md is using mddev_put() where both bitsets are released however this freeing is not shared. Also set NULL to bio_set and sync_set pointers just like mddev_put is doing. Signed-off-by: Zdenek Kabelac Signed-off-by: Shaohua Li --- drivers/md/md.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/md/md.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 15e4668f594c..e014d39159d7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5852,8 +5852,14 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); - if (mddev->bio_set) + if (mddev->bio_set) { bioset_free(mddev->bio_set); + mddev->bio_set = NULL; + } + if (mddev->sync_set) { + bioset_free(mddev->sync_set); + mddev->sync_set = NULL; + } } EXPORT_SYMBOL_GPL(md_stop); -- cgit v1.2.1