From d47c8ad261f787af22a220ffcc2d07afba809223 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 5 Oct 2017 16:23:16 +1100 Subject: md: fix deadlock error in recent patch. A recent patch aimed to cause md_write_start() to fail (rather than block) when the mddev was suspending, so as to avoid deadlocks. Unfortunately the test in wait_event() was wrong, and it didn't change behaviour at all. We wait_event() must wait until the metadata is written OR the array is suspending. Fixes: cc27b0c78c79 ("md: fix deadlock between mddev_suspend() and md_write_start()") Cc: stable@vger.kernel.org Reported-by: Xiao Ni Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 0ff1bbf6c90e..8b2eb0f4122f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8039,7 +8039,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) if (did_change) sysfs_notify_dirent_safe(mddev->sysfs_state); wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && !mddev->suspended); + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || + mddev->suspended); if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) { percpu_ref_put(&mddev->writes_pending); return false; -- cgit v1.2.1 From d1d90147c9680aaec4a5757932c2103c42c9c23b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 9 Oct 2017 10:32:48 +0800 Subject: md: always set THREAD_WAKEUP and wake up wqueue if thread existed Since commit 4ad23a976413 ("MD: use per-cpu counter for writes_pending"), the wait_queue is only got invoked if THREAD_WAKEUP is not set previously. With above change, I can see process_metadata_update could always hang on the wait queue, because mddev->thread could stay on 'D' status and the THREAD_WAKEUP flag is not cleared since there are lots of place to wake up mddev->thread. Then deadlock happened as follows: linux175:~ # ps aux|grep md|grep D root 20117 0.0 0.0 0 0 ? D 03:45 0:00 [md0_raid1] root 20125 0.0 0.0 0 0 ? D 03:45 0:00 [md0_cluster_rec] linux175:~ # cat /proc/20117/stack [] dlm_lock_sync+0x94/0xd0 [md_cluster] [] lock_token+0x34/0xd0 [md_cluster] [] metadata_update_start+0x64/0x110 [md_cluster] [] md_update_sb.part.58+0x9b/0x860 [md_mod] [] md_update_sb+0x15/0x30 [md_mod] [] md_check_recovery+0x266/0x490 [md_mod] [] raid1d+0x42/0x810 [raid1] [] md_thread+0x122/0x150 [md_mod] [] kthread+0x101/0x140 linux175:~ # cat /proc/20125/stack [] recv_daemon+0x3f9/0x5c0 [md_cluster] [] md_thread+0x122/0x150 [md_mod] [] kthread+0x101/0x140 So let's revert the part of code in the commit to resovle the problem since we can't get lots of benefits of previous change. Fixes: 4ad23a976413 ("MD: use per-cpu counter for writes_pending") Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 8b2eb0f4122f..707471e3cb01 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7468,8 +7468,8 @@ void md_wakeup_thread(struct md_thread *thread) { if (thread) { pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); - if (!test_and_set_bit(THREAD_WAKEUP, &thread->flags)) - wake_up(&thread->wqueue); + set_bit(THREAD_WAKEUP, &thread->flags); + wake_up(&thread->wqueue); } } EXPORT_SYMBOL(md_wakeup_thread); -- cgit v1.2.1 From 938b533d479e7428b7fa1b8179283646d2e2c53d Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 16 Oct 2017 19:03:44 -0700 Subject: md/bitmap: revert a patch This reverts commit 8031c3ddc70a. That patches doesn't work well if PAGE_SIZE > 4k. We will fix the original problem with a different approach. Fix: 8031c3ddc70a(md/bitmap: copy correct data for bitmap super) Reported-by: Joshua Kinard Cc: stable@vger.kernel.org (4.10+) Suggested-by: Neil Brown Signed-off-by: Shaohua Li --- drivers/md/bitmap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index d2121637b4ab..cae57b5be817 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -625,7 +625,7 @@ re_read: err = read_sb_page(bitmap->mddev, offset, sb_page, - 0, PAGE_SIZE); + 0, sizeof(bitmap_super_t)); } if (err) return err; @@ -2123,7 +2123,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, if (store.sb_page && bitmap->storage.sb_page) memcpy(page_address(store.sb_page), page_address(bitmap->storage.sb_page), - PAGE_SIZE); + sizeof(bitmap_super_t)); bitmap_file_unmap(&bitmap->storage); bitmap->storage = store; -- cgit v1.2.1 From 385f4d7f946b08f36f68b0a28e95a319925b6b62 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 29 Sep 2017 09:16:43 +0800 Subject: md-cluster: fix wrong condition check in raid1_write_request The check used here is to avoid conflict between write and resync, however we used the wrong logic, it should be the inverse of the checking inside "if". Fixes: 589a1c4 ("Suspend writes in RAID1 if within range") Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f3f3e40dc9d8..35264ad0ec70 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1325,12 +1325,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, sigset_t full, old; prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); - if (bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_iter.bi_sector >= mddev->suspend_hi || - (mddev_is_clustered(mddev) && + if ((bio_end_sector(bio) <= mddev->suspend_lo || + bio->bi_iter.bi_sector >= mddev->suspend_hi) && + (!mddev_is_clustered(mddev) || !md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, - bio_end_sector(bio)))) + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) break; sigfillset(&full); sigprocmask(SIG_BLOCK, &full, &old); -- cgit v1.2.1 From 611426e2737235cf05e1b8f27d2502b96a5e05d9 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Fri, 29 Sep 2017 22:54:18 +0200 Subject: raid5-ppl: don't resync after rebuild The check for degraded array is unnecessary and causes a resync to be performed after ppl recovery and rebuild when restarting an array during rebuilding after unclean shutdown. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index cd026c88f7ef..76d6245427b8 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1296,8 +1296,7 @@ int ppl_init_log(struct r5conf *conf) if (ret) { goto err; - } else if (!mddev->pers && - mddev->recovery_cp == 0 && !mddev->degraded && + } else if (!mddev->pers && mddev->recovery_cp == 0 && ppl_conf->recovered_entries > 0 && ppl_conf->mismatch_count == 0) { /* -- cgit v1.2.1 From 07719ff767dcd8cc42050f185d332052f3816546 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Fri, 29 Sep 2017 22:54:19 +0200 Subject: raid5-ppl: check recovery_offset when performing ppl recovery If starting an array that is undergoing rebuild, make ppl recovery honor the recovery_offset of a member disk and don't read data that is not yet in-sync. Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/raid5-ppl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index 76d6245427b8..628c0bf7b9fd 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -758,7 +758,8 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, (unsigned long long)sector); rdev = conf->disks[dd_idx].rdev; - if (!rdev) { + if (!rdev || (!test_bit(In_sync, &rdev->flags) && + sector >= rdev->recovery_offset)) { pr_debug("%s:%*s data member disk %d missing\n", __func__, indent, "", dd_idx); update_parity = false; -- cgit v1.2.1 From 7a57157aeb157cd02ccdcff237bbf63440035b07 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 3 Oct 2017 10:51:17 +0100 Subject: md-cluster: make function cluster_check_sync_size static The function cluster_check_sync_size is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'cluster_check_sync_size' was not declared. Should it be static? Signed-off-by: Colin Ian King Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 03082e17c65c..bf41492a2cb0 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1094,7 +1094,7 @@ static void metadata_update_cancel(struct mddev *mddev) /* * return 0 if all the bitmaps have the same sync_size */ -int cluster_check_sync_size(struct mddev *mddev) +static int cluster_check_sync_size(struct mddev *mddev) { int i, rv; bitmap_super_t *sb; -- cgit v1.2.1 From 584ed9fa9532f8b9d5955628ff87ee3b2ab9f5a9 Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Thu, 5 Oct 2017 11:28:47 -0700 Subject: md: raid10: remove VLAIS The raid10 driver can't be built with clang since it uses a variable length array in a structure (VLAIS): drivers/md/raid10.c:4583:17: error: fields must have a constant size: 'variable length array in structure' extension will never be supported Allocate the r10bio struct with kmalloc instead of using the VLAIS construct. Shaohua: set the MD_RECOVERY_INTR bit Neil Brown: use GFP_NOIO Signed-off-by: Matthias Kaehlcke Reviewed-by: Guenter Roeck Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 374df5796649..950fbefbedbb 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4578,15 +4578,18 @@ static int handle_reshape_read_error(struct mddev *mddev, /* Use sync reads to get the blocks from somewhere else */ int sectors = r10_bio->sectors; struct r10conf *conf = mddev->private; - struct { - struct r10bio r10_bio; - struct r10dev devs[conf->copies]; - } on_stack; - struct r10bio *r10b = &on_stack.r10_bio; + struct r10bio *r10b; int slot = 0; int idx = 0; struct page **pages; + r10b = kmalloc(sizeof(*r10b) + + sizeof(struct r10dev) * conf->copies, GFP_NOIO); + if (!r10b) { + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + return -ENOMEM; + } + /* reshape IOs share pages from .devs[0].bio */ pages = get_resync_pages(r10_bio->devs[0].bio)->pages; @@ -4635,11 +4638,13 @@ static int handle_reshape_read_error(struct mddev *mddev, /* couldn't read this block, must give up */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); + kfree(r10b); return -EIO; } sectors -= s; idx++; } + kfree(r10b); return 0; } -- cgit v1.2.1 From 935fe0983e09f4f7331ebf5ea4ae2124f6e9f9e8 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Tue, 10 Oct 2017 17:02:41 -0400 Subject: md: rename some drivers/md/ files to have an "md-" prefix Motivated by the desire to illiminate the imprecise nature of DM-specific patches being unnecessarily sent to both the MD maintainer and mailing-list. Which is born out of the fact that DM files also reside in drivers/md/ Now all MD-specific files in drivers/md/ start with either "raid" or "md-" and the MAINTAINERS file has been updated accordingly. Shaohua: don't change module name Signed-off-by: Mike Snitzer Signed-off-by: Shaohua Li --- drivers/md/Makefile | 5 +- drivers/md/bitmap.c | 2591 --------------------------------------------- drivers/md/bitmap.h | 277 ----- drivers/md/dm-raid.c | 2 +- drivers/md/faulty.c | 372 ------- drivers/md/linear.c | 348 ------ drivers/md/linear.h | 16 - drivers/md/md-bitmap.c | 2591 +++++++++++++++++++++++++++++++++++++++++++++ drivers/md/md-bitmap.h | 277 +++++ drivers/md/md-cluster.c | 2 +- drivers/md/md-faulty.c | 372 +++++++ drivers/md/md-linear.c | 348 ++++++ drivers/md/md-linear.h | 16 + drivers/md/md-multipath.c | 509 +++++++++ drivers/md/md-multipath.h | 31 + drivers/md/md.c | 2 +- drivers/md/multipath.c | 509 --------- drivers/md/multipath.h | 31 - drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5-cache.c | 2 +- drivers/md/raid5.c | 2 +- 22 files changed, 4155 insertions(+), 4152 deletions(-) delete mode 100644 drivers/md/bitmap.c delete mode 100644 drivers/md/bitmap.h delete mode 100644 drivers/md/faulty.c delete mode 100644 drivers/md/linear.c delete mode 100644 drivers/md/linear.h create mode 100644 drivers/md/md-bitmap.c create mode 100644 drivers/md/md-bitmap.h create mode 100644 drivers/md/md-faulty.c create mode 100644 drivers/md/md-linear.c create mode 100644 drivers/md/md-linear.h create mode 100644 drivers/md/md-multipath.c create mode 100644 drivers/md/md-multipath.h delete mode 100644 drivers/md/multipath.c delete mode 100644 drivers/md/multipath.h (limited to 'drivers/md') diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 786ec9e86d65..693602ffdd38 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -18,9 +18,12 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \ dm-cache-smq-y += dm-cache-policy-smq.o dm-era-y += dm-era-target.o dm-verity-y += dm-verity-target.o -md-mod-y += md.o bitmap.o +md-mod-y += md.o md-bitmap.o raid456-y += raid5.o raid5-cache.o raid5-ppl.o dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o +linear-y += md-linear.o +multipath-y += md-multipath.o +faulty-y += md-faulty.o # Note: link order is important. All raid personalities # and must come before md.o, as they each initialise diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c deleted file mode 100644 index cae57b5be817..000000000000 --- a/drivers/md/bitmap.c +++ /dev/null @@ -1,2591 +0,0 @@ -/* - * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 - * - * bitmap_create - sets up the bitmap structure - * bitmap_destroy - destroys the bitmap structure - * - * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.: - * - added disk storage for bitmap - * - changes to allow various bitmap chunk sizes - */ - -/* - * Still to do: - * - * flush after percent set rather than just time based. (maybe both). - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "md.h" -#include "bitmap.h" - -static inline char *bmname(struct bitmap *bitmap) -{ - return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; -} - -/* - * check a page and, if necessary, allocate it (or hijack it if the alloc fails) - * - * 1) check to see if this page is allocated, if it's not then try to alloc - * 2) if the alloc fails, set the page's hijacked flag so we'll use the - * page pointer directly as a counter - * - * if we find our page, we increment the page's refcount so that it stays - * allocated while we're using it - */ -static int bitmap_checkpage(struct bitmap_counts *bitmap, - unsigned long page, int create, int no_hijack) -__releases(bitmap->lock) -__acquires(bitmap->lock) -{ - unsigned char *mappage; - - if (page >= bitmap->pages) { - /* This can happen if bitmap_start_sync goes beyond - * End-of-device while looking for a whole page. - * It is harmless. - */ - return -EINVAL; - } - - if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ - return 0; - - if (bitmap->bp[page].map) /* page is already allocated, just return */ - return 0; - - if (!create) - return -ENOENT; - - /* this page has not been allocated yet */ - - spin_unlock_irq(&bitmap->lock); - /* It is possible that this is being called inside a - * prepare_to_wait/finish_wait loop from raid5c:make_request(). - * In general it is not permitted to sleep in that context as it - * can cause the loop to spin freely. - * That doesn't apply here as we can only reach this point - * once with any loop. - * When this function completes, either bp[page].map or - * bp[page].hijacked. In either case, this function will - * abort before getting to this point again. So there is - * no risk of a free-spin, and so it is safe to assert - * that sleeping here is allowed. - */ - sched_annotate_sleep(); - mappage = kzalloc(PAGE_SIZE, GFP_NOIO); - spin_lock_irq(&bitmap->lock); - - if (mappage == NULL) { - pr_debug("md/bitmap: map page allocation failed, hijacking\n"); - /* We don't support hijack for cluster raid */ - if (no_hijack) - return -ENOMEM; - /* failed - set the hijacked flag so that we can use the - * pointer as a counter */ - if (!bitmap->bp[page].map) - bitmap->bp[page].hijacked = 1; - } else if (bitmap->bp[page].map || - bitmap->bp[page].hijacked) { - /* somebody beat us to getting the page */ - kfree(mappage); - } else { - - /* no page was in place and we have one, so install it */ - - bitmap->bp[page].map = mappage; - bitmap->missing_pages--; - } - return 0; -} - -/* if page is completely empty, put it back on the free list, or dealloc it */ -/* if page was hijacked, unmark the flag so it might get alloced next time */ -/* Note: lock should be held when calling this */ -static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) -{ - char *ptr; - - if (bitmap->bp[page].count) /* page is still busy */ - return; - - /* page is no longer in use, it can be released */ - - if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ - bitmap->bp[page].hijacked = 0; - bitmap->bp[page].map = NULL; - } else { - /* normal case, free the page */ - ptr = bitmap->bp[page].map; - bitmap->bp[page].map = NULL; - bitmap->missing_pages++; - kfree(ptr); - } -} - -/* - * bitmap file handling - read and write the bitmap file and its superblock - */ - -/* - * basic page I/O operations - */ - -/* IO operations when bitmap is stored near all superblocks */ -static int read_sb_page(struct mddev *mddev, loff_t offset, - struct page *page, - unsigned long index, int size) -{ - /* choose a good rdev and read the page from there */ - - struct md_rdev *rdev; - sector_t target; - - rdev_for_each(rdev, mddev) { - if (! test_bit(In_sync, &rdev->flags) - || test_bit(Faulty, &rdev->flags) - || test_bit(Bitmap_sync, &rdev->flags)) - continue; - - target = offset + index * (PAGE_SIZE/512); - - if (sync_page_io(rdev, target, - roundup(size, bdev_logical_block_size(rdev->bdev)), - page, REQ_OP_READ, 0, true)) { - page->index = index; - return 0; - } - } - return -EIO; -} - -static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) -{ - /* Iterate the disks of an mddev, using rcu to protect access to the - * linked list, and raising the refcount of devices we return to ensure - * they don't disappear while in use. - * As devices are only added or removed when raid_disk is < 0 and - * nr_pending is 0 and In_sync is clear, the entries we return will - * still be in the same position on the list when we re-enter - * list_for_each_entry_continue_rcu. - * - * Note that if entered with 'rdev == NULL' to start at the - * beginning, we temporarily assign 'rdev' to an address which - * isn't really an rdev, but which can be used by - * list_for_each_entry_continue_rcu() to find the first entry. - */ - rcu_read_lock(); - if (rdev == NULL) - /* start at the beginning */ - rdev = list_entry(&mddev->disks, struct md_rdev, same_set); - else { - /* release the previous rdev and start from there. */ - rdev_dec_pending(rdev, mddev); - } - list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { - if (rdev->raid_disk >= 0 && - !test_bit(Faulty, &rdev->flags)) { - /* this is a usable devices */ - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - return rdev; - } - } - rcu_read_unlock(); - return NULL; -} - -static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) -{ - struct md_rdev *rdev; - struct block_device *bdev; - struct mddev *mddev = bitmap->mddev; - struct bitmap_storage *store = &bitmap->storage; - -restart: - rdev = NULL; - while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { - int size = PAGE_SIZE; - loff_t offset = mddev->bitmap_info.offset; - - bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; - - if (page->index == store->file_pages-1) { - int last_page_size = store->bytes & (PAGE_SIZE-1); - if (last_page_size == 0) - last_page_size = PAGE_SIZE; - size = roundup(last_page_size, - bdev_logical_block_size(bdev)); - } - /* Just make sure we aren't corrupting data or - * metadata - */ - if (mddev->external) { - /* Bitmap could be anywhere. */ - if (rdev->sb_start + offset + (page->index - * (PAGE_SIZE/512)) - > rdev->data_offset - && - rdev->sb_start + offset - < (rdev->data_offset + mddev->dev_sectors - + (PAGE_SIZE/512))) - goto bad_alignment; - } else if (offset < 0) { - /* DATA BITMAP METADATA */ - if (offset - + (long)(page->index * (PAGE_SIZE/512)) - + size/512 > 0) - /* bitmap runs in to metadata */ - goto bad_alignment; - if (rdev->data_offset + mddev->dev_sectors - > rdev->sb_start + offset) - /* data runs in to bitmap */ - goto bad_alignment; - } else if (rdev->sb_start < rdev->data_offset) { - /* METADATA BITMAP DATA */ - if (rdev->sb_start - + offset - + page->index*(PAGE_SIZE/512) + size/512 - > rdev->data_offset) - /* bitmap runs in to data */ - goto bad_alignment; - } else { - /* DATA METADATA BITMAP - no problems */ - } - md_super_write(mddev, rdev, - rdev->sb_start + offset - + page->index * (PAGE_SIZE/512), - size, - page); - } - - if (wait && md_super_wait(mddev) < 0) - goto restart; - return 0; - - bad_alignment: - return -EINVAL; -} - -static void bitmap_file_kick(struct bitmap *bitmap); -/* - * write out a page to a file - */ -static void write_page(struct bitmap *bitmap, struct page *page, int wait) -{ - struct buffer_head *bh; - - if (bitmap->storage.file == NULL) { - switch (write_sb_page(bitmap, page, wait)) { - case -EINVAL: - set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); - } - } else { - - bh = page_buffers(page); - - while (bh && bh->b_blocknr) { - atomic_inc(&bitmap->pending_writes); - set_buffer_locked(bh); - set_buffer_mapped(bh); - submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); - bh = bh->b_this_page; - } - - if (wait) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - } - if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) - bitmap_file_kick(bitmap); -} - -static void end_bitmap_write(struct buffer_head *bh, int uptodate) -{ - struct bitmap *bitmap = bh->b_private; - - if (!uptodate) - set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); - if (atomic_dec_and_test(&bitmap->pending_writes)) - wake_up(&bitmap->write_wait); -} - -/* copied from buffer.c */ -static void -__clear_page_buffers(struct page *page) -{ - ClearPagePrivate(page); - set_page_private(page, 0); - put_page(page); -} -static void free_buffers(struct page *page) -{ - struct buffer_head *bh; - - if (!PagePrivate(page)) - return; - - bh = page_buffers(page); - while (bh) { - struct buffer_head *next = bh->b_this_page; - free_buffer_head(bh); - bh = next; - } - __clear_page_buffers(page); - put_page(page); -} - -/* read a page from a file. - * We both read the page, and attach buffers to the page to record the - * address of each block (using bmap). These addresses will be used - * to write the block later, completely bypassing the filesystem. - * This usage is similar to how swap files are handled, and allows us - * to write to a file with no concerns of memory allocation failing. - */ -static int read_page(struct file *file, unsigned long index, - struct bitmap *bitmap, - unsigned long count, - struct page *page) -{ - int ret = 0; - struct inode *inode = file_inode(file); - struct buffer_head *bh; - sector_t block; - - pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, - (unsigned long long)index << PAGE_SHIFT); - - bh = alloc_page_buffers(page, 1<i_blkbits, 0); - if (!bh) { - ret = -ENOMEM; - goto out; - } - attach_page_buffers(page, bh); - block = index << (PAGE_SHIFT - inode->i_blkbits); - while (bh) { - if (count == 0) - bh->b_blocknr = 0; - else { - bh->b_blocknr = bmap(inode, block); - if (bh->b_blocknr == 0) { - /* Cannot use this file! */ - ret = -EINVAL; - goto out; - } - bh->b_bdev = inode->i_sb->s_bdev; - if (count < (1<i_blkbits)) - count = 0; - else - count -= (1<i_blkbits); - - bh->b_end_io = end_bitmap_write; - bh->b_private = bitmap; - atomic_inc(&bitmap->pending_writes); - set_buffer_locked(bh); - set_buffer_mapped(bh); - submit_bh(REQ_OP_READ, 0, bh); - } - block++; - bh = bh->b_this_page; - } - page->index = index; - - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) - ret = -EIO; -out: - if (ret) - pr_err("md: bitmap read error: (%dB @ %llu): %d\n", - (int)PAGE_SIZE, - (unsigned long long)index << PAGE_SHIFT, - ret); - return ret; -} - -/* - * bitmap file superblock operations - */ - -/* - * bitmap_wait_writes() should be called before writing any bitmap - * blocks, to ensure previous writes, particularly from - * bitmap_daemon_work(), have completed. - */ -static void bitmap_wait_writes(struct bitmap *bitmap) -{ - if (bitmap->storage.file) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - else - /* Note that we ignore the return value. The writes - * might have failed, but that would just mean that - * some bits which should be cleared haven't been, - * which is safe. The relevant bitmap blocks will - * probably get written again, but there is no great - * loss if they aren't. - */ - md_super_wait(bitmap->mddev); -} - - -/* update the event counter and sync the superblock to disk */ -void bitmap_update_sb(struct bitmap *bitmap) -{ - bitmap_super_t *sb; - - if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ - return; - if (bitmap->mddev->bitmap_info.external) - return; - if (!bitmap->storage.sb_page) /* no superblock */ - return; - sb = kmap_atomic(bitmap->storage.sb_page); - sb->events = cpu_to_le64(bitmap->mddev->events); - if (bitmap->mddev->events < bitmap->events_cleared) - /* rocking back to read-only */ - bitmap->events_cleared = bitmap->mddev->events; - sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - sb->state = cpu_to_le32(bitmap->flags); - /* Just in case these have been changed via sysfs: */ - sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); - sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); - /* This might have been changed by a reshape */ - sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); - sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); - sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); - sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> - bitmap_info.space); - kunmap_atomic(sb); - write_page(bitmap, bitmap->storage.sb_page, 1); -} -EXPORT_SYMBOL(bitmap_update_sb); - -/* print out the bitmap file superblock */ -void bitmap_print_sb(struct bitmap *bitmap) -{ - bitmap_super_t *sb; - - if (!bitmap || !bitmap->storage.sb_page) - return; - sb = kmap_atomic(bitmap->storage.sb_page); - pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); - pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); - pr_debug(" version: %d\n", le32_to_cpu(sb->version)); - pr_debug(" uuid: %08x.%08x.%08x.%08x\n", - le32_to_cpu(*(__u32 *)(sb->uuid+0)), - le32_to_cpu(*(__u32 *)(sb->uuid+4)), - le32_to_cpu(*(__u32 *)(sb->uuid+8)), - le32_to_cpu(*(__u32 *)(sb->uuid+12))); - pr_debug(" events: %llu\n", - (unsigned long long) le64_to_cpu(sb->events)); - pr_debug("events cleared: %llu\n", - (unsigned long long) le64_to_cpu(sb->events_cleared)); - pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); - pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize)); - pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); - pr_debug(" sync size: %llu KB\n", - (unsigned long long)le64_to_cpu(sb->sync_size)/2); - pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind)); - kunmap_atomic(sb); -} - -/* - * bitmap_new_disk_sb - * @bitmap - * - * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb - * reads and verifies the on-disk bitmap superblock and populates bitmap_info. - * This function verifies 'bitmap_info' and populates the on-disk bitmap - * structure, which is to be written to disk. - * - * Returns: 0 on success, -Exxx on error - */ -static int bitmap_new_disk_sb(struct bitmap *bitmap) -{ - bitmap_super_t *sb; - unsigned long chunksize, daemon_sleep, write_behind; - - bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (bitmap->storage.sb_page == NULL) - return -ENOMEM; - bitmap->storage.sb_page->index = 0; - - sb = kmap_atomic(bitmap->storage.sb_page); - - sb->magic = cpu_to_le32(BITMAP_MAGIC); - sb->version = cpu_to_le32(BITMAP_MAJOR_HI); - - chunksize = bitmap->mddev->bitmap_info.chunksize; - BUG_ON(!chunksize); - if (!is_power_of_2(chunksize)) { - kunmap_atomic(sb); - pr_warn("bitmap chunksize not a power of 2\n"); - return -EINVAL; - } - sb->chunksize = cpu_to_le32(chunksize); - - daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; - if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { - pr_debug("Choosing daemon_sleep default (5 sec)\n"); - daemon_sleep = 5 * HZ; - } - sb->daemon_sleep = cpu_to_le32(daemon_sleep); - bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; - - /* - * FIXME: write_behind for RAID1. If not specified, what - * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. - */ - write_behind = bitmap->mddev->bitmap_info.max_write_behind; - if (write_behind > COUNTER_MAX) - write_behind = COUNTER_MAX / 2; - sb->write_behind = cpu_to_le32(write_behind); - bitmap->mddev->bitmap_info.max_write_behind = write_behind; - - /* keep the array size field of the bitmap superblock up to date */ - sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); - - memcpy(sb->uuid, bitmap->mddev->uuid, 16); - - set_bit(BITMAP_STALE, &bitmap->flags); - sb->state = cpu_to_le32(bitmap->flags); - bitmap->events_cleared = bitmap->mddev->events; - sb->events_cleared = cpu_to_le64(bitmap->mddev->events); - bitmap->mddev->bitmap_info.nodes = 0; - - kunmap_atomic(sb); - - return 0; -} - -/* read the superblock from the bitmap file and initialize some bitmap fields */ -static int bitmap_read_sb(struct bitmap *bitmap) -{ - char *reason = NULL; - bitmap_super_t *sb; - unsigned long chunksize, daemon_sleep, write_behind; - unsigned long long events; - int nodes = 0; - unsigned long sectors_reserved = 0; - int err = -EINVAL; - struct page *sb_page; - loff_t offset = bitmap->mddev->bitmap_info.offset; - - if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { - chunksize = 128 * 1024 * 1024; - daemon_sleep = 5 * HZ; - write_behind = 0; - set_bit(BITMAP_STALE, &bitmap->flags); - err = 0; - goto out_no_sb; - } - /* page 0 is the superblock, read it... */ - sb_page = alloc_page(GFP_KERNEL); - if (!sb_page) - return -ENOMEM; - bitmap->storage.sb_page = sb_page; - -re_read: - /* If cluster_slot is set, the cluster is setup */ - if (bitmap->cluster_slot >= 0) { - sector_t bm_blocks = bitmap->mddev->resync_max_sectors; - - sector_div(bm_blocks, - bitmap->mddev->bitmap_info.chunksize >> 9); - /* bits to bytes */ - bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); - /* to 4k blocks */ - bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); - offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); - pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, - bitmap->cluster_slot, offset); - } - - if (bitmap->storage.file) { - loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); - int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; - - err = read_page(bitmap->storage.file, 0, - bitmap, bytes, sb_page); - } else { - err = read_sb_page(bitmap->mddev, - offset, - sb_page, - 0, sizeof(bitmap_super_t)); - } - if (err) - return err; - - err = -EINVAL; - sb = kmap_atomic(sb_page); - - chunksize = le32_to_cpu(sb->chunksize); - daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; - write_behind = le32_to_cpu(sb->write_behind); - sectors_reserved = le32_to_cpu(sb->sectors_reserved); - /* Setup nodes/clustername only if bitmap version is - * cluster-compatible - */ - if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { - nodes = le32_to_cpu(sb->nodes); - strlcpy(bitmap->mddev->bitmap_info.cluster_name, - sb->cluster_name, 64); - } - - /* verify that the bitmap-specific fields are valid */ - if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) - reason = "bad magic"; - else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || - le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED) - reason = "unrecognized superblock version"; - else if (chunksize < 512) - reason = "bitmap chunksize too small"; - else if (!is_power_of_2(chunksize)) - reason = "bitmap chunksize not a power of 2"; - else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) - reason = "daemon sleep period out of range"; - else if (write_behind > COUNTER_MAX) - reason = "write-behind limit out of range (0 - 16383)"; - if (reason) { - pr_warn("%s: invalid bitmap file superblock: %s\n", - bmname(bitmap), reason); - goto out; - } - - /* keep the array size field of the bitmap superblock up to date */ - sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); - - if (bitmap->mddev->persistent) { - /* - * We have a persistent array superblock, so compare the - * bitmap's UUID and event counter to the mddev's - */ - if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { - pr_warn("%s: bitmap superblock UUID mismatch\n", - bmname(bitmap)); - goto out; - } - events = le64_to_cpu(sb->events); - if (!nodes && (events < bitmap->mddev->events)) { - pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n", - bmname(bitmap), events, - (unsigned long long) bitmap->mddev->events); - set_bit(BITMAP_STALE, &bitmap->flags); - } - } - - /* assign fields using values from superblock */ - bitmap->flags |= le32_to_cpu(sb->state); - if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) - set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); - bitmap->events_cleared = le64_to_cpu(sb->events_cleared); - strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); - err = 0; - -out: - kunmap_atomic(sb); - /* Assigning chunksize is required for "re_read" */ - bitmap->mddev->bitmap_info.chunksize = chunksize; - if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { - err = md_setup_cluster(bitmap->mddev, nodes); - if (err) { - pr_warn("%s: Could not setup cluster service (%d)\n", - bmname(bitmap), err); - goto out_no_sb; - } - bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); - goto re_read; - } - - -out_no_sb: - if (test_bit(BITMAP_STALE, &bitmap->flags)) - bitmap->events_cleared = bitmap->mddev->events; - bitmap->mddev->bitmap_info.chunksize = chunksize; - bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; - bitmap->mddev->bitmap_info.max_write_behind = write_behind; - bitmap->mddev->bitmap_info.nodes = nodes; - if (bitmap->mddev->bitmap_info.space == 0 || - bitmap->mddev->bitmap_info.space > sectors_reserved) - bitmap->mddev->bitmap_info.space = sectors_reserved; - if (err) { - bitmap_print_sb(bitmap); - if (bitmap->cluster_slot < 0) - md_cluster_stop(bitmap->mddev); - } - return err; -} - -/* - * general bitmap file operations - */ - -/* - * on-disk bitmap: - * - * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap - * file a page at a time. There's a superblock at the start of the file. - */ -/* calculate the index of the page that contains this bit */ -static inline unsigned long file_page_index(struct bitmap_storage *store, - unsigned long chunk) -{ - if (store->sb_page) - chunk += sizeof(bitmap_super_t) << 3; - return chunk >> PAGE_BIT_SHIFT; -} - -/* calculate the (bit) offset of this bit within a page */ -static inline unsigned long file_page_offset(struct bitmap_storage *store, - unsigned long chunk) -{ - if (store->sb_page) - chunk += sizeof(bitmap_super_t) << 3; - return chunk & (PAGE_BITS - 1); -} - -/* - * return a pointer to the page in the filemap that contains the given bit - * - */ -static inline struct page *filemap_get_page(struct bitmap_storage *store, - unsigned long chunk) -{ - if (file_page_index(store, chunk) >= store->file_pages) - return NULL; - return store->filemap[file_page_index(store, chunk)]; -} - -static int bitmap_storage_alloc(struct bitmap_storage *store, - unsigned long chunks, int with_super, - int slot_number) -{ - int pnum, offset = 0; - unsigned long num_pages; - unsigned long bytes; - - bytes = DIV_ROUND_UP(chunks, 8); - if (with_super) - bytes += sizeof(bitmap_super_t); - - num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); - offset = slot_number * num_pages; - - store->filemap = kmalloc(sizeof(struct page *) - * num_pages, GFP_KERNEL); - if (!store->filemap) - return -ENOMEM; - - if (with_super && !store->sb_page) { - store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (store->sb_page == NULL) - return -ENOMEM; - } - - pnum = 0; - if (store->sb_page) { - store->filemap[0] = store->sb_page; - pnum = 1; - store->sb_page->index = offset; - } - - for ( ; pnum < num_pages; pnum++) { - store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); - if (!store->filemap[pnum]) { - store->file_pages = pnum; - return -ENOMEM; - } - store->filemap[pnum]->index = pnum + offset; - } - store->file_pages = pnum; - - /* We need 4 bits per page, rounded up to a multiple - * of sizeof(unsigned long) */ - store->filemap_attr = kzalloc( - roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), - GFP_KERNEL); - if (!store->filemap_attr) - return -ENOMEM; - - store->bytes = bytes; - - return 0; -} - -static void bitmap_file_unmap(struct bitmap_storage *store) -{ - struct page **map, *sb_page; - int pages; - struct file *file; - - file = store->file; - map = store->filemap; - pages = store->file_pages; - sb_page = store->sb_page; - - while (pages--) - if (map[pages] != sb_page) /* 0 is sb_page, release it below */ - free_buffers(map[pages]); - kfree(map); - kfree(store->filemap_attr); - - if (sb_page) - free_buffers(sb_page); - - if (file) { - struct inode *inode = file_inode(file); - invalidate_mapping_pages(inode->i_mapping, 0, -1); - fput(file); - } -} - -/* - * bitmap_file_kick - if an error occurs while manipulating the bitmap file - * then it is no longer reliable, so we stop using it and we mark the file - * as failed in the superblock - */ -static void bitmap_file_kick(struct bitmap *bitmap) -{ - char *path, *ptr = NULL; - - if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { - bitmap_update_sb(bitmap); - - if (bitmap->storage.file) { - path = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (path) - ptr = file_path(bitmap->storage.file, - path, PAGE_SIZE); - - pr_warn("%s: kicking failed bitmap file %s from array!\n", - bmname(bitmap), IS_ERR(ptr) ? "" : ptr); - - kfree(path); - } else - pr_warn("%s: disabling internal bitmap due to errors\n", - bmname(bitmap)); - } -} - -enum bitmap_page_attr { - BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */ - BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned. - * i.e. counter is 1 or 2. */ - BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ -}; - -static inline void set_page_attr(struct bitmap *bitmap, int pnum, - enum bitmap_page_attr attr) -{ - set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); -} - -static inline void clear_page_attr(struct bitmap *bitmap, int pnum, - enum bitmap_page_attr attr) -{ - clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); -} - -static inline int test_page_attr(struct bitmap *bitmap, int pnum, - enum bitmap_page_attr attr) -{ - return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); -} - -static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum, - enum bitmap_page_attr attr) -{ - return test_and_clear_bit((pnum<<2) + attr, - bitmap->storage.filemap_attr); -} -/* - * bitmap_file_set_bit -- called before performing a write to the md device - * to set (and eventually sync) a particular bit in the bitmap file - * - * we set the bit immediately, then we record the page number so that - * when an unplug occurs, we can flush the dirty pages out to disk - */ -static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) -{ - unsigned long bit; - struct page *page; - void *kaddr; - unsigned long chunk = block >> bitmap->counts.chunkshift; - struct bitmap_storage *store = &bitmap->storage; - unsigned long node_offset = 0; - - if (mddev_is_clustered(bitmap->mddev)) - node_offset = bitmap->cluster_slot * store->file_pages; - - page = filemap_get_page(&bitmap->storage, chunk); - if (!page) - return; - bit = file_page_offset(&bitmap->storage, chunk); - - /* set the bit */ - kaddr = kmap_atomic(page); - if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) - set_bit(bit, kaddr); - else - set_bit_le(bit, kaddr); - kunmap_atomic(kaddr); - pr_debug("set file bit %lu page %lu\n", bit, page->index); - /* record page number so it gets flushed to disk when unplug occurs */ - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); -} - -static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) -{ - unsigned long bit; - struct page *page; - void *paddr; - unsigned long chunk = block >> bitmap->counts.chunkshift; - struct bitmap_storage *store = &bitmap->storage; - unsigned long node_offset = 0; - - if (mddev_is_clustered(bitmap->mddev)) - node_offset = bitmap->cluster_slot * store->file_pages; - - page = filemap_get_page(&bitmap->storage, chunk); - if (!page) - return; - bit = file_page_offset(&bitmap->storage, chunk); - paddr = kmap_atomic(page); - if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) - clear_bit(bit, paddr); - else - clear_bit_le(bit, paddr); - kunmap_atomic(paddr); - if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { - set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); - bitmap->allclean = 0; - } -} - -static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) -{ - unsigned long bit; - struct page *page; - void *paddr; - unsigned long chunk = block >> bitmap->counts.chunkshift; - int set = 0; - - page = filemap_get_page(&bitmap->storage, chunk); - if (!page) - return -EINVAL; - bit = file_page_offset(&bitmap->storage, chunk); - paddr = kmap_atomic(page); - if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) - set = test_bit(bit, paddr); - else - set = test_bit_le(bit, paddr); - kunmap_atomic(paddr); - return set; -} - - -/* this gets called when the md device is ready to unplug its underlying - * (slave) device queues -- before we let any writes go down, we need to - * sync the dirty pages of the bitmap file to disk */ -void bitmap_unplug(struct bitmap *bitmap) -{ - unsigned long i; - int dirty, need_write; - int writing = 0; - - if (!bitmap || !bitmap->storage.filemap || - test_bit(BITMAP_STALE, &bitmap->flags)) - return; - - /* look at each page to see if there are any set bits that need to be - * flushed out to disk */ - for (i = 0; i < bitmap->storage.file_pages; i++) { - if (!bitmap->storage.filemap) - return; - dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); - need_write = test_and_clear_page_attr(bitmap, i, - BITMAP_PAGE_NEEDWRITE); - if (dirty || need_write) { - if (!writing) { - bitmap_wait_writes(bitmap); - if (bitmap->mddev->queue) - blk_add_trace_msg(bitmap->mddev->queue, - "md bitmap_unplug"); - } - clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); - write_page(bitmap, bitmap->storage.filemap[i], 0); - writing = 1; - } - } - if (writing) - bitmap_wait_writes(bitmap); - - if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) - bitmap_file_kick(bitmap); -} -EXPORT_SYMBOL(bitmap_unplug); - -static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); -/* * bitmap_init_from_disk -- called at bitmap_create time to initialize - * the in-memory bitmap from the on-disk bitmap -- also, sets up the - * memory mapping of the bitmap file - * Special cases: - * if there's no bitmap file, or if the bitmap file had been - * previously kicked from the array, we mark all the bits as - * 1's in order to cause a full resync. - * - * We ignore all bits for sectors that end earlier than 'start'. - * This is used when reading an out-of-date bitmap... - */ -static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) -{ - unsigned long i, chunks, index, oldindex, bit, node_offset = 0; - struct page *page = NULL; - unsigned long bit_cnt = 0; - struct file *file; - unsigned long offset; - int outofdate; - int ret = -ENOSPC; - void *paddr; - struct bitmap_storage *store = &bitmap->storage; - - chunks = bitmap->counts.chunks; - file = store->file; - - if (!file && !bitmap->mddev->bitmap_info.offset) { - /* No permanent bitmap - fill with '1s'. */ - store->filemap = NULL; - store->file_pages = 0; - for (i = 0; i < chunks ; i++) { - /* if the disk bit is set, set the memory bit */ - int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift) - >= start); - bitmap_set_memory_bits(bitmap, - (sector_t)i << bitmap->counts.chunkshift, - needed); - } - return 0; - } - - outofdate = test_bit(BITMAP_STALE, &bitmap->flags); - if (outofdate) - pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); - - if (file && i_size_read(file->f_mapping->host) < store->bytes) { - pr_warn("%s: bitmap file too short %lu < %lu\n", - bmname(bitmap), - (unsigned long) i_size_read(file->f_mapping->host), - store->bytes); - goto err; - } - - oldindex = ~0L; - offset = 0; - if (!bitmap->mddev->bitmap_info.external) - offset = sizeof(bitmap_super_t); - - if (mddev_is_clustered(bitmap->mddev)) - node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); - - for (i = 0; i < chunks; i++) { - int b; - index = file_page_index(&bitmap->storage, i); - bit = file_page_offset(&bitmap->storage, i); - if (index != oldindex) { /* this is a new page, read it in */ - int count; - /* unmap the old page, we're done with it */ - if (index == store->file_pages-1) - count = store->bytes - index * PAGE_SIZE; - else - count = PAGE_SIZE; - page = store->filemap[index]; - if (file) - ret = read_page(file, index, bitmap, - count, page); - else - ret = read_sb_page( - bitmap->mddev, - bitmap->mddev->bitmap_info.offset, - page, - index + node_offset, count); - - if (ret) - goto err; - - oldindex = index; - - if (outofdate) { - /* - * if bitmap is out of date, dirty the - * whole page and write it out - */ - paddr = kmap_atomic(page); - memset(paddr + offset, 0xff, - PAGE_SIZE - offset); - kunmap_atomic(paddr); - write_page(bitmap, page, 1); - - ret = -EIO; - if (test_bit(BITMAP_WRITE_ERROR, - &bitmap->flags)) - goto err; - } - } - paddr = kmap_atomic(page); - if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) - b = test_bit(bit, paddr); - else - b = test_bit_le(bit, paddr); - kunmap_atomic(paddr); - if (b) { - /* if the disk bit is set, set the memory bit */ - int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift - >= start); - bitmap_set_memory_bits(bitmap, - (sector_t)i << bitmap->counts.chunkshift, - needed); - bit_cnt++; - } - offset = 0; - } - - pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", - bmname(bitmap), store->file_pages, - bit_cnt, chunks); - - return 0; - - err: - pr_warn("%s: bitmap initialisation failed: %d\n", - bmname(bitmap), ret); - return ret; -} - -void bitmap_write_all(struct bitmap *bitmap) -{ - /* We don't actually write all bitmap blocks here, - * just flag them as needing to be written - */ - int i; - - if (!bitmap || !bitmap->storage.filemap) - return; - if (bitmap->storage.file) - /* Only one copy, so nothing needed */ - return; - - for (i = 0; i < bitmap->storage.file_pages; i++) - set_page_attr(bitmap, i, - BITMAP_PAGE_NEEDWRITE); - bitmap->allclean = 0; -} - -static void bitmap_count_page(struct bitmap_counts *bitmap, - sector_t offset, int inc) -{ - sector_t chunk = offset >> bitmap->chunkshift; - unsigned long page = chunk >> PAGE_COUNTER_SHIFT; - bitmap->bp[page].count += inc; - bitmap_checkfree(bitmap, page); -} - -static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) -{ - sector_t chunk = offset >> bitmap->chunkshift; - unsigned long page = chunk >> PAGE_COUNTER_SHIFT; - struct bitmap_page *bp = &bitmap->bp[page]; - - if (!bp->pending) - bp->pending = 1; -} - -static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, - sector_t offset, sector_t *blocks, - int create); - -/* - * bitmap daemon -- periodically wakes up to clean bits and flush pages - * out to disk - */ - -void bitmap_daemon_work(struct mddev *mddev) -{ - struct bitmap *bitmap; - unsigned long j; - unsigned long nextpage; - sector_t blocks; - struct bitmap_counts *counts; - - /* Use a mutex to guard daemon_work against - * bitmap_destroy. - */ - mutex_lock(&mddev->bitmap_info.mutex); - bitmap = mddev->bitmap; - if (bitmap == NULL) { - mutex_unlock(&mddev->bitmap_info.mutex); - return; - } - if (time_before(jiffies, bitmap->daemon_lastrun - + mddev->bitmap_info.daemon_sleep)) - goto done; - - bitmap->daemon_lastrun = jiffies; - if (bitmap->allclean) { - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - goto done; - } - bitmap->allclean = 1; - - if (bitmap->mddev->queue) - blk_add_trace_msg(bitmap->mddev->queue, - "md bitmap_daemon_work"); - - /* Any file-page which is PENDING now needs to be written. - * So set NEEDWRITE now, then after we make any last-minute changes - * we will write it. - */ - for (j = 0; j < bitmap->storage.file_pages; j++) - if (test_and_clear_page_attr(bitmap, j, - BITMAP_PAGE_PENDING)) - set_page_attr(bitmap, j, - BITMAP_PAGE_NEEDWRITE); - - if (bitmap->need_sync && - mddev->bitmap_info.external == 0) { - /* Arrange for superblock update as well as - * other changes */ - bitmap_super_t *sb; - bitmap->need_sync = 0; - if (bitmap->storage.filemap) { - sb = kmap_atomic(bitmap->storage.sb_page); - sb->events_cleared = - cpu_to_le64(bitmap->events_cleared); - kunmap_atomic(sb); - set_page_attr(bitmap, 0, - BITMAP_PAGE_NEEDWRITE); - } - } - /* Now look at the bitmap counters and if any are '2' or '1', - * decrement and handle accordingly. - */ - counts = &bitmap->counts; - spin_lock_irq(&counts->lock); - nextpage = 0; - for (j = 0; j < counts->chunks; j++) { - bitmap_counter_t *bmc; - sector_t block = (sector_t)j << counts->chunkshift; - - if (j == nextpage) { - nextpage += PAGE_COUNTER_RATIO; - if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { - j |= PAGE_COUNTER_MASK; - continue; - } - counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; - } - bmc = bitmap_get_counter(counts, - block, - &blocks, 0); - - if (!bmc) { - j |= PAGE_COUNTER_MASK; - continue; - } - if (*bmc == 1 && !bitmap->need_sync) { - /* We can clear the bit */ - *bmc = 0; - bitmap_count_page(counts, block, -1); - bitmap_file_clear_bit(bitmap, block); - } else if (*bmc && *bmc <= 2) { - *bmc = 1; - bitmap_set_pending(counts, block); - bitmap->allclean = 0; - } - } - spin_unlock_irq(&counts->lock); - - bitmap_wait_writes(bitmap); - /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. - * DIRTY pages need to be written by bitmap_unplug so it can wait - * for them. - * If we find any DIRTY page we stop there and let bitmap_unplug - * handle all the rest. This is important in the case where - * the first blocking holds the superblock and it has been updated. - * We mustn't write any other blocks before the superblock. - */ - for (j = 0; - j < bitmap->storage.file_pages - && !test_bit(BITMAP_STALE, &bitmap->flags); - j++) { - if (test_page_attr(bitmap, j, - BITMAP_PAGE_DIRTY)) - /* bitmap_unplug will handle the rest */ - break; - if (test_and_clear_page_attr(bitmap, j, - BITMAP_PAGE_NEEDWRITE)) { - write_page(bitmap, bitmap->storage.filemap[j], 0); - } - } - - done: - if (bitmap->allclean == 0) - mddev->thread->timeout = - mddev->bitmap_info.daemon_sleep; - mutex_unlock(&mddev->bitmap_info.mutex); -} - -static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, - sector_t offset, sector_t *blocks, - int create) -__releases(bitmap->lock) -__acquires(bitmap->lock) -{ - /* If 'create', we might release the lock and reclaim it. - * The lock must have been taken with interrupts enabled. - * If !create, we don't release the lock. - */ - sector_t chunk = offset >> bitmap->chunkshift; - unsigned long page = chunk >> PAGE_COUNTER_SHIFT; - unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; - sector_t csize; - int err; - - err = bitmap_checkpage(bitmap, page, create, 0); - - if (bitmap->bp[page].hijacked || - bitmap->bp[page].map == NULL) - csize = ((sector_t)1) << (bitmap->chunkshift + - PAGE_COUNTER_SHIFT - 1); - else - csize = ((sector_t)1) << bitmap->chunkshift; - *blocks = csize - (offset & (csize - 1)); - - if (err < 0) - return NULL; - - /* now locked ... */ - - if (bitmap->bp[page].hijacked) { /* hijacked pointer */ - /* should we use the first or second counter field - * of the hijacked pointer? */ - int hi = (pageoff > PAGE_COUNTER_MASK); - return &((bitmap_counter_t *) - &bitmap->bp[page].map)[hi]; - } else /* page is allocated */ - return (bitmap_counter_t *) - &(bitmap->bp[page].map[pageoff]); -} - -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) -{ - if (!bitmap) - return 0; - - if (behind) { - int bw; - atomic_inc(&bitmap->behind_writes); - bw = atomic_read(&bitmap->behind_writes); - if (bw > bitmap->behind_writes_used) - bitmap->behind_writes_used = bw; - - pr_debug("inc write-behind count %d/%lu\n", - bw, bitmap->mddev->bitmap_info.max_write_behind); - } - - while (sectors) { - sector_t blocks; - bitmap_counter_t *bmc; - - spin_lock_irq(&bitmap->counts.lock); - bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); - if (!bmc) { - spin_unlock_irq(&bitmap->counts.lock); - return 0; - } - - if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { - DEFINE_WAIT(__wait); - /* note that it is safe to do the prepare_to_wait - * after the test as long as we do it before dropping - * the spinlock. - */ - prepare_to_wait(&bitmap->overflow_wait, &__wait, - TASK_UNINTERRUPTIBLE); - spin_unlock_irq(&bitmap->counts.lock); - schedule(); - finish_wait(&bitmap->overflow_wait, &__wait); - continue; - } - - switch (*bmc) { - case 0: - bitmap_file_set_bit(bitmap, offset); - bitmap_count_page(&bitmap->counts, offset, 1); - /* fall through */ - case 1: - *bmc = 2; - } - - (*bmc)++; - - spin_unlock_irq(&bitmap->counts.lock); - - offset += blocks; - if (sectors > blocks) - sectors -= blocks; - else - sectors = 0; - } - return 0; -} -EXPORT_SYMBOL(bitmap_startwrite); - -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, - int success, int behind) -{ - if (!bitmap) - return; - if (behind) { - if (atomic_dec_and_test(&bitmap->behind_writes)) - wake_up(&bitmap->behind_wait); - pr_debug("dec write-behind count %d/%lu\n", - atomic_read(&bitmap->behind_writes), - bitmap->mddev->bitmap_info.max_write_behind); - } - - while (sectors) { - sector_t blocks; - unsigned long flags; - bitmap_counter_t *bmc; - - spin_lock_irqsave(&bitmap->counts.lock, flags); - bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); - if (!bmc) { - spin_unlock_irqrestore(&bitmap->counts.lock, flags); - return; - } - - if (success && !bitmap->mddev->degraded && - bitmap->events_cleared < bitmap->mddev->events) { - bitmap->events_cleared = bitmap->mddev->events; - bitmap->need_sync = 1; - sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); - } - - if (!success && !NEEDED(*bmc)) - *bmc |= NEEDED_MASK; - - if (COUNTER(*bmc) == COUNTER_MAX) - wake_up(&bitmap->overflow_wait); - - (*bmc)--; - if (*bmc <= 2) { - bitmap_set_pending(&bitmap->counts, offset); - bitmap->allclean = 0; - } - spin_unlock_irqrestore(&bitmap->counts.lock, flags); - offset += blocks; - if (sectors > blocks) - sectors -= blocks; - else - sectors = 0; - } -} -EXPORT_SYMBOL(bitmap_endwrite); - -static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) -{ - bitmap_counter_t *bmc; - int rv; - if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ - *blocks = 1024; - return 1; /* always resync if no bitmap */ - } - spin_lock_irq(&bitmap->counts.lock); - bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); - rv = 0; - if (bmc) { - /* locked */ - if (RESYNC(*bmc)) - rv = 1; - else if (NEEDED(*bmc)) { - rv = 1; - if (!degraded) { /* don't set/clear bits if degraded */ - *bmc |= RESYNC_MASK; - *bmc &= ~NEEDED_MASK; - } - } - } - spin_unlock_irq(&bitmap->counts.lock); - return rv; -} - -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, - int degraded) -{ - /* bitmap_start_sync must always report on multiples of whole - * pages, otherwise resync (which is very PAGE_SIZE based) will - * get confused. - * So call __bitmap_start_sync repeatedly (if needed) until - * At least PAGE_SIZE>>9 blocks are covered. - * Return the 'or' of the result. - */ - int rv = 0; - sector_t blocks1; - - *blocks = 0; - while (*blocks < (PAGE_SIZE>>9)) { - rv |= __bitmap_start_sync(bitmap, offset, - &blocks1, degraded); - offset += blocks1; - *blocks += blocks1; - } - return rv; -} -EXPORT_SYMBOL(bitmap_start_sync); - -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) -{ - bitmap_counter_t *bmc; - unsigned long flags; - - if (bitmap == NULL) { - *blocks = 1024; - return; - } - spin_lock_irqsave(&bitmap->counts.lock, flags); - bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); - if (bmc == NULL) - goto unlock; - /* locked */ - if (RESYNC(*bmc)) { - *bmc &= ~RESYNC_MASK; - - if (!NEEDED(*bmc) && aborted) - *bmc |= NEEDED_MASK; - else { - if (*bmc <= 2) { - bitmap_set_pending(&bitmap->counts, offset); - bitmap->allclean = 0; - } - } - } - unlock: - spin_unlock_irqrestore(&bitmap->counts.lock, flags); -} -EXPORT_SYMBOL(bitmap_end_sync); - -void bitmap_close_sync(struct bitmap *bitmap) -{ - /* Sync has finished, and any bitmap chunks that weren't synced - * properly have been aborted. It remains to us to clear the - * RESYNC bit wherever it is still on - */ - sector_t sector = 0; - sector_t blocks; - if (!bitmap) - return; - while (sector < bitmap->mddev->resync_max_sectors) { - bitmap_end_sync(bitmap, sector, &blocks, 0); - sector += blocks; - } -} -EXPORT_SYMBOL(bitmap_close_sync); - -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) -{ - sector_t s = 0; - sector_t blocks; - - if (!bitmap) - return; - if (sector == 0) { - bitmap->last_end_sync = jiffies; - return; - } - if (!force && time_before(jiffies, (bitmap->last_end_sync - + bitmap->mddev->bitmap_info.daemon_sleep))) - return; - wait_event(bitmap->mddev->recovery_wait, - atomic_read(&bitmap->mddev->recovery_active) == 0); - - bitmap->mddev->curr_resync_completed = sector; - set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags); - sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); - s = 0; - while (s < sector && s < bitmap->mddev->resync_max_sectors) { - bitmap_end_sync(bitmap, s, &blocks, 0); - s += blocks; - } - bitmap->last_end_sync = jiffies; - sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); -} -EXPORT_SYMBOL(bitmap_cond_end_sync); - -void bitmap_sync_with_cluster(struct mddev *mddev, - sector_t old_lo, sector_t old_hi, - sector_t new_lo, sector_t new_hi) -{ - struct bitmap *bitmap = mddev->bitmap; - sector_t sector, blocks = 0; - - for (sector = old_lo; sector < new_lo; ) { - bitmap_end_sync(bitmap, sector, &blocks, 0); - sector += blocks; - } - WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); - - for (sector = old_hi; sector < new_hi; ) { - bitmap_start_sync(bitmap, sector, &blocks, 0); - sector += blocks; - } - WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); -} -EXPORT_SYMBOL(bitmap_sync_with_cluster); - -static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) -{ - /* For each chunk covered by any of these sectors, set the - * counter to 2 and possibly set resync_needed. They should all - * be 0 at this point - */ - - sector_t secs; - bitmap_counter_t *bmc; - spin_lock_irq(&bitmap->counts.lock); - bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1); - if (!bmc) { - spin_unlock_irq(&bitmap->counts.lock); - return; - } - if (!*bmc) { - *bmc = 2; - bitmap_count_page(&bitmap->counts, offset, 1); - bitmap_set_pending(&bitmap->counts, offset); - bitmap->allclean = 0; - } - if (needed) - *bmc |= NEEDED_MASK; - spin_unlock_irq(&bitmap->counts.lock); -} - -/* dirty the memory and file bits for bitmap chunks "s" to "e" */ -void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) -{ - unsigned long chunk; - - for (chunk = s; chunk <= e; chunk++) { - sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; - bitmap_set_memory_bits(bitmap, sec, 1); - bitmap_file_set_bit(bitmap, sec); - if (sec < bitmap->mddev->recovery_cp) - /* We are asserting that the array is dirty, - * so move the recovery_cp address back so - * that it is obvious that it is dirty - */ - bitmap->mddev->recovery_cp = sec; - } -} - -/* - * flush out any pending updates - */ -void bitmap_flush(struct mddev *mddev) -{ - struct bitmap *bitmap = mddev->bitmap; - long sleep; - - if (!bitmap) /* there was no bitmap */ - return; - - /* run the daemon_work three time to ensure everything is flushed - * that can be - */ - sleep = mddev->bitmap_info.daemon_sleep * 2; - bitmap->daemon_lastrun -= sleep; - bitmap_daemon_work(mddev); - bitmap->daemon_lastrun -= sleep; - bitmap_daemon_work(mddev); - bitmap->daemon_lastrun -= sleep; - bitmap_daemon_work(mddev); - bitmap_update_sb(bitmap); -} - -/* - * free memory that was allocated - */ -void bitmap_free(struct bitmap *bitmap) -{ - unsigned long k, pages; - struct bitmap_page *bp; - - if (!bitmap) /* there was no bitmap */ - return; - - if (bitmap->sysfs_can_clear) - sysfs_put(bitmap->sysfs_can_clear); - - if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && - bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) - md_cluster_stop(bitmap->mddev); - - /* Shouldn't be needed - but just in case.... */ - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes) == 0); - - /* release the bitmap file */ - bitmap_file_unmap(&bitmap->storage); - - bp = bitmap->counts.bp; - pages = bitmap->counts.pages; - - /* free all allocated memory */ - - if (bp) /* deallocate the page memory */ - for (k = 0; k < pages; k++) - if (bp[k].map && !bp[k].hijacked) - kfree(bp[k].map); - kfree(bp); - kfree(bitmap); -} -EXPORT_SYMBOL(bitmap_free); - -void bitmap_wait_behind_writes(struct mddev *mddev) -{ - struct bitmap *bitmap = mddev->bitmap; - - /* wait for behind writes to complete */ - if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - pr_debug("md:%s: behind writes in progress - waiting to stop.\n", - mdname(mddev)); - /* need to kick something here to make sure I/O goes? */ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } -} - -void bitmap_destroy(struct mddev *mddev) -{ - struct bitmap *bitmap = mddev->bitmap; - - if (!bitmap) /* there was no bitmap */ - return; - - bitmap_wait_behind_writes(mddev); - - mutex_lock(&mddev->bitmap_info.mutex); - spin_lock(&mddev->lock); - mddev->bitmap = NULL; /* disconnect from the md device */ - spin_unlock(&mddev->lock); - mutex_unlock(&mddev->bitmap_info.mutex); - if (mddev->thread) - mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; - - bitmap_free(bitmap); -} - -/* - * initialize the bitmap structure - * if this returns an error, bitmap_destroy must be called to do clean up - * once mddev->bitmap is set - */ -struct bitmap *bitmap_create(struct mddev *mddev, int slot) -{ - struct bitmap *bitmap; - sector_t blocks = mddev->resync_max_sectors; - struct file *file = mddev->bitmap_info.file; - int err; - struct kernfs_node *bm = NULL; - - BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); - - BUG_ON(file && mddev->bitmap_info.offset); - - bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); - if (!bitmap) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&bitmap->counts.lock); - atomic_set(&bitmap->pending_writes, 0); - init_waitqueue_head(&bitmap->write_wait); - init_waitqueue_head(&bitmap->overflow_wait); - init_waitqueue_head(&bitmap->behind_wait); - - bitmap->mddev = mddev; - bitmap->cluster_slot = slot; - - if (mddev->kobj.sd) - bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); - if (bm) { - bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); - sysfs_put(bm); - } else - bitmap->sysfs_can_clear = NULL; - - bitmap->storage.file = file; - if (file) { - get_file(file); - /* As future accesses to this file will use bmap, - * and bypass the page cache, we must sync the file - * first. - */ - vfs_fsync(file, 1); - } - /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ - if (!mddev->bitmap_info.external) { - /* - * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is - * instructing us to create a new on-disk bitmap instance. - */ - if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags)) - err = bitmap_new_disk_sb(bitmap); - else - err = bitmap_read_sb(bitmap); - } else { - err = 0; - if (mddev->bitmap_info.chunksize == 0 || - mddev->bitmap_info.daemon_sleep == 0) - /* chunksize and time_base need to be - * set first. */ - err = -EINVAL; - } - if (err) - goto error; - - bitmap->daemon_lastrun = jiffies; - err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); - if (err) - goto error; - - pr_debug("created bitmap (%lu pages) for device %s\n", - bitmap->counts.pages, bmname(bitmap)); - - err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; - if (err) - goto error; - - return bitmap; - error: - bitmap_free(bitmap); - return ERR_PTR(err); -} - -int bitmap_load(struct mddev *mddev) -{ - int err = 0; - sector_t start = 0; - sector_t sector = 0; - struct bitmap *bitmap = mddev->bitmap; - - if (!bitmap) - goto out; - - if (mddev_is_clustered(mddev)) - md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); - - /* Clear out old bitmap info first: Either there is none, or we - * are resuming after someone else has possibly changed things, - * so we should forget old cached info. - * All chunks should be clean, but some might need_sync. - */ - while (sector < mddev->resync_max_sectors) { - sector_t blocks; - bitmap_start_sync(bitmap, sector, &blocks, 0); - sector += blocks; - } - bitmap_close_sync(bitmap); - - if (mddev->degraded == 0 - || bitmap->events_cleared == mddev->events) - /* no need to keep dirty bits to optimise a - * re-add of a missing device */ - start = mddev->recovery_cp; - - mutex_lock(&mddev->bitmap_info.mutex); - err = bitmap_init_from_disk(bitmap, start); - mutex_unlock(&mddev->bitmap_info.mutex); - - if (err) - goto out; - clear_bit(BITMAP_STALE, &bitmap->flags); - - /* Kick recovery in case any bits were set */ - set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); - - mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; - md_wakeup_thread(mddev->thread); - - bitmap_update_sb(bitmap); - - if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) - err = -EIO; -out: - return err; -} -EXPORT_SYMBOL_GPL(bitmap_load); - -struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) -{ - int rv = 0; - struct bitmap *bitmap; - - bitmap = bitmap_create(mddev, slot); - if (IS_ERR(bitmap)) { - rv = PTR_ERR(bitmap); - return ERR_PTR(rv); - } - - rv = bitmap_init_from_disk(bitmap, 0); - if (rv) { - bitmap_free(bitmap); - return ERR_PTR(rv); - } - - return bitmap; -} -EXPORT_SYMBOL(get_bitmap_from_slot); - -/* Loads the bitmap associated with slot and copies the resync information - * to our bitmap - */ -int bitmap_copy_from_slot(struct mddev *mddev, int slot, - sector_t *low, sector_t *high, bool clear_bits) -{ - int rv = 0, i, j; - sector_t block, lo = 0, hi = 0; - struct bitmap_counts *counts; - struct bitmap *bitmap; - - bitmap = get_bitmap_from_slot(mddev, slot); - if (IS_ERR(bitmap)) { - pr_err("%s can't get bitmap from slot %d\n", __func__, slot); - return -1; - } - - counts = &bitmap->counts; - for (j = 0; j < counts->chunks; j++) { - block = (sector_t)j << counts->chunkshift; - if (bitmap_file_test_bit(bitmap, block)) { - if (!lo) - lo = block; - hi = block; - bitmap_file_clear_bit(bitmap, block); - bitmap_set_memory_bits(mddev->bitmap, block, 1); - bitmap_file_set_bit(mddev->bitmap, block); - } - } - - if (clear_bits) { - bitmap_update_sb(bitmap); - /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs - * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ - for (i = 0; i < bitmap->storage.file_pages; i++) - if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) - set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); - bitmap_unplug(bitmap); - } - bitmap_unplug(mddev->bitmap); - *low = lo; - *high = hi; - - return rv; -} -EXPORT_SYMBOL_GPL(bitmap_copy_from_slot); - - -void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) -{ - unsigned long chunk_kb; - struct bitmap_counts *counts; - - if (!bitmap) - return; - - counts = &bitmap->counts; - - chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - counts->pages - counts->missing_pages, - counts->pages, - (counts->pages - counts->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, - chunk_kb ? "KB" : "B"); - if (bitmap->storage.file) { - seq_printf(seq, ", file: "); - seq_file_path(seq, bitmap->storage.file, " \t\n"); - } - - seq_printf(seq, "\n"); -} - -int bitmap_resize(struct bitmap *bitmap, sector_t blocks, - int chunksize, int init) -{ - /* If chunk_size is 0, choose an appropriate chunk size. - * Then possibly allocate new storage space. - * Then quiesce, copy bits, replace bitmap, and re-start - * - * This function is called both to set up the initial bitmap - * and to resize the bitmap while the array is active. - * If this happens as a result of the array being resized, - * chunksize will be zero, and we need to choose a suitable - * chunksize, otherwise we use what we are given. - */ - struct bitmap_storage store; - struct bitmap_counts old_counts; - unsigned long chunks; - sector_t block; - sector_t old_blocks, new_blocks; - int chunkshift; - int ret = 0; - long pages; - struct bitmap_page *new_bp; - - if (bitmap->storage.file && !init) { - pr_info("md: cannot resize file-based bitmap\n"); - return -EINVAL; - } - - if (chunksize == 0) { - /* If there is enough space, leave the chunk size unchanged, - * else increase by factor of two until there is enough space. - */ - long bytes; - long space = bitmap->mddev->bitmap_info.space; - - if (space == 0) { - /* We don't know how much space there is, so limit - * to current size - in sectors. - */ - bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8); - if (!bitmap->mddev->bitmap_info.external) - bytes += sizeof(bitmap_super_t); - space = DIV_ROUND_UP(bytes, 512); - bitmap->mddev->bitmap_info.space = space; - } - chunkshift = bitmap->counts.chunkshift; - chunkshift--; - do { - /* 'chunkshift' is shift from block size to chunk size */ - chunkshift++; - chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); - bytes = DIV_ROUND_UP(chunks, 8); - if (!bitmap->mddev->bitmap_info.external) - bytes += sizeof(bitmap_super_t); - } while (bytes > (space << 9)); - } else - chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; - - chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); - memset(&store, 0, sizeof(store)); - if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) - ret = bitmap_storage_alloc(&store, chunks, - !bitmap->mddev->bitmap_info.external, - mddev_is_clustered(bitmap->mddev) - ? bitmap->cluster_slot : 0); - if (ret) { - bitmap_file_unmap(&store); - goto err; - } - - pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); - - new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL); - ret = -ENOMEM; - if (!new_bp) { - bitmap_file_unmap(&store); - goto err; - } - - if (!init) - bitmap->mddev->pers->quiesce(bitmap->mddev, 1); - - store.file = bitmap->storage.file; - bitmap->storage.file = NULL; - - if (store.sb_page && bitmap->storage.sb_page) - memcpy(page_address(store.sb_page), - page_address(bitmap->storage.sb_page), - sizeof(bitmap_super_t)); - bitmap_file_unmap(&bitmap->storage); - bitmap->storage = store; - - old_counts = bitmap->counts; - bitmap->counts.bp = new_bp; - bitmap->counts.pages = pages; - bitmap->counts.missing_pages = pages; - bitmap->counts.chunkshift = chunkshift; - bitmap->counts.chunks = chunks; - bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift + - BITMAP_BLOCK_SHIFT); - - blocks = min(old_counts.chunks << old_counts.chunkshift, - chunks << chunkshift); - - spin_lock_irq(&bitmap->counts.lock); - /* For cluster raid, need to pre-allocate bitmap */ - if (mddev_is_clustered(bitmap->mddev)) { - unsigned long page; - for (page = 0; page < pages; page++) { - ret = bitmap_checkpage(&bitmap->counts, page, 1, 1); - if (ret) { - unsigned long k; - - /* deallocate the page memory */ - for (k = 0; k < page; k++) { - kfree(new_bp[k].map); - } - - /* restore some fields from old_counts */ - bitmap->counts.bp = old_counts.bp; - bitmap->counts.pages = old_counts.pages; - bitmap->counts.missing_pages = old_counts.pages; - bitmap->counts.chunkshift = old_counts.chunkshift; - bitmap->counts.chunks = old_counts.chunks; - bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + - BITMAP_BLOCK_SHIFT); - blocks = old_counts.chunks << old_counts.chunkshift; - pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); - break; - } else - bitmap->counts.bp[page].count += 1; - } - } - - for (block = 0; block < blocks; ) { - bitmap_counter_t *bmc_old, *bmc_new; - int set; - - bmc_old = bitmap_get_counter(&old_counts, block, - &old_blocks, 0); - set = bmc_old && NEEDED(*bmc_old); - - if (set) { - bmc_new = bitmap_get_counter(&bitmap->counts, block, - &new_blocks, 1); - if (*bmc_new == 0) { - /* need to set on-disk bits too. */ - sector_t end = block + new_blocks; - sector_t start = block >> chunkshift; - start <<= chunkshift; - while (start < end) { - bitmap_file_set_bit(bitmap, block); - start += 1 << chunkshift; - } - *bmc_new = 2; - bitmap_count_page(&bitmap->counts, - block, 1); - bitmap_set_pending(&bitmap->counts, - block); - } - *bmc_new |= NEEDED_MASK; - if (new_blocks < old_blocks) - old_blocks = new_blocks; - } - block += old_blocks; - } - - if (!init) { - int i; - while (block < (chunks << chunkshift)) { - bitmap_counter_t *bmc; - bmc = bitmap_get_counter(&bitmap->counts, block, - &new_blocks, 1); - if (bmc) { - /* new space. It needs to be resynced, so - * we set NEEDED_MASK. - */ - if (*bmc == 0) { - *bmc = NEEDED_MASK | 2; - bitmap_count_page(&bitmap->counts, - block, 1); - bitmap_set_pending(&bitmap->counts, - block); - } - } - block += new_blocks; - } - for (i = 0; i < bitmap->storage.file_pages; i++) - set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); - } - spin_unlock_irq(&bitmap->counts.lock); - - if (!init) { - bitmap_unplug(bitmap); - bitmap->mddev->pers->quiesce(bitmap->mddev, 0); - } - ret = 0; -err: - return ret; -} -EXPORT_SYMBOL_GPL(bitmap_resize); - -static ssize_t -location_show(struct mddev *mddev, char *page) -{ - ssize_t len; - if (mddev->bitmap_info.file) - len = sprintf(page, "file"); - else if (mddev->bitmap_info.offset) - len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); - else - len = sprintf(page, "none"); - len += sprintf(page+len, "\n"); - return len; -} - -static ssize_t -location_store(struct mddev *mddev, const char *buf, size_t len) -{ - int rv; - - rv = mddev_lock(mddev); - if (rv) - return rv; - if (mddev->pers) { - if (!mddev->pers->quiesce) { - rv = -EBUSY; - goto out; - } - if (mddev->recovery || mddev->sync_thread) { - rv = -EBUSY; - goto out; - } - } - - if (mddev->bitmap || mddev->bitmap_info.file || - mddev->bitmap_info.offset) { - /* bitmap already configured. Only option is to clear it */ - if (strncmp(buf, "none", 4) != 0) { - rv = -EBUSY; - goto out; - } - if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); - bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); - } - mddev->bitmap_info.offset = 0; - if (mddev->bitmap_info.file) { - struct file *f = mddev->bitmap_info.file; - mddev->bitmap_info.file = NULL; - fput(f); - } - } else { - /* No bitmap, OK to set a location */ - long long offset; - if (strncmp(buf, "none", 4) == 0) - /* nothing to be done */; - else if (strncmp(buf, "file:", 5) == 0) { - /* Not supported yet */ - rv = -EINVAL; - goto out; - } else { - if (buf[0] == '+') - rv = kstrtoll(buf+1, 10, &offset); - else - rv = kstrtoll(buf, 10, &offset); - if (rv) - goto out; - if (offset == 0) { - rv = -EINVAL; - goto out; - } - if (mddev->bitmap_info.external == 0 && - mddev->major_version == 0 && - offset != mddev->bitmap_info.default_offset) { - rv = -EINVAL; - goto out; - } - mddev->bitmap_info.offset = offset; - if (mddev->pers) { - struct bitmap *bitmap; - mddev->pers->quiesce(mddev, 1); - bitmap = bitmap_create(mddev, -1); - if (IS_ERR(bitmap)) - rv = PTR_ERR(bitmap); - else { - mddev->bitmap = bitmap; - rv = bitmap_load(mddev); - if (rv) - mddev->bitmap_info.offset = 0; - } - mddev->pers->quiesce(mddev, 0); - if (rv) { - bitmap_destroy(mddev); - goto out; - } - } - } - } - if (!mddev->external) { - /* Ensure new bitmap info is stored in - * metadata promptly. - */ - set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - md_wakeup_thread(mddev->thread); - } - rv = 0; -out: - mddev_unlock(mddev); - if (rv) - return rv; - return len; -} - -static struct md_sysfs_entry bitmap_location = -__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); - -/* 'bitmap/space' is the space available at 'location' for the - * bitmap. This allows the kernel to know when it is safe to - * resize the bitmap to match a resized array. - */ -static ssize_t -space_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%lu\n", mddev->bitmap_info.space); -} - -static ssize_t -space_store(struct mddev *mddev, const char *buf, size_t len) -{ - unsigned long sectors; - int rv; - - rv = kstrtoul(buf, 10, §ors); - if (rv) - return rv; - - if (sectors == 0) - return -EINVAL; - - if (mddev->bitmap && - sectors < (mddev->bitmap->storage.bytes + 511) >> 9) - return -EFBIG; /* Bitmap is too big for this small space */ - - /* could make sure it isn't too big, but that isn't really - * needed - user-space should be careful. - */ - mddev->bitmap_info.space = sectors; - return len; -} - -static struct md_sysfs_entry bitmap_space = -__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store); - -static ssize_t -timeout_show(struct mddev *mddev, char *page) -{ - ssize_t len; - unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; - unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; - - len = sprintf(page, "%lu", secs); - if (jifs) - len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); - len += sprintf(page+len, "\n"); - return len; -} - -static ssize_t -timeout_store(struct mddev *mddev, const char *buf, size_t len) -{ - /* timeout can be set at any time */ - unsigned long timeout; - int rv = strict_strtoul_scaled(buf, &timeout, 4); - if (rv) - return rv; - - /* just to make sure we don't overflow... */ - if (timeout >= LONG_MAX / HZ) - return -EINVAL; - - timeout = timeout * HZ / 10000; - - if (timeout >= MAX_SCHEDULE_TIMEOUT) - timeout = MAX_SCHEDULE_TIMEOUT-1; - if (timeout < 1) - timeout = 1; - mddev->bitmap_info.daemon_sleep = timeout; - if (mddev->thread) { - /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then - * the bitmap is all clean and we don't need to - * adjust the timeout right now - */ - if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { - mddev->thread->timeout = timeout; - md_wakeup_thread(mddev->thread); - } - } - return len; -} - -static struct md_sysfs_entry bitmap_timeout = -__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); - -static ssize_t -backlog_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); -} - -static ssize_t -backlog_store(struct mddev *mddev, const char *buf, size_t len) -{ - unsigned long backlog; - int rv = kstrtoul(buf, 10, &backlog); - if (rv) - return rv; - if (backlog > COUNTER_MAX) - return -EINVAL; - mddev->bitmap_info.max_write_behind = backlog; - return len; -} - -static struct md_sysfs_entry bitmap_backlog = -__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); - -static ssize_t -chunksize_show(struct mddev *mddev, char *page) -{ - return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); -} - -static ssize_t -chunksize_store(struct mddev *mddev, const char *buf, size_t len) -{ - /* Can only be changed when no bitmap is active */ - int rv; - unsigned long csize; - if (mddev->bitmap) - return -EBUSY; - rv = kstrtoul(buf, 10, &csize); - if (rv) - return rv; - if (csize < 512 || - !is_power_of_2(csize)) - return -EINVAL; - mddev->bitmap_info.chunksize = csize; - return len; -} - -static struct md_sysfs_entry bitmap_chunksize = -__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); - -static ssize_t metadata_show(struct mddev *mddev, char *page) -{ - if (mddev_is_clustered(mddev)) - return sprintf(page, "clustered\n"); - return sprintf(page, "%s\n", (mddev->bitmap_info.external - ? "external" : "internal")); -} - -static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) -{ - if (mddev->bitmap || - mddev->bitmap_info.file || - mddev->bitmap_info.offset) - return -EBUSY; - if (strncmp(buf, "external", 8) == 0) - mddev->bitmap_info.external = 1; - else if ((strncmp(buf, "internal", 8) == 0) || - (strncmp(buf, "clustered", 9) == 0)) - mddev->bitmap_info.external = 0; - else - return -EINVAL; - return len; -} - -static struct md_sysfs_entry bitmap_metadata = -__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); - -static ssize_t can_clear_show(struct mddev *mddev, char *page) -{ - int len; - spin_lock(&mddev->lock); - if (mddev->bitmap) - len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? - "false" : "true")); - else - len = sprintf(page, "\n"); - spin_unlock(&mddev->lock); - return len; -} - -static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) -{ - if (mddev->bitmap == NULL) - return -ENOENT; - if (strncmp(buf, "false", 5) == 0) - mddev->bitmap->need_sync = 1; - else if (strncmp(buf, "true", 4) == 0) { - if (mddev->degraded) - return -EBUSY; - mddev->bitmap->need_sync = 0; - } else - return -EINVAL; - return len; -} - -static struct md_sysfs_entry bitmap_can_clear = -__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); - -static ssize_t -behind_writes_used_show(struct mddev *mddev, char *page) -{ - ssize_t ret; - spin_lock(&mddev->lock); - if (mddev->bitmap == NULL) - ret = sprintf(page, "0\n"); - else - ret = sprintf(page, "%lu\n", - mddev->bitmap->behind_writes_used); - spin_unlock(&mddev->lock); - return ret; -} - -static ssize_t -behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) -{ - if (mddev->bitmap) - mddev->bitmap->behind_writes_used = 0; - return len; -} - -static struct md_sysfs_entry max_backlog_used = -__ATTR(max_backlog_used, S_IRUGO | S_IWUSR, - behind_writes_used_show, behind_writes_used_reset); - -static struct attribute *md_bitmap_attrs[] = { - &bitmap_location.attr, - &bitmap_space.attr, - &bitmap_timeout.attr, - &bitmap_backlog.attr, - &bitmap_chunksize.attr, - &bitmap_metadata.attr, - &bitmap_can_clear.attr, - &max_backlog_used.attr, - NULL -}; -struct attribute_group md_bitmap_group = { - .name = "bitmap", - .attrs = md_bitmap_attrs, -}; - diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h deleted file mode 100644 index d15721ac07a6..000000000000 --- a/drivers/md/bitmap.h +++ /dev/null @@ -1,277 +0,0 @@ -/* - * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 - * - * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. - */ -#ifndef BITMAP_H -#define BITMAP_H 1 - -#define BITMAP_MAJOR_LO 3 -/* version 4 insists the bitmap is in little-endian order - * with version 3, it is host-endian which is non-portable - * Version 5 is currently set only for clustered devices - */ -#define BITMAP_MAJOR_HI 4 -#define BITMAP_MAJOR_CLUSTERED 5 -#define BITMAP_MAJOR_HOSTENDIAN 3 - -/* - * in-memory bitmap: - * - * Use 16 bit block counters to track pending writes to each "chunk". - * The 2 high order bits are special-purpose, the first is a flag indicating - * whether a resync is needed. The second is a flag indicating whether a - * resync is active. - * This means that the counter is actually 14 bits: - * - * +--------+--------+------------------------------------------------+ - * | resync | resync | counter | - * | needed | active | | - * | (0-1) | (0-1) | (0-16383) | - * +--------+--------+------------------------------------------------+ - * - * The "resync needed" bit is set when: - * a '1' bit is read from storage at startup. - * a write request fails on some drives - * a resync is aborted on a chunk with 'resync active' set - * It is cleared (and resync-active set) when a resync starts across all drives - * of the chunk. - * - * - * The "resync active" bit is set when: - * a resync is started on all drives, and resync_needed is set. - * resync_needed will be cleared (as long as resync_active wasn't already set). - * It is cleared when a resync completes. - * - * The counter counts pending write requests, plus the on-disk bit. - * When the counter is '1' and the resync bits are clear, the on-disk - * bit can be cleared as well, thus setting the counter to 0. - * When we set a bit, or in the counter (to start a write), if the fields is - * 0, we first set the disk bit and set the counter to 1. - * - * If the counter is 0, the on-disk bit is clear and the stripe is clean - * Anything that dirties the stripe pushes the counter to 2 (at least) - * and sets the on-disk bit (lazily). - * If a periodic sweep find the counter at 2, it is decremented to 1. - * If the sweep find the counter at 1, the on-disk bit is cleared and the - * counter goes to zero. - * - * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block - * counters as a fallback when "page" memory cannot be allocated: - * - * Normal case (page memory allocated): - * - * page pointer (32-bit) - * - * [ ] ------+ - * | - * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) - * c1 c2 c2048 - * - * Hijacked case (page memory allocation failed): - * - * hijacked page pointer (32-bit) - * - * [ ][ ] (no page memory allocated) - * counter #1 (16-bit) counter #2 (16-bit) - * - */ - -#ifdef __KERNEL__ - -#define PAGE_BITS (PAGE_SIZE << 3) -#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) - -typedef __u16 bitmap_counter_t; -#define COUNTER_BITS 16 -#define COUNTER_BIT_SHIFT 4 -#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) - -#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) -#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) -#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) -#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) -#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) -#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) - -/* how many counters per page? */ -#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) -/* same, except a shift value for more efficient bitops */ -#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) -/* same, except a mask value for more efficient bitops */ -#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) - -#define BITMAP_BLOCK_SHIFT 9 - -#endif - -/* - * bitmap structures: - */ - -#define BITMAP_MAGIC 0x6d746962 - -/* use these for bitmap->flags and bitmap->sb->state bit-fields */ -enum bitmap_state { - BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ - BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ - BITMAP_HOSTENDIAN =15, -}; - -/* the superblock at the front of the bitmap file -- little endian */ -typedef struct bitmap_super_s { - __le32 magic; /* 0 BITMAP_MAGIC */ - __le32 version; /* 4 the bitmap major for now, could change... */ - __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ - __le64 events; /* 24 event counter for the bitmap (1)*/ - __le64 events_cleared;/*32 event counter when last bit cleared (2) */ - __le64 sync_size; /* 40 the size of the md device's sync range(3) */ - __le32 state; /* 48 bitmap state information */ - __le32 chunksize; /* 52 the bitmap chunk size in bytes */ - __le32 daemon_sleep; /* 56 seconds between disk flushes */ - __le32 write_behind; /* 60 number of outstanding write-behind writes */ - __le32 sectors_reserved; /* 64 number of 512-byte sectors that are - * reserved for the bitmap. */ - __le32 nodes; /* 68 the maximum number of nodes in cluster. */ - __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ - __u8 pad[256 - 136]; /* set to zero */ -} bitmap_super_t; - -/* notes: - * (1) This event counter is updated before the eventcounter in the md superblock - * When a bitmap is loaded, it is only accepted if this event counter is equal - * to, or one greater than, the event counter in the superblock. - * (2) This event counter is updated when the other one is *if*and*only*if* the - * array is not degraded. As bits are not cleared when the array is degraded, - * this represents the last time that any bits were cleared. - * If a device is being added that has an event count with this value or - * higher, it is accepted as conforming to the bitmap. - * (3)This is the number of sectors represented by the bitmap, and is the range that - * resync happens across. For raid1 and raid5/6 it is the size of individual - * devices. For raid10 it is the size of the array. - */ - -#ifdef __KERNEL__ - -/* the in-memory bitmap is represented by bitmap_pages */ -struct bitmap_page { - /* - * map points to the actual memory page - */ - char *map; - /* - * in emergencies (when map cannot be alloced), hijack the map - * pointer and use it as two counters itself - */ - unsigned int hijacked:1; - /* - * If any counter in this page is '1' or '2' - and so could be - * cleared then that page is marked as 'pending' - */ - unsigned int pending:1; - /* - * count of dirty bits on the page - */ - unsigned int count:30; -}; - -/* the main bitmap structure - one per mddev */ -struct bitmap { - - struct bitmap_counts { - spinlock_t lock; - struct bitmap_page *bp; - unsigned long pages; /* total number of pages - * in the bitmap */ - unsigned long missing_pages; /* number of pages - * not yet allocated */ - unsigned long chunkshift; /* chunksize = 2^chunkshift - * (for bitops) */ - unsigned long chunks; /* Total number of data - * chunks for the array */ - } counts; - - struct mddev *mddev; /* the md device that the bitmap is for */ - - __u64 events_cleared; - int need_sync; - - struct bitmap_storage { - struct file *file; /* backing disk file */ - struct page *sb_page; /* cached copy of the bitmap - * file superblock */ - struct page **filemap; /* list of cache pages for - * the file */ - unsigned long *filemap_attr; /* attributes associated - * w/ filemap pages */ - unsigned long file_pages; /* number of pages in the file*/ - unsigned long bytes; /* total bytes in the bitmap */ - } storage; - - unsigned long flags; - - int allclean; - - atomic_t behind_writes; - unsigned long behind_writes_used; /* highest actual value at runtime */ - - /* - * the bitmap daemon - periodically wakes up and sweeps the bitmap - * file, cleaning up bits and flushing out pages to disk as necessary - */ - unsigned long daemon_lastrun; /* jiffies of last run */ - unsigned long last_end_sync; /* when we lasted called end_sync to - * update bitmap with resync progress */ - - atomic_t pending_writes; /* pending writes to the bitmap file */ - wait_queue_head_t write_wait; - wait_queue_head_t overflow_wait; - wait_queue_head_t behind_wait; - - struct kernfs_node *sysfs_can_clear; - int cluster_slot; /* Slot offset for clustered env */ -}; - -/* the bitmap API */ - -/* these are used only by md/bitmap */ -struct bitmap *bitmap_create(struct mddev *mddev, int slot); -int bitmap_load(struct mddev *mddev); -void bitmap_flush(struct mddev *mddev); -void bitmap_destroy(struct mddev *mddev); - -void bitmap_print_sb(struct bitmap *bitmap); -void bitmap_update_sb(struct bitmap *bitmap); -void bitmap_status(struct seq_file *seq, struct bitmap *bitmap); - -int bitmap_setallbits(struct bitmap *bitmap); -void bitmap_write_all(struct bitmap *bitmap); - -void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); - -/* these are exported */ -int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int behind); -void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, - unsigned long sectors, int success, int behind); -int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); -void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); -void bitmap_close_sync(struct bitmap *bitmap); -void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); -void bitmap_sync_with_cluster(struct mddev *mddev, - sector_t old_lo, sector_t old_hi, - sector_t new_lo, sector_t new_hi); - -void bitmap_unplug(struct bitmap *bitmap); -void bitmap_daemon_work(struct mddev *mddev); - -int bitmap_resize(struct bitmap *bitmap, sector_t blocks, - int chunksize, int init); -struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); -int bitmap_copy_from_slot(struct mddev *mddev, int slot, - sector_t *lo, sector_t *hi, bool clear_bits); -void bitmap_free(struct bitmap *bitmap); -void bitmap_wait_behind_writes(struct mddev *mddev); -#endif - -#endif diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 1ac58c5651b7..252770696a05 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -12,7 +12,7 @@ #include "raid1.h" #include "raid5.h" #include "raid10.h" -#include "bitmap.h" +#include "md-bitmap.h" #include diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c deleted file mode 100644 index 38264b38420f..000000000000 --- a/drivers/md/faulty.c +++ /dev/null @@ -1,372 +0,0 @@ -/* - * faulty.c : Multiple Devices driver for Linux - * - * Copyright (C) 2004 Neil Brown - * - * fautly-device-simulator personality for md - * - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - - -/* - * The "faulty" personality causes some requests to fail. - * - * Possible failure modes are: - * reads fail "randomly" but succeed on retry - * writes fail "randomly" but succeed on retry - * reads for some address fail and then persist until a write - * reads for some address fail and then persist irrespective of write - * writes for some address fail and persist - * all writes fail - * - * Different modes can be active at a time, but only - * one can be set at array creation. Others can be added later. - * A mode can be one-shot or recurrent with the recurrence being - * once in every N requests. - * The bottom 5 bits of the "layout" indicate the mode. The - * remainder indicate a period, or 0 for one-shot. - * - * There is an implementation limit on the number of concurrently - * persisting-faulty blocks. When a new fault is requested that would - * exceed the limit, it is ignored. - * All current faults can be clear using a layout of "0". - * - * Requests are always sent to the device. If they are to fail, - * we clone the bio and insert a new b_end_io into the chain. - */ - -#define WriteTransient 0 -#define ReadTransient 1 -#define WritePersistent 2 -#define ReadPersistent 3 -#define WriteAll 4 /* doesn't go to device */ -#define ReadFixable 5 -#define Modes 6 - -#define ClearErrors 31 -#define ClearFaults 30 - -#define AllPersist 100 /* internal use only */ -#define NoPersist 101 - -#define ModeMask 0x1f -#define ModeShift 5 - -#define MaxFault 50 -#include -#include -#include -#include -#include "md.h" -#include - - -static void faulty_fail(struct bio *bio) -{ - struct bio *b = bio->bi_private; - - b->bi_iter.bi_size = bio->bi_iter.bi_size; - b->bi_iter.bi_sector = bio->bi_iter.bi_sector; - - bio_put(bio); - - bio_io_error(b); -} - -struct faulty_conf { - int period[Modes]; - atomic_t counters[Modes]; - sector_t faults[MaxFault]; - int modes[MaxFault]; - int nfaults; - struct md_rdev *rdev; -}; - -static int check_mode(struct faulty_conf *conf, int mode) -{ - if (conf->period[mode] == 0 && - atomic_read(&conf->counters[mode]) <= 0) - return 0; /* no failure, no decrement */ - - - if (atomic_dec_and_test(&conf->counters[mode])) { - if (conf->period[mode]) - atomic_set(&conf->counters[mode], conf->period[mode]); - return 1; - } - return 0; -} - -static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir) -{ - /* If we find a ReadFixable sector, we fix it ... */ - int i; - for (i=0; infaults; i++) - if (conf->faults[i] >= start && - conf->faults[i] < end) { - /* found it ... */ - switch (conf->modes[i] * 2 + dir) { - case WritePersistent*2+WRITE: return 1; - case ReadPersistent*2+READ: return 1; - case ReadFixable*2+READ: return 1; - case ReadFixable*2+WRITE: - conf->modes[i] = NoPersist; - return 0; - case AllPersist*2+READ: - case AllPersist*2+WRITE: return 1; - default: - return 0; - } - } - return 0; -} - -static void add_sector(struct faulty_conf *conf, sector_t start, int mode) -{ - int i; - int n = conf->nfaults; - for (i=0; infaults; i++) - if (conf->faults[i] == start) { - switch(mode) { - case NoPersist: conf->modes[i] = mode; return; - case WritePersistent: - if (conf->modes[i] == ReadPersistent || - conf->modes[i] == ReadFixable) - conf->modes[i] = AllPersist; - else - conf->modes[i] = WritePersistent; - return; - case ReadPersistent: - if (conf->modes[i] == WritePersistent) - conf->modes[i] = AllPersist; - else - conf->modes[i] = ReadPersistent; - return; - case ReadFixable: - if (conf->modes[i] == WritePersistent || - conf->modes[i] == ReadPersistent) - conf->modes[i] = AllPersist; - else - conf->modes[i] = ReadFixable; - return; - } - } else if (conf->modes[i] == NoPersist) - n = i; - - if (n >= MaxFault) - return; - conf->faults[n] = start; - conf->modes[n] = mode; - if (conf->nfaults == n) - conf->nfaults = n+1; -} - -static bool faulty_make_request(struct mddev *mddev, struct bio *bio) -{ - struct faulty_conf *conf = mddev->private; - int failit = 0; - - if (bio_data_dir(bio) == WRITE) { - /* write request */ - if (atomic_read(&conf->counters[WriteAll])) { - /* special case - don't decrement, don't generic_make_request, - * just fail immediately - */ - bio_io_error(bio); - return true; - } - - if (check_sector(conf, bio->bi_iter.bi_sector, - bio_end_sector(bio), WRITE)) - failit = 1; - if (check_mode(conf, WritePersistent)) { - add_sector(conf, bio->bi_iter.bi_sector, - WritePersistent); - failit = 1; - } - if (check_mode(conf, WriteTransient)) - failit = 1; - } else { - /* read request */ - if (check_sector(conf, bio->bi_iter.bi_sector, - bio_end_sector(bio), READ)) - failit = 1; - if (check_mode(conf, ReadTransient)) - failit = 1; - if (check_mode(conf, ReadPersistent)) { - add_sector(conf, bio->bi_iter.bi_sector, - ReadPersistent); - failit = 1; - } - if (check_mode(conf, ReadFixable)) { - add_sector(conf, bio->bi_iter.bi_sector, - ReadFixable); - failit = 1; - } - } - if (failit) { - struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); - - bio_set_dev(b, conf->rdev->bdev); - b->bi_private = bio; - b->bi_end_io = faulty_fail; - bio = b; - } else - bio_set_dev(bio, conf->rdev->bdev); - - generic_make_request(bio); - return true; -} - -static void faulty_status(struct seq_file *seq, struct mddev *mddev) -{ - struct faulty_conf *conf = mddev->private; - int n; - - if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) - seq_printf(seq, " WriteTransient=%d(%d)", - n, conf->period[WriteTransient]); - - if ((n=atomic_read(&conf->counters[ReadTransient])) != 0) - seq_printf(seq, " ReadTransient=%d(%d)", - n, conf->period[ReadTransient]); - - if ((n=atomic_read(&conf->counters[WritePersistent])) != 0) - seq_printf(seq, " WritePersistent=%d(%d)", - n, conf->period[WritePersistent]); - - if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0) - seq_printf(seq, " ReadPersistent=%d(%d)", - n, conf->period[ReadPersistent]); - - - if ((n=atomic_read(&conf->counters[ReadFixable])) != 0) - seq_printf(seq, " ReadFixable=%d(%d)", - n, conf->period[ReadFixable]); - - if ((n=atomic_read(&conf->counters[WriteAll])) != 0) - seq_printf(seq, " WriteAll"); - - seq_printf(seq, " nfaults=%d", conf->nfaults); -} - - -static int faulty_reshape(struct mddev *mddev) -{ - int mode = mddev->new_layout & ModeMask; - int count = mddev->new_layout >> ModeShift; - struct faulty_conf *conf = mddev->private; - - if (mddev->new_layout < 0) - return 0; - - /* new layout */ - if (mode == ClearFaults) - conf->nfaults = 0; - else if (mode == ClearErrors) { - int i; - for (i=0 ; i < Modes ; i++) { - conf->period[i] = 0; - atomic_set(&conf->counters[i], 0); - } - } else if (mode < Modes) { - conf->period[mode] = count; - if (!count) count++; - atomic_set(&conf->counters[mode], count); - } else - return -EINVAL; - mddev->new_layout = -1; - mddev->layout = -1; /* makes sure further changes come through */ - return 0; -} - -static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - WARN_ONCE(raid_disks, - "%s does not support generic reshape\n", __func__); - - if (sectors == 0) - return mddev->dev_sectors; - - return sectors; -} - -static int faulty_run(struct mddev *mddev) -{ - struct md_rdev *rdev; - int i; - struct faulty_conf *conf; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - - conf = kmalloc(sizeof(*conf), GFP_KERNEL); - if (!conf) - return -ENOMEM; - - for (i=0; icounters[i], 0); - conf->period[i] = 0; - } - conf->nfaults = 0; - - rdev_for_each(rdev, mddev) { - conf->rdev = rdev; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - } - - md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); - mddev->private = conf; - - faulty_reshape(mddev); - - return 0; -} - -static void faulty_free(struct mddev *mddev, void *priv) -{ - struct faulty_conf *conf = priv; - - kfree(conf); -} - -static struct md_personality faulty_personality = -{ - .name = "faulty", - .level = LEVEL_FAULTY, - .owner = THIS_MODULE, - .make_request = faulty_make_request, - .run = faulty_run, - .free = faulty_free, - .status = faulty_status, - .check_reshape = faulty_reshape, - .size = faulty_size, -}; - -static int __init raid_init(void) -{ - return register_md_personality(&faulty_personality); -} - -static void raid_exit(void) -{ - unregister_md_personality(&faulty_personality); -} - -module_init(raid_init); -module_exit(raid_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Fault injection personality for MD"); -MODULE_ALIAS("md-personality-10"); /* faulty */ -MODULE_ALIAS("md-faulty"); -MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/linear.c b/drivers/md/linear.c deleted file mode 100644 index c464fb48039a..000000000000 --- a/drivers/md/linear.c +++ /dev/null @@ -1,348 +0,0 @@ -/* - linear.c : Multiple Devices driver for Linux - Copyright (C) 1994-96 Marc ZYNGIER - or - - - Linear mode management functions. - - This program is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation; either version 2, or (at your option) - any later version. - - You should have received a copy of the GNU General Public License - (for example /usr/src/linux/COPYING); if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -*/ - -#include -#include -#include -#include -#include -#include -#include "md.h" -#include "linear.h" - -/* - * find which device holds a particular offset - */ -static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) -{ - int lo, mid, hi; - struct linear_conf *conf; - - lo = 0; - hi = mddev->raid_disks - 1; - conf = mddev->private; - - /* - * Binary Search - */ - - while (hi > lo) { - - mid = (hi + lo) / 2; - if (sector < conf->disks[mid].end_sector) - hi = mid; - else - lo = mid + 1; - } - - return conf->disks + lo; -} - -/* - * In linear_congested() conf->raid_disks is used as a copy of - * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks - * and conf->disks[] are created in linear_conf(), they are always - * consitent with each other, but mddev->raid_disks does not. - */ -static int linear_congested(struct mddev *mddev, int bits) -{ - struct linear_conf *conf; - int i, ret = 0; - - rcu_read_lock(); - conf = rcu_dereference(mddev->private); - - for (i = 0; i < conf->raid_disks && !ret ; i++) { - struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); - ret |= bdi_congested(q->backing_dev_info, bits); - } - - rcu_read_unlock(); - return ret; -} - -static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - struct linear_conf *conf; - sector_t array_sectors; - - conf = mddev->private; - WARN_ONCE(sectors || raid_disks, - "%s does not support generic reshape\n", __func__); - array_sectors = conf->array_sectors; - - return array_sectors; -} - -static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) -{ - struct linear_conf *conf; - struct md_rdev *rdev; - int i, cnt; - bool discard_supported = false; - - conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), - GFP_KERNEL); - if (!conf) - return NULL; - - cnt = 0; - conf->array_sectors = 0; - - rdev_for_each(rdev, mddev) { - int j = rdev->raid_disk; - struct dev_info *disk = conf->disks + j; - sector_t sectors; - - if (j < 0 || j >= raid_disks || disk->rdev) { - pr_warn("md/linear:%s: disk numbering problem. Aborting!\n", - mdname(mddev)); - goto out; - } - - disk->rdev = rdev; - if (mddev->chunk_sectors) { - sectors = rdev->sectors; - sector_div(sectors, mddev->chunk_sectors); - rdev->sectors = sectors * mddev->chunk_sectors; - } - - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - conf->array_sectors += rdev->sectors; - cnt++; - - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) - discard_supported = true; - } - if (cnt != raid_disks) { - pr_warn("md/linear:%s: not enough drives present. Aborting!\n", - mdname(mddev)); - goto out; - } - - if (!discard_supported) - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - else - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); - - /* - * Here we calculate the device offsets. - */ - conf->disks[0].end_sector = conf->disks[0].rdev->sectors; - - for (i = 1; i < raid_disks; i++) - conf->disks[i].end_sector = - conf->disks[i-1].end_sector + - conf->disks[i].rdev->sectors; - - /* - * conf->raid_disks is copy of mddev->raid_disks. The reason to - * keep a copy of mddev->raid_disks in struct linear_conf is, - * mddev->raid_disks may not be consistent with pointers number of - * conf->disks[] when it is updated in linear_add() and used to - * iterate old conf->disks[] earray in linear_congested(). - * Here conf->raid_disks is always consitent with number of - * pointers in conf->disks[] array, and mddev->private is updated - * with rcu_assign_pointer() in linear_addr(), such race can be - * avoided. - */ - conf->raid_disks = raid_disks; - - return conf; - -out: - kfree(conf); - return NULL; -} - -static int linear_run (struct mddev *mddev) -{ - struct linear_conf *conf; - int ret; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - conf = linear_conf(mddev, mddev->raid_disks); - - if (!conf) - return 1; - mddev->private = conf; - md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - - ret = md_integrity_register(mddev); - if (ret) { - kfree(conf); - mddev->private = NULL; - } - return ret; -} - -static int linear_add(struct mddev *mddev, struct md_rdev *rdev) -{ - /* Adding a drive to a linear array allows the array to grow. - * It is permitted if the new drive has a matching superblock - * already on it, with raid_disk equal to raid_disks. - * It is achieved by creating a new linear_private_data structure - * and swapping it in in-place of the current one. - * The current one is never freed until the array is stopped. - * This avoids races. - */ - struct linear_conf *newconf, *oldconf; - - if (rdev->saved_raid_disk != mddev->raid_disks) - return -EINVAL; - - rdev->raid_disk = rdev->saved_raid_disk; - rdev->saved_raid_disk = -1; - - newconf = linear_conf(mddev,mddev->raid_disks+1); - - if (!newconf) - return -ENOMEM; - - /* newconf->raid_disks already keeps a copy of * the increased - * value of mddev->raid_disks, WARN_ONCE() is just used to make - * sure of this. It is possible that oldconf is still referenced - * in linear_congested(), therefore kfree_rcu() is used to free - * oldconf until no one uses it anymore. - */ - mddev_suspend(mddev); - oldconf = rcu_dereference_protected(mddev->private, - lockdep_is_held(&mddev->reconfig_mutex)); - mddev->raid_disks++; - WARN_ONCE(mddev->raid_disks != newconf->raid_disks, - "copied raid_disks doesn't match mddev->raid_disks"); - rcu_assign_pointer(mddev->private, newconf); - md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); - set_capacity(mddev->gendisk, mddev->array_sectors); - mddev_resume(mddev); - revalidate_disk(mddev->gendisk); - kfree_rcu(oldconf, rcu); - return 0; -} - -static void linear_free(struct mddev *mddev, void *priv) -{ - struct linear_conf *conf = priv; - - kfree(conf); -} - -static bool linear_make_request(struct mddev *mddev, struct bio *bio) -{ - char b[BDEVNAME_SIZE]; - struct dev_info *tmp_dev; - sector_t start_sector, end_sector, data_offset; - sector_t bio_sector = bio->bi_iter.bi_sector; - - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); - return true; - } - - tmp_dev = which_dev(mddev, bio_sector); - start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; - end_sector = tmp_dev->end_sector; - data_offset = tmp_dev->rdev->data_offset; - - if (unlikely(bio_sector >= end_sector || - bio_sector < start_sector)) - goto out_of_bounds; - - if (unlikely(bio_end_sector(bio) > end_sector)) { - /* This bio crosses a device boundary, so we have to split it */ - struct bio *split = bio_split(bio, end_sector - bio_sector, - GFP_NOIO, mddev->bio_set); - bio_chain(split, bio); - generic_make_request(bio); - bio = split; - } - - bio_set_dev(bio, tmp_dev->rdev->bdev); - bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - - start_sector + data_offset; - - if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) { - /* Just ignore it */ - bio_endio(bio); - } else { - if (mddev->gendisk) - trace_block_bio_remap(bio->bi_disk->queue, - bio, disk_devt(mddev->gendisk), - bio_sector); - mddev_check_writesame(mddev, bio); - mddev_check_write_zeroes(mddev, bio); - generic_make_request(bio); - } - return true; - -out_of_bounds: - pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n", - mdname(mddev), - (unsigned long long)bio->bi_iter.bi_sector, - bdevname(tmp_dev->rdev->bdev, b), - (unsigned long long)tmp_dev->rdev->sectors, - (unsigned long long)start_sector); - bio_io_error(bio); - return true; -} - -static void linear_status (struct seq_file *seq, struct mddev *mddev) -{ - seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); -} - -static void linear_quiesce(struct mddev *mddev, int state) -{ -} - -static struct md_personality linear_personality = -{ - .name = "linear", - .level = LEVEL_LINEAR, - .owner = THIS_MODULE, - .make_request = linear_make_request, - .run = linear_run, - .free = linear_free, - .status = linear_status, - .hot_add_disk = linear_add, - .size = linear_size, - .quiesce = linear_quiesce, - .congested = linear_congested, -}; - -static int __init linear_init (void) -{ - return register_md_personality (&linear_personality); -} - -static void linear_exit (void) -{ - unregister_md_personality (&linear_personality); -} - -module_init(linear_init); -module_exit(linear_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Linear device concatenation personality for MD"); -MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ -MODULE_ALIAS("md-linear"); -MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/linear.h b/drivers/md/linear.h deleted file mode 100644 index 8d392e6098b3..000000000000 --- a/drivers/md/linear.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _LINEAR_H -#define _LINEAR_H - -struct dev_info { - struct md_rdev *rdev; - sector_t end_sector; -}; - -struct linear_conf -{ - struct rcu_head rcu; - sector_t array_sectors; - int raid_disks; /* a copy of mddev->raid_disks */ - struct dev_info disks[0]; -}; -#endif diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c new file mode 100644 index 000000000000..b843b53b0f65 --- /dev/null +++ b/drivers/md/md-bitmap.c @@ -0,0 +1,2591 @@ +/* + * bitmap.c two-level bitmap (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * bitmap_create - sets up the bitmap structure + * bitmap_destroy - destroys the bitmap structure + * + * additions, Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.: + * - added disk storage for bitmap + * - changes to allow various bitmap chunk sizes + */ + +/* + * Still to do: + * + * flush after percent set rather than just time based. (maybe both). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "md.h" +#include "md-bitmap.h" + +static inline char *bmname(struct bitmap *bitmap) +{ + return bitmap->mddev ? mdname(bitmap->mddev) : "mdX"; +} + +/* + * check a page and, if necessary, allocate it (or hijack it if the alloc fails) + * + * 1) check to see if this page is allocated, if it's not then try to alloc + * 2) if the alloc fails, set the page's hijacked flag so we'll use the + * page pointer directly as a counter + * + * if we find our page, we increment the page's refcount so that it stays + * allocated while we're using it + */ +static int bitmap_checkpage(struct bitmap_counts *bitmap, + unsigned long page, int create, int no_hijack) +__releases(bitmap->lock) +__acquires(bitmap->lock) +{ + unsigned char *mappage; + + if (page >= bitmap->pages) { + /* This can happen if bitmap_start_sync goes beyond + * End-of-device while looking for a whole page. + * It is harmless. + */ + return -EINVAL; + } + + if (bitmap->bp[page].hijacked) /* it's hijacked, don't try to alloc */ + return 0; + + if (bitmap->bp[page].map) /* page is already allocated, just return */ + return 0; + + if (!create) + return -ENOENT; + + /* this page has not been allocated yet */ + + spin_unlock_irq(&bitmap->lock); + /* It is possible that this is being called inside a + * prepare_to_wait/finish_wait loop from raid5c:make_request(). + * In general it is not permitted to sleep in that context as it + * can cause the loop to spin freely. + * That doesn't apply here as we can only reach this point + * once with any loop. + * When this function completes, either bp[page].map or + * bp[page].hijacked. In either case, this function will + * abort before getting to this point again. So there is + * no risk of a free-spin, and so it is safe to assert + * that sleeping here is allowed. + */ + sched_annotate_sleep(); + mappage = kzalloc(PAGE_SIZE, GFP_NOIO); + spin_lock_irq(&bitmap->lock); + + if (mappage == NULL) { + pr_debug("md/bitmap: map page allocation failed, hijacking\n"); + /* We don't support hijack for cluster raid */ + if (no_hijack) + return -ENOMEM; + /* failed - set the hijacked flag so that we can use the + * pointer as a counter */ + if (!bitmap->bp[page].map) + bitmap->bp[page].hijacked = 1; + } else if (bitmap->bp[page].map || + bitmap->bp[page].hijacked) { + /* somebody beat us to getting the page */ + kfree(mappage); + } else { + + /* no page was in place and we have one, so install it */ + + bitmap->bp[page].map = mappage; + bitmap->missing_pages--; + } + return 0; +} + +/* if page is completely empty, put it back on the free list, or dealloc it */ +/* if page was hijacked, unmark the flag so it might get alloced next time */ +/* Note: lock should be held when calling this */ +static void bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page) +{ + char *ptr; + + if (bitmap->bp[page].count) /* page is still busy */ + return; + + /* page is no longer in use, it can be released */ + + if (bitmap->bp[page].hijacked) { /* page was hijacked, undo this now */ + bitmap->bp[page].hijacked = 0; + bitmap->bp[page].map = NULL; + } else { + /* normal case, free the page */ + ptr = bitmap->bp[page].map; + bitmap->bp[page].map = NULL; + bitmap->missing_pages++; + kfree(ptr); + } +} + +/* + * bitmap file handling - read and write the bitmap file and its superblock + */ + +/* + * basic page I/O operations + */ + +/* IO operations when bitmap is stored near all superblocks */ +static int read_sb_page(struct mddev *mddev, loff_t offset, + struct page *page, + unsigned long index, int size) +{ + /* choose a good rdev and read the page from there */ + + struct md_rdev *rdev; + sector_t target; + + rdev_for_each(rdev, mddev) { + if (! test_bit(In_sync, &rdev->flags) + || test_bit(Faulty, &rdev->flags) + || test_bit(Bitmap_sync, &rdev->flags)) + continue; + + target = offset + index * (PAGE_SIZE/512); + + if (sync_page_io(rdev, target, + roundup(size, bdev_logical_block_size(rdev->bdev)), + page, REQ_OP_READ, 0, true)) { + page->index = index; + return 0; + } + } + return -EIO; +} + +static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mddev) +{ + /* Iterate the disks of an mddev, using rcu to protect access to the + * linked list, and raising the refcount of devices we return to ensure + * they don't disappear while in use. + * As devices are only added or removed when raid_disk is < 0 and + * nr_pending is 0 and In_sync is clear, the entries we return will + * still be in the same position on the list when we re-enter + * list_for_each_entry_continue_rcu. + * + * Note that if entered with 'rdev == NULL' to start at the + * beginning, we temporarily assign 'rdev' to an address which + * isn't really an rdev, but which can be used by + * list_for_each_entry_continue_rcu() to find the first entry. + */ + rcu_read_lock(); + if (rdev == NULL) + /* start at the beginning */ + rdev = list_entry(&mddev->disks, struct md_rdev, same_set); + else { + /* release the previous rdev and start from there. */ + rdev_dec_pending(rdev, mddev); + } + list_for_each_entry_continue_rcu(rdev, &mddev->disks, same_set) { + if (rdev->raid_disk >= 0 && + !test_bit(Faulty, &rdev->flags)) { + /* this is a usable devices */ + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + return rdev; + } + } + rcu_read_unlock(); + return NULL; +} + +static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct md_rdev *rdev; + struct block_device *bdev; + struct mddev *mddev = bitmap->mddev; + struct bitmap_storage *store = &bitmap->storage; + +restart: + rdev = NULL; + while ((rdev = next_active_rdev(rdev, mddev)) != NULL) { + int size = PAGE_SIZE; + loff_t offset = mddev->bitmap_info.offset; + + bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; + + if (page->index == store->file_pages-1) { + int last_page_size = store->bytes & (PAGE_SIZE-1); + if (last_page_size == 0) + last_page_size = PAGE_SIZE; + size = roundup(last_page_size, + bdev_logical_block_size(bdev)); + } + /* Just make sure we aren't corrupting data or + * metadata + */ + if (mddev->external) { + /* Bitmap could be anywhere. */ + if (rdev->sb_start + offset + (page->index + * (PAGE_SIZE/512)) + > rdev->data_offset + && + rdev->sb_start + offset + < (rdev->data_offset + mddev->dev_sectors + + (PAGE_SIZE/512))) + goto bad_alignment; + } else if (offset < 0) { + /* DATA BITMAP METADATA */ + if (offset + + (long)(page->index * (PAGE_SIZE/512)) + + size/512 > 0) + /* bitmap runs in to metadata */ + goto bad_alignment; + if (rdev->data_offset + mddev->dev_sectors + > rdev->sb_start + offset) + /* data runs in to bitmap */ + goto bad_alignment; + } else if (rdev->sb_start < rdev->data_offset) { + /* METADATA BITMAP DATA */ + if (rdev->sb_start + + offset + + page->index*(PAGE_SIZE/512) + size/512 + > rdev->data_offset) + /* bitmap runs in to data */ + goto bad_alignment; + } else { + /* DATA METADATA BITMAP - no problems */ + } + md_super_write(mddev, rdev, + rdev->sb_start + offset + + page->index * (PAGE_SIZE/512), + size, + page); + } + + if (wait && md_super_wait(mddev) < 0) + goto restart; + return 0; + + bad_alignment: + return -EINVAL; +} + +static void bitmap_file_kick(struct bitmap *bitmap); +/* + * write out a page to a file + */ +static void write_page(struct bitmap *bitmap, struct page *page, int wait) +{ + struct buffer_head *bh; + + if (bitmap->storage.file == NULL) { + switch (write_sb_page(bitmap, page, wait)) { + case -EINVAL: + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); + } + } else { + + bh = page_buffers(page); + + while (bh && bh->b_blocknr) { + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); + bh = bh->b_this_page; + } + + if (wait) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + } + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + bitmap_file_kick(bitmap); +} + +static void end_bitmap_write(struct buffer_head *bh, int uptodate) +{ + struct bitmap *bitmap = bh->b_private; + + if (!uptodate) + set_bit(BITMAP_WRITE_ERROR, &bitmap->flags); + if (atomic_dec_and_test(&bitmap->pending_writes)) + wake_up(&bitmap->write_wait); +} + +/* copied from buffer.c */ +static void +__clear_page_buffers(struct page *page) +{ + ClearPagePrivate(page); + set_page_private(page, 0); + put_page(page); +} +static void free_buffers(struct page *page) +{ + struct buffer_head *bh; + + if (!PagePrivate(page)) + return; + + bh = page_buffers(page); + while (bh) { + struct buffer_head *next = bh->b_this_page; + free_buffer_head(bh); + bh = next; + } + __clear_page_buffers(page); + put_page(page); +} + +/* read a page from a file. + * We both read the page, and attach buffers to the page to record the + * address of each block (using bmap). These addresses will be used + * to write the block later, completely bypassing the filesystem. + * This usage is similar to how swap files are handled, and allows us + * to write to a file with no concerns of memory allocation failing. + */ +static int read_page(struct file *file, unsigned long index, + struct bitmap *bitmap, + unsigned long count, + struct page *page) +{ + int ret = 0; + struct inode *inode = file_inode(file); + struct buffer_head *bh; + sector_t block; + + pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, + (unsigned long long)index << PAGE_SHIFT); + + bh = alloc_page_buffers(page, 1<i_blkbits, 0); + if (!bh) { + ret = -ENOMEM; + goto out; + } + attach_page_buffers(page, bh); + block = index << (PAGE_SHIFT - inode->i_blkbits); + while (bh) { + if (count == 0) + bh->b_blocknr = 0; + else { + bh->b_blocknr = bmap(inode, block); + if (bh->b_blocknr == 0) { + /* Cannot use this file! */ + ret = -EINVAL; + goto out; + } + bh->b_bdev = inode->i_sb->s_bdev; + if (count < (1<i_blkbits)) + count = 0; + else + count -= (1<i_blkbits); + + bh->b_end_io = end_bitmap_write; + bh->b_private = bitmap; + atomic_inc(&bitmap->pending_writes); + set_buffer_locked(bh); + set_buffer_mapped(bh); + submit_bh(REQ_OP_READ, 0, bh); + } + block++; + bh = bh->b_this_page; + } + page->index = index; + + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + ret = -EIO; +out: + if (ret) + pr_err("md: bitmap read error: (%dB @ %llu): %d\n", + (int)PAGE_SIZE, + (unsigned long long)index << PAGE_SHIFT, + ret); + return ret; +} + +/* + * bitmap file superblock operations + */ + +/* + * bitmap_wait_writes() should be called before writing any bitmap + * blocks, to ensure previous writes, particularly from + * bitmap_daemon_work(), have completed. + */ +static void bitmap_wait_writes(struct bitmap *bitmap) +{ + if (bitmap->storage.file) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + else + /* Note that we ignore the return value. The writes + * might have failed, but that would just mean that + * some bits which should be cleared haven't been, + * which is safe. The relevant bitmap blocks will + * probably get written again, but there is no great + * loss if they aren't. + */ + md_super_wait(bitmap->mddev); +} + + +/* update the event counter and sync the superblock to disk */ +void bitmap_update_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + + if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ + return; + if (bitmap->mddev->bitmap_info.external) + return; + if (!bitmap->storage.sb_page) /* no superblock */ + return; + sb = kmap_atomic(bitmap->storage.sb_page); + sb->events = cpu_to_le64(bitmap->mddev->events); + if (bitmap->mddev->events < bitmap->events_cleared) + /* rocking back to read-only */ + bitmap->events_cleared = bitmap->mddev->events; + sb->events_cleared = cpu_to_le64(bitmap->events_cleared); + sb->state = cpu_to_le32(bitmap->flags); + /* Just in case these have been changed via sysfs: */ + sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); + sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); + /* This might have been changed by a reshape */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize); + sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes); + sb->sectors_reserved = cpu_to_le32(bitmap->mddev-> + bitmap_info.space); + kunmap_atomic(sb); + write_page(bitmap, bitmap->storage.sb_page, 1); +} +EXPORT_SYMBOL(bitmap_update_sb); + +/* print out the bitmap file superblock */ +void bitmap_print_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + + if (!bitmap || !bitmap->storage.sb_page) + return; + sb = kmap_atomic(bitmap->storage.sb_page); + pr_debug("%s: bitmap file superblock:\n", bmname(bitmap)); + pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic)); + pr_debug(" version: %d\n", le32_to_cpu(sb->version)); + pr_debug(" uuid: %08x.%08x.%08x.%08x\n", + le32_to_cpu(*(__u32 *)(sb->uuid+0)), + le32_to_cpu(*(__u32 *)(sb->uuid+4)), + le32_to_cpu(*(__u32 *)(sb->uuid+8)), + le32_to_cpu(*(__u32 *)(sb->uuid+12))); + pr_debug(" events: %llu\n", + (unsigned long long) le64_to_cpu(sb->events)); + pr_debug("events cleared: %llu\n", + (unsigned long long) le64_to_cpu(sb->events_cleared)); + pr_debug(" state: %08x\n", le32_to_cpu(sb->state)); + pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize)); + pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); + pr_debug(" sync size: %llu KB\n", + (unsigned long long)le64_to_cpu(sb->sync_size)/2); + pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind)); + kunmap_atomic(sb); +} + +/* + * bitmap_new_disk_sb + * @bitmap + * + * This function is somewhat the reverse of bitmap_read_sb. bitmap_read_sb + * reads and verifies the on-disk bitmap superblock and populates bitmap_info. + * This function verifies 'bitmap_info' and populates the on-disk bitmap + * structure, which is to be written to disk. + * + * Returns: 0 on success, -Exxx on error + */ +static int bitmap_new_disk_sb(struct bitmap *bitmap) +{ + bitmap_super_t *sb; + unsigned long chunksize, daemon_sleep, write_behind; + + bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (bitmap->storage.sb_page == NULL) + return -ENOMEM; + bitmap->storage.sb_page->index = 0; + + sb = kmap_atomic(bitmap->storage.sb_page); + + sb->magic = cpu_to_le32(BITMAP_MAGIC); + sb->version = cpu_to_le32(BITMAP_MAJOR_HI); + + chunksize = bitmap->mddev->bitmap_info.chunksize; + BUG_ON(!chunksize); + if (!is_power_of_2(chunksize)) { + kunmap_atomic(sb); + pr_warn("bitmap chunksize not a power of 2\n"); + return -EINVAL; + } + sb->chunksize = cpu_to_le32(chunksize); + + daemon_sleep = bitmap->mddev->bitmap_info.daemon_sleep; + if (!daemon_sleep || (daemon_sleep > MAX_SCHEDULE_TIMEOUT)) { + pr_debug("Choosing daemon_sleep default (5 sec)\n"); + daemon_sleep = 5 * HZ; + } + sb->daemon_sleep = cpu_to_le32(daemon_sleep); + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + + /* + * FIXME: write_behind for RAID1. If not specified, what + * is a good choice? We choose COUNTER_MAX / 2 arbitrarily. + */ + write_behind = bitmap->mddev->bitmap_info.max_write_behind; + if (write_behind > COUNTER_MAX) + write_behind = COUNTER_MAX / 2; + sb->write_behind = cpu_to_le32(write_behind); + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + + /* keep the array size field of the bitmap superblock up to date */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + + memcpy(sb->uuid, bitmap->mddev->uuid, 16); + + set_bit(BITMAP_STALE, &bitmap->flags); + sb->state = cpu_to_le32(bitmap->flags); + bitmap->events_cleared = bitmap->mddev->events; + sb->events_cleared = cpu_to_le64(bitmap->mddev->events); + bitmap->mddev->bitmap_info.nodes = 0; + + kunmap_atomic(sb); + + return 0; +} + +/* read the superblock from the bitmap file and initialize some bitmap fields */ +static int bitmap_read_sb(struct bitmap *bitmap) +{ + char *reason = NULL; + bitmap_super_t *sb; + unsigned long chunksize, daemon_sleep, write_behind; + unsigned long long events; + int nodes = 0; + unsigned long sectors_reserved = 0; + int err = -EINVAL; + struct page *sb_page; + loff_t offset = bitmap->mddev->bitmap_info.offset; + + if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) { + chunksize = 128 * 1024 * 1024; + daemon_sleep = 5 * HZ; + write_behind = 0; + set_bit(BITMAP_STALE, &bitmap->flags); + err = 0; + goto out_no_sb; + } + /* page 0 is the superblock, read it... */ + sb_page = alloc_page(GFP_KERNEL); + if (!sb_page) + return -ENOMEM; + bitmap->storage.sb_page = sb_page; + +re_read: + /* If cluster_slot is set, the cluster is setup */ + if (bitmap->cluster_slot >= 0) { + sector_t bm_blocks = bitmap->mddev->resync_max_sectors; + + sector_div(bm_blocks, + bitmap->mddev->bitmap_info.chunksize >> 9); + /* bits to bytes */ + bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t); + /* to 4k blocks */ + bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096); + offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3)); + pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__, + bitmap->cluster_slot, offset); + } + + if (bitmap->storage.file) { + loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host); + int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize; + + err = read_page(bitmap->storage.file, 0, + bitmap, bytes, sb_page); + } else { + err = read_sb_page(bitmap->mddev, + offset, + sb_page, + 0, sizeof(bitmap_super_t)); + } + if (err) + return err; + + err = -EINVAL; + sb = kmap_atomic(sb_page); + + chunksize = le32_to_cpu(sb->chunksize); + daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; + write_behind = le32_to_cpu(sb->write_behind); + sectors_reserved = le32_to_cpu(sb->sectors_reserved); + /* Setup nodes/clustername only if bitmap version is + * cluster-compatible + */ + if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) { + nodes = le32_to_cpu(sb->nodes); + strlcpy(bitmap->mddev->bitmap_info.cluster_name, + sb->cluster_name, 64); + } + + /* verify that the bitmap-specific fields are valid */ + if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) + reason = "bad magic"; + else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || + le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED) + reason = "unrecognized superblock version"; + else if (chunksize < 512) + reason = "bitmap chunksize too small"; + else if (!is_power_of_2(chunksize)) + reason = "bitmap chunksize not a power of 2"; + else if (daemon_sleep < 1 || daemon_sleep > MAX_SCHEDULE_TIMEOUT) + reason = "daemon sleep period out of range"; + else if (write_behind > COUNTER_MAX) + reason = "write-behind limit out of range (0 - 16383)"; + if (reason) { + pr_warn("%s: invalid bitmap file superblock: %s\n", + bmname(bitmap), reason); + goto out; + } + + /* keep the array size field of the bitmap superblock up to date */ + sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); + + if (bitmap->mddev->persistent) { + /* + * We have a persistent array superblock, so compare the + * bitmap's UUID and event counter to the mddev's + */ + if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { + pr_warn("%s: bitmap superblock UUID mismatch\n", + bmname(bitmap)); + goto out; + } + events = le64_to_cpu(sb->events); + if (!nodes && (events < bitmap->mddev->events)) { + pr_warn("%s: bitmap file is out of date (%llu < %llu) -- forcing full recovery\n", + bmname(bitmap), events, + (unsigned long long) bitmap->mddev->events); + set_bit(BITMAP_STALE, &bitmap->flags); + } + } + + /* assign fields using values from superblock */ + bitmap->flags |= le32_to_cpu(sb->state); + if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN) + set_bit(BITMAP_HOSTENDIAN, &bitmap->flags); + bitmap->events_cleared = le64_to_cpu(sb->events_cleared); + strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64); + err = 0; + +out: + kunmap_atomic(sb); + /* Assigning chunksize is required for "re_read" */ + bitmap->mddev->bitmap_info.chunksize = chunksize; + if (err == 0 && nodes && (bitmap->cluster_slot < 0)) { + err = md_setup_cluster(bitmap->mddev, nodes); + if (err) { + pr_warn("%s: Could not setup cluster service (%d)\n", + bmname(bitmap), err); + goto out_no_sb; + } + bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev); + goto re_read; + } + + +out_no_sb: + if (test_bit(BITMAP_STALE, &bitmap->flags)) + bitmap->events_cleared = bitmap->mddev->events; + bitmap->mddev->bitmap_info.chunksize = chunksize; + bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; + bitmap->mddev->bitmap_info.max_write_behind = write_behind; + bitmap->mddev->bitmap_info.nodes = nodes; + if (bitmap->mddev->bitmap_info.space == 0 || + bitmap->mddev->bitmap_info.space > sectors_reserved) + bitmap->mddev->bitmap_info.space = sectors_reserved; + if (err) { + bitmap_print_sb(bitmap); + if (bitmap->cluster_slot < 0) + md_cluster_stop(bitmap->mddev); + } + return err; +} + +/* + * general bitmap file operations + */ + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ +/* calculate the index of the page that contains this bit */ +static inline unsigned long file_page_index(struct bitmap_storage *store, + unsigned long chunk) +{ + if (store->sb_page) + chunk += sizeof(bitmap_super_t) << 3; + return chunk >> PAGE_BIT_SHIFT; +} + +/* calculate the (bit) offset of this bit within a page */ +static inline unsigned long file_page_offset(struct bitmap_storage *store, + unsigned long chunk) +{ + if (store->sb_page) + chunk += sizeof(bitmap_super_t) << 3; + return chunk & (PAGE_BITS - 1); +} + +/* + * return a pointer to the page in the filemap that contains the given bit + * + */ +static inline struct page *filemap_get_page(struct bitmap_storage *store, + unsigned long chunk) +{ + if (file_page_index(store, chunk) >= store->file_pages) + return NULL; + return store->filemap[file_page_index(store, chunk)]; +} + +static int bitmap_storage_alloc(struct bitmap_storage *store, + unsigned long chunks, int with_super, + int slot_number) +{ + int pnum, offset = 0; + unsigned long num_pages; + unsigned long bytes; + + bytes = DIV_ROUND_UP(chunks, 8); + if (with_super) + bytes += sizeof(bitmap_super_t); + + num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE); + offset = slot_number * num_pages; + + store->filemap = kmalloc(sizeof(struct page *) + * num_pages, GFP_KERNEL); + if (!store->filemap) + return -ENOMEM; + + if (with_super && !store->sb_page) { + store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (store->sb_page == NULL) + return -ENOMEM; + } + + pnum = 0; + if (store->sb_page) { + store->filemap[0] = store->sb_page; + pnum = 1; + store->sb_page->index = offset; + } + + for ( ; pnum < num_pages; pnum++) { + store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO); + if (!store->filemap[pnum]) { + store->file_pages = pnum; + return -ENOMEM; + } + store->filemap[pnum]->index = pnum + offset; + } + store->file_pages = pnum; + + /* We need 4 bits per page, rounded up to a multiple + * of sizeof(unsigned long) */ + store->filemap_attr = kzalloc( + roundup(DIV_ROUND_UP(num_pages*4, 8), sizeof(unsigned long)), + GFP_KERNEL); + if (!store->filemap_attr) + return -ENOMEM; + + store->bytes = bytes; + + return 0; +} + +static void bitmap_file_unmap(struct bitmap_storage *store) +{ + struct page **map, *sb_page; + int pages; + struct file *file; + + file = store->file; + map = store->filemap; + pages = store->file_pages; + sb_page = store->sb_page; + + while (pages--) + if (map[pages] != sb_page) /* 0 is sb_page, release it below */ + free_buffers(map[pages]); + kfree(map); + kfree(store->filemap_attr); + + if (sb_page) + free_buffers(sb_page); + + if (file) { + struct inode *inode = file_inode(file); + invalidate_mapping_pages(inode->i_mapping, 0, -1); + fput(file); + } +} + +/* + * bitmap_file_kick - if an error occurs while manipulating the bitmap file + * then it is no longer reliable, so we stop using it and we mark the file + * as failed in the superblock + */ +static void bitmap_file_kick(struct bitmap *bitmap) +{ + char *path, *ptr = NULL; + + if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) { + bitmap_update_sb(bitmap); + + if (bitmap->storage.file) { + path = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (path) + ptr = file_path(bitmap->storage.file, + path, PAGE_SIZE); + + pr_warn("%s: kicking failed bitmap file %s from array!\n", + bmname(bitmap), IS_ERR(ptr) ? "" : ptr); + + kfree(path); + } else + pr_warn("%s: disabling internal bitmap due to errors\n", + bmname(bitmap)); + } +} + +enum bitmap_page_attr { + BITMAP_PAGE_DIRTY = 0, /* there are set bits that need to be synced */ + BITMAP_PAGE_PENDING = 1, /* there are bits that are being cleaned. + * i.e. counter is 1 or 2. */ + BITMAP_PAGE_NEEDWRITE = 2, /* there are cleared bits that need to be synced */ +}; + +static inline void set_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) +{ + set_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); +} + +static inline void clear_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) +{ + clear_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); +} + +static inline int test_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) +{ + return test_bit((pnum<<2) + attr, bitmap->storage.filemap_attr); +} + +static inline int test_and_clear_page_attr(struct bitmap *bitmap, int pnum, + enum bitmap_page_attr attr) +{ + return test_and_clear_bit((pnum<<2) + attr, + bitmap->storage.filemap_attr); +} +/* + * bitmap_file_set_bit -- called before performing a write to the md device + * to set (and eventually sync) a particular bit in the bitmap file + * + * we set the bit immediately, then we record the page number so that + * when an unplug occurs, we can flush the dirty pages out to disk + */ +static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *kaddr; + unsigned long chunk = block >> bitmap->counts.chunkshift; + struct bitmap_storage *store = &bitmap->storage; + unsigned long node_offset = 0; + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * store->file_pages; + + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) + return; + bit = file_page_offset(&bitmap->storage, chunk); + + /* set the bit */ + kaddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + set_bit(bit, kaddr); + else + set_bit_le(bit, kaddr); + kunmap_atomic(kaddr); + pr_debug("set file bit %lu page %lu\n", bit, page->index); + /* record page number so it gets flushed to disk when unplug occurs */ + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY); +} + +static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *paddr; + unsigned long chunk = block >> bitmap->counts.chunkshift; + struct bitmap_storage *store = &bitmap->storage; + unsigned long node_offset = 0; + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * store->file_pages; + + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) + return; + bit = file_page_offset(&bitmap->storage, chunk); + paddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + clear_bit(bit, paddr); + else + clear_bit_le(bit, paddr); + kunmap_atomic(paddr); + if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) { + set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING); + bitmap->allclean = 0; + } +} + +static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block) +{ + unsigned long bit; + struct page *page; + void *paddr; + unsigned long chunk = block >> bitmap->counts.chunkshift; + int set = 0; + + page = filemap_get_page(&bitmap->storage, chunk); + if (!page) + return -EINVAL; + bit = file_page_offset(&bitmap->storage, chunk); + paddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + set = test_bit(bit, paddr); + else + set = test_bit_le(bit, paddr); + kunmap_atomic(paddr); + return set; +} + + +/* this gets called when the md device is ready to unplug its underlying + * (slave) device queues -- before we let any writes go down, we need to + * sync the dirty pages of the bitmap file to disk */ +void bitmap_unplug(struct bitmap *bitmap) +{ + unsigned long i; + int dirty, need_write; + int writing = 0; + + if (!bitmap || !bitmap->storage.filemap || + test_bit(BITMAP_STALE, &bitmap->flags)) + return; + + /* look at each page to see if there are any set bits that need to be + * flushed out to disk */ + for (i = 0; i < bitmap->storage.file_pages; i++) { + if (!bitmap->storage.filemap) + return; + dirty = test_and_clear_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); + need_write = test_and_clear_page_attr(bitmap, i, + BITMAP_PAGE_NEEDWRITE); + if (dirty || need_write) { + if (!writing) { + bitmap_wait_writes(bitmap); + if (bitmap->mddev->queue) + blk_add_trace_msg(bitmap->mddev->queue, + "md bitmap_unplug"); + } + clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); + write_page(bitmap, bitmap->storage.filemap[i], 0); + writing = 1; + } + } + if (writing) + bitmap_wait_writes(bitmap); + + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + bitmap_file_kick(bitmap); +} +EXPORT_SYMBOL(bitmap_unplug); + +static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed); +/* * bitmap_init_from_disk -- called at bitmap_create time to initialize + * the in-memory bitmap from the on-disk bitmap -- also, sets up the + * memory mapping of the bitmap file + * Special cases: + * if there's no bitmap file, or if the bitmap file had been + * previously kicked from the array, we mark all the bits as + * 1's in order to cause a full resync. + * + * We ignore all bits for sectors that end earlier than 'start'. + * This is used when reading an out-of-date bitmap... + */ +static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) +{ + unsigned long i, chunks, index, oldindex, bit, node_offset = 0; + struct page *page = NULL; + unsigned long bit_cnt = 0; + struct file *file; + unsigned long offset; + int outofdate; + int ret = -ENOSPC; + void *paddr; + struct bitmap_storage *store = &bitmap->storage; + + chunks = bitmap->counts.chunks; + file = store->file; + + if (!file && !bitmap->mddev->bitmap_info.offset) { + /* No permanent bitmap - fill with '1s'. */ + store->filemap = NULL; + store->file_pages = 0; + for (i = 0; i < chunks ; i++) { + /* if the disk bit is set, set the memory bit */ + int needed = ((sector_t)(i+1) << (bitmap->counts.chunkshift) + >= start); + bitmap_set_memory_bits(bitmap, + (sector_t)i << bitmap->counts.chunkshift, + needed); + } + return 0; + } + + outofdate = test_bit(BITMAP_STALE, &bitmap->flags); + if (outofdate) + pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap)); + + if (file && i_size_read(file->f_mapping->host) < store->bytes) { + pr_warn("%s: bitmap file too short %lu < %lu\n", + bmname(bitmap), + (unsigned long) i_size_read(file->f_mapping->host), + store->bytes); + goto err; + } + + oldindex = ~0L; + offset = 0; + if (!bitmap->mddev->bitmap_info.external) + offset = sizeof(bitmap_super_t); + + if (mddev_is_clustered(bitmap->mddev)) + node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE)); + + for (i = 0; i < chunks; i++) { + int b; + index = file_page_index(&bitmap->storage, i); + bit = file_page_offset(&bitmap->storage, i); + if (index != oldindex) { /* this is a new page, read it in */ + int count; + /* unmap the old page, we're done with it */ + if (index == store->file_pages-1) + count = store->bytes - index * PAGE_SIZE; + else + count = PAGE_SIZE; + page = store->filemap[index]; + if (file) + ret = read_page(file, index, bitmap, + count, page); + else + ret = read_sb_page( + bitmap->mddev, + bitmap->mddev->bitmap_info.offset, + page, + index + node_offset, count); + + if (ret) + goto err; + + oldindex = index; + + if (outofdate) { + /* + * if bitmap is out of date, dirty the + * whole page and write it out + */ + paddr = kmap_atomic(page); + memset(paddr + offset, 0xff, + PAGE_SIZE - offset); + kunmap_atomic(paddr); + write_page(bitmap, page, 1); + + ret = -EIO; + if (test_bit(BITMAP_WRITE_ERROR, + &bitmap->flags)) + goto err; + } + } + paddr = kmap_atomic(page); + if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags)) + b = test_bit(bit, paddr); + else + b = test_bit_le(bit, paddr); + kunmap_atomic(paddr); + if (b) { + /* if the disk bit is set, set the memory bit */ + int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift + >= start); + bitmap_set_memory_bits(bitmap, + (sector_t)i << bitmap->counts.chunkshift, + needed); + bit_cnt++; + } + offset = 0; + } + + pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n", + bmname(bitmap), store->file_pages, + bit_cnt, chunks); + + return 0; + + err: + pr_warn("%s: bitmap initialisation failed: %d\n", + bmname(bitmap), ret); + return ret; +} + +void bitmap_write_all(struct bitmap *bitmap) +{ + /* We don't actually write all bitmap blocks here, + * just flag them as needing to be written + */ + int i; + + if (!bitmap || !bitmap->storage.filemap) + return; + if (bitmap->storage.file) + /* Only one copy, so nothing needed */ + return; + + for (i = 0; i < bitmap->storage.file_pages; i++) + set_page_attr(bitmap, i, + BITMAP_PAGE_NEEDWRITE); + bitmap->allclean = 0; +} + +static void bitmap_count_page(struct bitmap_counts *bitmap, + sector_t offset, int inc) +{ + sector_t chunk = offset >> bitmap->chunkshift; + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + bitmap->bp[page].count += inc; + bitmap_checkfree(bitmap, page); +} + +static void bitmap_set_pending(struct bitmap_counts *bitmap, sector_t offset) +{ + sector_t chunk = offset >> bitmap->chunkshift; + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + struct bitmap_page *bp = &bitmap->bp[page]; + + if (!bp->pending) + bp->pending = 1; +} + +static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, + sector_t offset, sector_t *blocks, + int create); + +/* + * bitmap daemon -- periodically wakes up to clean bits and flush pages + * out to disk + */ + +void bitmap_daemon_work(struct mddev *mddev) +{ + struct bitmap *bitmap; + unsigned long j; + unsigned long nextpage; + sector_t blocks; + struct bitmap_counts *counts; + + /* Use a mutex to guard daemon_work against + * bitmap_destroy. + */ + mutex_lock(&mddev->bitmap_info.mutex); + bitmap = mddev->bitmap; + if (bitmap == NULL) { + mutex_unlock(&mddev->bitmap_info.mutex); + return; + } + if (time_before(jiffies, bitmap->daemon_lastrun + + mddev->bitmap_info.daemon_sleep)) + goto done; + + bitmap->daemon_lastrun = jiffies; + if (bitmap->allclean) { + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + goto done; + } + bitmap->allclean = 1; + + if (bitmap->mddev->queue) + blk_add_trace_msg(bitmap->mddev->queue, + "md bitmap_daemon_work"); + + /* Any file-page which is PENDING now needs to be written. + * So set NEEDWRITE now, then after we make any last-minute changes + * we will write it. + */ + for (j = 0; j < bitmap->storage.file_pages; j++) + if (test_and_clear_page_attr(bitmap, j, + BITMAP_PAGE_PENDING)) + set_page_attr(bitmap, j, + BITMAP_PAGE_NEEDWRITE); + + if (bitmap->need_sync && + mddev->bitmap_info.external == 0) { + /* Arrange for superblock update as well as + * other changes */ + bitmap_super_t *sb; + bitmap->need_sync = 0; + if (bitmap->storage.filemap) { + sb = kmap_atomic(bitmap->storage.sb_page); + sb->events_cleared = + cpu_to_le64(bitmap->events_cleared); + kunmap_atomic(sb); + set_page_attr(bitmap, 0, + BITMAP_PAGE_NEEDWRITE); + } + } + /* Now look at the bitmap counters and if any are '2' or '1', + * decrement and handle accordingly. + */ + counts = &bitmap->counts; + spin_lock_irq(&counts->lock); + nextpage = 0; + for (j = 0; j < counts->chunks; j++) { + bitmap_counter_t *bmc; + sector_t block = (sector_t)j << counts->chunkshift; + + if (j == nextpage) { + nextpage += PAGE_COUNTER_RATIO; + if (!counts->bp[j >> PAGE_COUNTER_SHIFT].pending) { + j |= PAGE_COUNTER_MASK; + continue; + } + counts->bp[j >> PAGE_COUNTER_SHIFT].pending = 0; + } + bmc = bitmap_get_counter(counts, + block, + &blocks, 0); + + if (!bmc) { + j |= PAGE_COUNTER_MASK; + continue; + } + if (*bmc == 1 && !bitmap->need_sync) { + /* We can clear the bit */ + *bmc = 0; + bitmap_count_page(counts, block, -1); + bitmap_file_clear_bit(bitmap, block); + } else if (*bmc && *bmc <= 2) { + *bmc = 1; + bitmap_set_pending(counts, block); + bitmap->allclean = 0; + } + } + spin_unlock_irq(&counts->lock); + + bitmap_wait_writes(bitmap); + /* Now start writeout on any page in NEEDWRITE that isn't DIRTY. + * DIRTY pages need to be written by bitmap_unplug so it can wait + * for them. + * If we find any DIRTY page we stop there and let bitmap_unplug + * handle all the rest. This is important in the case where + * the first blocking holds the superblock and it has been updated. + * We mustn't write any other blocks before the superblock. + */ + for (j = 0; + j < bitmap->storage.file_pages + && !test_bit(BITMAP_STALE, &bitmap->flags); + j++) { + if (test_page_attr(bitmap, j, + BITMAP_PAGE_DIRTY)) + /* bitmap_unplug will handle the rest */ + break; + if (test_and_clear_page_attr(bitmap, j, + BITMAP_PAGE_NEEDWRITE)) { + write_page(bitmap, bitmap->storage.filemap[j], 0); + } + } + + done: + if (bitmap->allclean == 0) + mddev->thread->timeout = + mddev->bitmap_info.daemon_sleep; + mutex_unlock(&mddev->bitmap_info.mutex); +} + +static bitmap_counter_t *bitmap_get_counter(struct bitmap_counts *bitmap, + sector_t offset, sector_t *blocks, + int create) +__releases(bitmap->lock) +__acquires(bitmap->lock) +{ + /* If 'create', we might release the lock and reclaim it. + * The lock must have been taken with interrupts enabled. + * If !create, we don't release the lock. + */ + sector_t chunk = offset >> bitmap->chunkshift; + unsigned long page = chunk >> PAGE_COUNTER_SHIFT; + unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; + sector_t csize; + int err; + + err = bitmap_checkpage(bitmap, page, create, 0); + + if (bitmap->bp[page].hijacked || + bitmap->bp[page].map == NULL) + csize = ((sector_t)1) << (bitmap->chunkshift + + PAGE_COUNTER_SHIFT - 1); + else + csize = ((sector_t)1) << bitmap->chunkshift; + *blocks = csize - (offset & (csize - 1)); + + if (err < 0) + return NULL; + + /* now locked ... */ + + if (bitmap->bp[page].hijacked) { /* hijacked pointer */ + /* should we use the first or second counter field + * of the hijacked pointer? */ + int hi = (pageoff > PAGE_COUNTER_MASK); + return &((bitmap_counter_t *) + &bitmap->bp[page].map)[hi]; + } else /* page is allocated */ + return (bitmap_counter_t *) + &(bitmap->bp[page].map[pageoff]); +} + +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) +{ + if (!bitmap) + return 0; + + if (behind) { + int bw; + atomic_inc(&bitmap->behind_writes); + bw = atomic_read(&bitmap->behind_writes); + if (bw > bitmap->behind_writes_used) + bitmap->behind_writes_used = bw; + + pr_debug("inc write-behind count %d/%lu\n", + bw, bitmap->mddev->bitmap_info.max_write_behind); + } + + while (sectors) { + sector_t blocks; + bitmap_counter_t *bmc; + + spin_lock_irq(&bitmap->counts.lock); + bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 1); + if (!bmc) { + spin_unlock_irq(&bitmap->counts.lock); + return 0; + } + + if (unlikely(COUNTER(*bmc) == COUNTER_MAX)) { + DEFINE_WAIT(__wait); + /* note that it is safe to do the prepare_to_wait + * after the test as long as we do it before dropping + * the spinlock. + */ + prepare_to_wait(&bitmap->overflow_wait, &__wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_irq(&bitmap->counts.lock); + schedule(); + finish_wait(&bitmap->overflow_wait, &__wait); + continue; + } + + switch (*bmc) { + case 0: + bitmap_file_set_bit(bitmap, offset); + bitmap_count_page(&bitmap->counts, offset, 1); + /* fall through */ + case 1: + *bmc = 2; + } + + (*bmc)++; + + spin_unlock_irq(&bitmap->counts.lock); + + offset += blocks; + if (sectors > blocks) + sectors -= blocks; + else + sectors = 0; + } + return 0; +} +EXPORT_SYMBOL(bitmap_startwrite); + +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, + int success, int behind) +{ + if (!bitmap) + return; + if (behind) { + if (atomic_dec_and_test(&bitmap->behind_writes)) + wake_up(&bitmap->behind_wait); + pr_debug("dec write-behind count %d/%lu\n", + atomic_read(&bitmap->behind_writes), + bitmap->mddev->bitmap_info.max_write_behind); + } + + while (sectors) { + sector_t blocks; + unsigned long flags; + bitmap_counter_t *bmc; + + spin_lock_irqsave(&bitmap->counts.lock, flags); + bmc = bitmap_get_counter(&bitmap->counts, offset, &blocks, 0); + if (!bmc) { + spin_unlock_irqrestore(&bitmap->counts.lock, flags); + return; + } + + if (success && !bitmap->mddev->degraded && + bitmap->events_cleared < bitmap->mddev->events) { + bitmap->events_cleared = bitmap->mddev->events; + bitmap->need_sync = 1; + sysfs_notify_dirent_safe(bitmap->sysfs_can_clear); + } + + if (!success && !NEEDED(*bmc)) + *bmc |= NEEDED_MASK; + + if (COUNTER(*bmc) == COUNTER_MAX) + wake_up(&bitmap->overflow_wait); + + (*bmc)--; + if (*bmc <= 2) { + bitmap_set_pending(&bitmap->counts, offset); + bitmap->allclean = 0; + } + spin_unlock_irqrestore(&bitmap->counts.lock, flags); + offset += blocks; + if (sectors > blocks) + sectors -= blocks; + else + sectors = 0; + } +} +EXPORT_SYMBOL(bitmap_endwrite); + +static int __bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, + int degraded) +{ + bitmap_counter_t *bmc; + int rv; + if (bitmap == NULL) {/* FIXME or bitmap set as 'failed' */ + *blocks = 1024; + return 1; /* always resync if no bitmap */ + } + spin_lock_irq(&bitmap->counts.lock); + bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); + rv = 0; + if (bmc) { + /* locked */ + if (RESYNC(*bmc)) + rv = 1; + else if (NEEDED(*bmc)) { + rv = 1; + if (!degraded) { /* don't set/clear bits if degraded */ + *bmc |= RESYNC_MASK; + *bmc &= ~NEEDED_MASK; + } + } + } + spin_unlock_irq(&bitmap->counts.lock); + return rv; +} + +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, + int degraded) +{ + /* bitmap_start_sync must always report on multiples of whole + * pages, otherwise resync (which is very PAGE_SIZE based) will + * get confused. + * So call __bitmap_start_sync repeatedly (if needed) until + * At least PAGE_SIZE>>9 blocks are covered. + * Return the 'or' of the result. + */ + int rv = 0; + sector_t blocks1; + + *blocks = 0; + while (*blocks < (PAGE_SIZE>>9)) { + rv |= __bitmap_start_sync(bitmap, offset, + &blocks1, degraded); + offset += blocks1; + *blocks += blocks1; + } + return rv; +} +EXPORT_SYMBOL(bitmap_start_sync); + +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted) +{ + bitmap_counter_t *bmc; + unsigned long flags; + + if (bitmap == NULL) { + *blocks = 1024; + return; + } + spin_lock_irqsave(&bitmap->counts.lock, flags); + bmc = bitmap_get_counter(&bitmap->counts, offset, blocks, 0); + if (bmc == NULL) + goto unlock; + /* locked */ + if (RESYNC(*bmc)) { + *bmc &= ~RESYNC_MASK; + + if (!NEEDED(*bmc) && aborted) + *bmc |= NEEDED_MASK; + else { + if (*bmc <= 2) { + bitmap_set_pending(&bitmap->counts, offset); + bitmap->allclean = 0; + } + } + } + unlock: + spin_unlock_irqrestore(&bitmap->counts.lock, flags); +} +EXPORT_SYMBOL(bitmap_end_sync); + +void bitmap_close_sync(struct bitmap *bitmap) +{ + /* Sync has finished, and any bitmap chunks that weren't synced + * properly have been aborted. It remains to us to clear the + * RESYNC bit wherever it is still on + */ + sector_t sector = 0; + sector_t blocks; + if (!bitmap) + return; + while (sector < bitmap->mddev->resync_max_sectors) { + bitmap_end_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } +} +EXPORT_SYMBOL(bitmap_close_sync); + +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) +{ + sector_t s = 0; + sector_t blocks; + + if (!bitmap) + return; + if (sector == 0) { + bitmap->last_end_sync = jiffies; + return; + } + if (!force && time_before(jiffies, (bitmap->last_end_sync + + bitmap->mddev->bitmap_info.daemon_sleep))) + return; + wait_event(bitmap->mddev->recovery_wait, + atomic_read(&bitmap->mddev->recovery_active) == 0); + + bitmap->mddev->curr_resync_completed = sector; + set_bit(MD_SB_CHANGE_CLEAN, &bitmap->mddev->sb_flags); + sector &= ~((1ULL << bitmap->counts.chunkshift) - 1); + s = 0; + while (s < sector && s < bitmap->mddev->resync_max_sectors) { + bitmap_end_sync(bitmap, s, &blocks, 0); + s += blocks; + } + bitmap->last_end_sync = jiffies; + sysfs_notify(&bitmap->mddev->kobj, NULL, "sync_completed"); +} +EXPORT_SYMBOL(bitmap_cond_end_sync); + +void bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi) +{ + struct bitmap *bitmap = mddev->bitmap; + sector_t sector, blocks = 0; + + for (sector = old_lo; sector < new_lo; ) { + bitmap_end_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + WARN((blocks > new_lo) && old_lo, "alignment is not correct for lo\n"); + + for (sector = old_hi; sector < new_hi; ) { + bitmap_start_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + WARN((blocks > new_hi) && old_hi, "alignment is not correct for hi\n"); +} +EXPORT_SYMBOL(bitmap_sync_with_cluster); + +static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed) +{ + /* For each chunk covered by any of these sectors, set the + * counter to 2 and possibly set resync_needed. They should all + * be 0 at this point + */ + + sector_t secs; + bitmap_counter_t *bmc; + spin_lock_irq(&bitmap->counts.lock); + bmc = bitmap_get_counter(&bitmap->counts, offset, &secs, 1); + if (!bmc) { + spin_unlock_irq(&bitmap->counts.lock); + return; + } + if (!*bmc) { + *bmc = 2; + bitmap_count_page(&bitmap->counts, offset, 1); + bitmap_set_pending(&bitmap->counts, offset); + bitmap->allclean = 0; + } + if (needed) + *bmc |= NEEDED_MASK; + spin_unlock_irq(&bitmap->counts.lock); +} + +/* dirty the memory and file bits for bitmap chunks "s" to "e" */ +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) +{ + unsigned long chunk; + + for (chunk = s; chunk <= e; chunk++) { + sector_t sec = (sector_t)chunk << bitmap->counts.chunkshift; + bitmap_set_memory_bits(bitmap, sec, 1); + bitmap_file_set_bit(bitmap, sec); + if (sec < bitmap->mddev->recovery_cp) + /* We are asserting that the array is dirty, + * so move the recovery_cp address back so + * that it is obvious that it is dirty + */ + bitmap->mddev->recovery_cp = sec; + } +} + +/* + * flush out any pending updates + */ +void bitmap_flush(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + long sleep; + + if (!bitmap) /* there was no bitmap */ + return; + + /* run the daemon_work three time to ensure everything is flushed + * that can be + */ + sleep = mddev->bitmap_info.daemon_sleep * 2; + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); + bitmap->daemon_lastrun -= sleep; + bitmap_daemon_work(mddev); + bitmap_update_sb(bitmap); +} + +/* + * free memory that was allocated + */ +void bitmap_free(struct bitmap *bitmap) +{ + unsigned long k, pages; + struct bitmap_page *bp; + + if (!bitmap) /* there was no bitmap */ + return; + + if (bitmap->sysfs_can_clear) + sysfs_put(bitmap->sysfs_can_clear); + + if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info && + bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev)) + md_cluster_stop(bitmap->mddev); + + /* Shouldn't be needed - but just in case.... */ + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes) == 0); + + /* release the bitmap file */ + bitmap_file_unmap(&bitmap->storage); + + bp = bitmap->counts.bp; + pages = bitmap->counts.pages; + + /* free all allocated memory */ + + if (bp) /* deallocate the page memory */ + for (k = 0; k < pages; k++) + if (bp[k].map && !bp[k].hijacked) + kfree(bp[k].map); + kfree(bp); + kfree(bitmap); +} +EXPORT_SYMBOL(bitmap_free); + +void bitmap_wait_behind_writes(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + /* wait for behind writes to complete */ + if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { + pr_debug("md:%s: behind writes in progress - waiting to stop.\n", + mdname(mddev)); + /* need to kick something here to make sure I/O goes? */ + wait_event(bitmap->behind_wait, + atomic_read(&bitmap->behind_writes) == 0); + } +} + +void bitmap_destroy(struct mddev *mddev) +{ + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) /* there was no bitmap */ + return; + + bitmap_wait_behind_writes(mddev); + + mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); + mddev->bitmap = NULL; /* disconnect from the md device */ + spin_unlock(&mddev->lock); + mutex_unlock(&mddev->bitmap_info.mutex); + if (mddev->thread) + mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; + + bitmap_free(bitmap); +} + +/* + * initialize the bitmap structure + * if this returns an error, bitmap_destroy must be called to do clean up + * once mddev->bitmap is set + */ +struct bitmap *bitmap_create(struct mddev *mddev, int slot) +{ + struct bitmap *bitmap; + sector_t blocks = mddev->resync_max_sectors; + struct file *file = mddev->bitmap_info.file; + int err; + struct kernfs_node *bm = NULL; + + BUILD_BUG_ON(sizeof(bitmap_super_t) != 256); + + BUG_ON(file && mddev->bitmap_info.offset); + + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); + if (!bitmap) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&bitmap->counts.lock); + atomic_set(&bitmap->pending_writes, 0); + init_waitqueue_head(&bitmap->write_wait); + init_waitqueue_head(&bitmap->overflow_wait); + init_waitqueue_head(&bitmap->behind_wait); + + bitmap->mddev = mddev; + bitmap->cluster_slot = slot; + + if (mddev->kobj.sd) + bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap"); + if (bm) { + bitmap->sysfs_can_clear = sysfs_get_dirent(bm, "can_clear"); + sysfs_put(bm); + } else + bitmap->sysfs_can_clear = NULL; + + bitmap->storage.file = file; + if (file) { + get_file(file); + /* As future accesses to this file will use bmap, + * and bypass the page cache, we must sync the file + * first. + */ + vfs_fsync(file, 1); + } + /* read superblock from bitmap file (this sets mddev->bitmap_info.chunksize) */ + if (!mddev->bitmap_info.external) { + /* + * If 'MD_ARRAY_FIRST_USE' is set, then device-mapper is + * instructing us to create a new on-disk bitmap instance. + */ + if (test_and_clear_bit(MD_ARRAY_FIRST_USE, &mddev->flags)) + err = bitmap_new_disk_sb(bitmap); + else + err = bitmap_read_sb(bitmap); + } else { + err = 0; + if (mddev->bitmap_info.chunksize == 0 || + mddev->bitmap_info.daemon_sleep == 0) + /* chunksize and time_base need to be + * set first. */ + err = -EINVAL; + } + if (err) + goto error; + + bitmap->daemon_lastrun = jiffies; + err = bitmap_resize(bitmap, blocks, mddev->bitmap_info.chunksize, 1); + if (err) + goto error; + + pr_debug("created bitmap (%lu pages) for device %s\n", + bitmap->counts.pages, bmname(bitmap)); + + err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0; + if (err) + goto error; + + return bitmap; + error: + bitmap_free(bitmap); + return ERR_PTR(err); +} + +int bitmap_load(struct mddev *mddev) +{ + int err = 0; + sector_t start = 0; + sector_t sector = 0; + struct bitmap *bitmap = mddev->bitmap; + + if (!bitmap) + goto out; + + if (mddev_is_clustered(mddev)) + md_cluster_ops->load_bitmaps(mddev, mddev->bitmap_info.nodes); + + /* Clear out old bitmap info first: Either there is none, or we + * are resuming after someone else has possibly changed things, + * so we should forget old cached info. + * All chunks should be clean, but some might need_sync. + */ + while (sector < mddev->resync_max_sectors) { + sector_t blocks; + bitmap_start_sync(bitmap, sector, &blocks, 0); + sector += blocks; + } + bitmap_close_sync(bitmap); + + if (mddev->degraded == 0 + || bitmap->events_cleared == mddev->events) + /* no need to keep dirty bits to optimise a + * re-add of a missing device */ + start = mddev->recovery_cp; + + mutex_lock(&mddev->bitmap_info.mutex); + err = bitmap_init_from_disk(bitmap, start); + mutex_unlock(&mddev->bitmap_info.mutex); + + if (err) + goto out; + clear_bit(BITMAP_STALE, &bitmap->flags); + + /* Kick recovery in case any bits were set */ + set_bit(MD_RECOVERY_NEEDED, &bitmap->mddev->recovery); + + mddev->thread->timeout = mddev->bitmap_info.daemon_sleep; + md_wakeup_thread(mddev->thread); + + bitmap_update_sb(bitmap); + + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + err = -EIO; +out: + return err; +} +EXPORT_SYMBOL_GPL(bitmap_load); + +struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot) +{ + int rv = 0; + struct bitmap *bitmap; + + bitmap = bitmap_create(mddev, slot); + if (IS_ERR(bitmap)) { + rv = PTR_ERR(bitmap); + return ERR_PTR(rv); + } + + rv = bitmap_init_from_disk(bitmap, 0); + if (rv) { + bitmap_free(bitmap); + return ERR_PTR(rv); + } + + return bitmap; +} +EXPORT_SYMBOL(get_bitmap_from_slot); + +/* Loads the bitmap associated with slot and copies the resync information + * to our bitmap + */ +int bitmap_copy_from_slot(struct mddev *mddev, int slot, + sector_t *low, sector_t *high, bool clear_bits) +{ + int rv = 0, i, j; + sector_t block, lo = 0, hi = 0; + struct bitmap_counts *counts; + struct bitmap *bitmap; + + bitmap = get_bitmap_from_slot(mddev, slot); + if (IS_ERR(bitmap)) { + pr_err("%s can't get bitmap from slot %d\n", __func__, slot); + return -1; + } + + counts = &bitmap->counts; + for (j = 0; j < counts->chunks; j++) { + block = (sector_t)j << counts->chunkshift; + if (bitmap_file_test_bit(bitmap, block)) { + if (!lo) + lo = block; + hi = block; + bitmap_file_clear_bit(bitmap, block); + bitmap_set_memory_bits(mddev->bitmap, block, 1); + bitmap_file_set_bit(mddev->bitmap, block); + } + } + + if (clear_bits) { + bitmap_update_sb(bitmap); + /* BITMAP_PAGE_PENDING is set, but bitmap_unplug needs + * BITMAP_PAGE_DIRTY or _NEEDWRITE to write ... */ + for (i = 0; i < bitmap->storage.file_pages; i++) + if (test_page_attr(bitmap, i, BITMAP_PAGE_PENDING)) + set_page_attr(bitmap, i, BITMAP_PAGE_NEEDWRITE); + bitmap_unplug(bitmap); + } + bitmap_unplug(mddev->bitmap); + *low = lo; + *high = hi; + + return rv; +} +EXPORT_SYMBOL_GPL(bitmap_copy_from_slot); + + +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +{ + unsigned long chunk_kb; + struct bitmap_counts *counts; + + if (!bitmap) + return; + + counts = &bitmap->counts; + + chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " + "%lu%s chunk", + counts->pages - counts->missing_pages, + counts->pages, + (counts->pages - counts->missing_pages) + << (PAGE_SHIFT - 10), + chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, + chunk_kb ? "KB" : "B"); + if (bitmap->storage.file) { + seq_printf(seq, ", file: "); + seq_file_path(seq, bitmap->storage.file, " \t\n"); + } + + seq_printf(seq, "\n"); +} + +int bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, int init) +{ + /* If chunk_size is 0, choose an appropriate chunk size. + * Then possibly allocate new storage space. + * Then quiesce, copy bits, replace bitmap, and re-start + * + * This function is called both to set up the initial bitmap + * and to resize the bitmap while the array is active. + * If this happens as a result of the array being resized, + * chunksize will be zero, and we need to choose a suitable + * chunksize, otherwise we use what we are given. + */ + struct bitmap_storage store; + struct bitmap_counts old_counts; + unsigned long chunks; + sector_t block; + sector_t old_blocks, new_blocks; + int chunkshift; + int ret = 0; + long pages; + struct bitmap_page *new_bp; + + if (bitmap->storage.file && !init) { + pr_info("md: cannot resize file-based bitmap\n"); + return -EINVAL; + } + + if (chunksize == 0) { + /* If there is enough space, leave the chunk size unchanged, + * else increase by factor of two until there is enough space. + */ + long bytes; + long space = bitmap->mddev->bitmap_info.space; + + if (space == 0) { + /* We don't know how much space there is, so limit + * to current size - in sectors. + */ + bytes = DIV_ROUND_UP(bitmap->counts.chunks, 8); + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); + space = DIV_ROUND_UP(bytes, 512); + bitmap->mddev->bitmap_info.space = space; + } + chunkshift = bitmap->counts.chunkshift; + chunkshift--; + do { + /* 'chunkshift' is shift from block size to chunk size */ + chunkshift++; + chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); + bytes = DIV_ROUND_UP(chunks, 8); + if (!bitmap->mddev->bitmap_info.external) + bytes += sizeof(bitmap_super_t); + } while (bytes > (space << 9)); + } else + chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT; + + chunks = DIV_ROUND_UP_SECTOR_T(blocks, 1 << chunkshift); + memset(&store, 0, sizeof(store)); + if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file) + ret = bitmap_storage_alloc(&store, chunks, + !bitmap->mddev->bitmap_info.external, + mddev_is_clustered(bitmap->mddev) + ? bitmap->cluster_slot : 0); + if (ret) { + bitmap_file_unmap(&store); + goto err; + } + + pages = DIV_ROUND_UP(chunks, PAGE_COUNTER_RATIO); + + new_bp = kzalloc(pages * sizeof(*new_bp), GFP_KERNEL); + ret = -ENOMEM; + if (!new_bp) { + bitmap_file_unmap(&store); + goto err; + } + + if (!init) + bitmap->mddev->pers->quiesce(bitmap->mddev, 1); + + store.file = bitmap->storage.file; + bitmap->storage.file = NULL; + + if (store.sb_page && bitmap->storage.sb_page) + memcpy(page_address(store.sb_page), + page_address(bitmap->storage.sb_page), + sizeof(bitmap_super_t)); + bitmap_file_unmap(&bitmap->storage); + bitmap->storage = store; + + old_counts = bitmap->counts; + bitmap->counts.bp = new_bp; + bitmap->counts.pages = pages; + bitmap->counts.missing_pages = pages; + bitmap->counts.chunkshift = chunkshift; + bitmap->counts.chunks = chunks; + bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift + + BITMAP_BLOCK_SHIFT); + + blocks = min(old_counts.chunks << old_counts.chunkshift, + chunks << chunkshift); + + spin_lock_irq(&bitmap->counts.lock); + /* For cluster raid, need to pre-allocate bitmap */ + if (mddev_is_clustered(bitmap->mddev)) { + unsigned long page; + for (page = 0; page < pages; page++) { + ret = bitmap_checkpage(&bitmap->counts, page, 1, 1); + if (ret) { + unsigned long k; + + /* deallocate the page memory */ + for (k = 0; k < page; k++) { + kfree(new_bp[k].map); + } + + /* restore some fields from old_counts */ + bitmap->counts.bp = old_counts.bp; + bitmap->counts.pages = old_counts.pages; + bitmap->counts.missing_pages = old_counts.pages; + bitmap->counts.chunkshift = old_counts.chunkshift; + bitmap->counts.chunks = old_counts.chunks; + bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift + + BITMAP_BLOCK_SHIFT); + blocks = old_counts.chunks << old_counts.chunkshift; + pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n"); + break; + } else + bitmap->counts.bp[page].count += 1; + } + } + + for (block = 0; block < blocks; ) { + bitmap_counter_t *bmc_old, *bmc_new; + int set; + + bmc_old = bitmap_get_counter(&old_counts, block, + &old_blocks, 0); + set = bmc_old && NEEDED(*bmc_old); + + if (set) { + bmc_new = bitmap_get_counter(&bitmap->counts, block, + &new_blocks, 1); + if (*bmc_new == 0) { + /* need to set on-disk bits too. */ + sector_t end = block + new_blocks; + sector_t start = block >> chunkshift; + start <<= chunkshift; + while (start < end) { + bitmap_file_set_bit(bitmap, block); + start += 1 << chunkshift; + } + *bmc_new = 2; + bitmap_count_page(&bitmap->counts, + block, 1); + bitmap_set_pending(&bitmap->counts, + block); + } + *bmc_new |= NEEDED_MASK; + if (new_blocks < old_blocks) + old_blocks = new_blocks; + } + block += old_blocks; + } + + if (!init) { + int i; + while (block < (chunks << chunkshift)) { + bitmap_counter_t *bmc; + bmc = bitmap_get_counter(&bitmap->counts, block, + &new_blocks, 1); + if (bmc) { + /* new space. It needs to be resynced, so + * we set NEEDED_MASK. + */ + if (*bmc == 0) { + *bmc = NEEDED_MASK | 2; + bitmap_count_page(&bitmap->counts, + block, 1); + bitmap_set_pending(&bitmap->counts, + block); + } + } + block += new_blocks; + } + for (i = 0; i < bitmap->storage.file_pages; i++) + set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY); + } + spin_unlock_irq(&bitmap->counts.lock); + + if (!init) { + bitmap_unplug(bitmap); + bitmap->mddev->pers->quiesce(bitmap->mddev, 0); + } + ret = 0; +err: + return ret; +} +EXPORT_SYMBOL_GPL(bitmap_resize); + +static ssize_t +location_show(struct mddev *mddev, char *page) +{ + ssize_t len; + if (mddev->bitmap_info.file) + len = sprintf(page, "file"); + else if (mddev->bitmap_info.offset) + len = sprintf(page, "%+lld", (long long)mddev->bitmap_info.offset); + else + len = sprintf(page, "none"); + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +location_store(struct mddev *mddev, const char *buf, size_t len) +{ + int rv; + + rv = mddev_lock(mddev); + if (rv) + return rv; + if (mddev->pers) { + if (!mddev->pers->quiesce) { + rv = -EBUSY; + goto out; + } + if (mddev->recovery || mddev->sync_thread) { + rv = -EBUSY; + goto out; + } + } + + if (mddev->bitmap || mddev->bitmap_info.file || + mddev->bitmap_info.offset) { + /* bitmap already configured. Only option is to clear it */ + if (strncmp(buf, "none", 4) != 0) { + rv = -EBUSY; + goto out; + } + if (mddev->pers) { + mddev->pers->quiesce(mddev, 1); + bitmap_destroy(mddev); + mddev->pers->quiesce(mddev, 0); + } + mddev->bitmap_info.offset = 0; + if (mddev->bitmap_info.file) { + struct file *f = mddev->bitmap_info.file; + mddev->bitmap_info.file = NULL; + fput(f); + } + } else { + /* No bitmap, OK to set a location */ + long long offset; + if (strncmp(buf, "none", 4) == 0) + /* nothing to be done */; + else if (strncmp(buf, "file:", 5) == 0) { + /* Not supported yet */ + rv = -EINVAL; + goto out; + } else { + if (buf[0] == '+') + rv = kstrtoll(buf+1, 10, &offset); + else + rv = kstrtoll(buf, 10, &offset); + if (rv) + goto out; + if (offset == 0) { + rv = -EINVAL; + goto out; + } + if (mddev->bitmap_info.external == 0 && + mddev->major_version == 0 && + offset != mddev->bitmap_info.default_offset) { + rv = -EINVAL; + goto out; + } + mddev->bitmap_info.offset = offset; + if (mddev->pers) { + struct bitmap *bitmap; + mddev->pers->quiesce(mddev, 1); + bitmap = bitmap_create(mddev, -1); + if (IS_ERR(bitmap)) + rv = PTR_ERR(bitmap); + else { + mddev->bitmap = bitmap; + rv = bitmap_load(mddev); + if (rv) + mddev->bitmap_info.offset = 0; + } + mddev->pers->quiesce(mddev, 0); + if (rv) { + bitmap_destroy(mddev); + goto out; + } + } + } + } + if (!mddev->external) { + /* Ensure new bitmap info is stored in + * metadata promptly. + */ + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + md_wakeup_thread(mddev->thread); + } + rv = 0; +out: + mddev_unlock(mddev); + if (rv) + return rv; + return len; +} + +static struct md_sysfs_entry bitmap_location = +__ATTR(location, S_IRUGO|S_IWUSR, location_show, location_store); + +/* 'bitmap/space' is the space available at 'location' for the + * bitmap. This allows the kernel to know when it is safe to + * resize the bitmap to match a resized array. + */ +static ssize_t +space_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.space); +} + +static ssize_t +space_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long sectors; + int rv; + + rv = kstrtoul(buf, 10, §ors); + if (rv) + return rv; + + if (sectors == 0) + return -EINVAL; + + if (mddev->bitmap && + sectors < (mddev->bitmap->storage.bytes + 511) >> 9) + return -EFBIG; /* Bitmap is too big for this small space */ + + /* could make sure it isn't too big, but that isn't really + * needed - user-space should be careful. + */ + mddev->bitmap_info.space = sectors; + return len; +} + +static struct md_sysfs_entry bitmap_space = +__ATTR(space, S_IRUGO|S_IWUSR, space_show, space_store); + +static ssize_t +timeout_show(struct mddev *mddev, char *page) +{ + ssize_t len; + unsigned long secs = mddev->bitmap_info.daemon_sleep / HZ; + unsigned long jifs = mddev->bitmap_info.daemon_sleep % HZ; + + len = sprintf(page, "%lu", secs); + if (jifs) + len += sprintf(page+len, ".%03u", jiffies_to_msecs(jifs)); + len += sprintf(page+len, "\n"); + return len; +} + +static ssize_t +timeout_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* timeout can be set at any time */ + unsigned long timeout; + int rv = strict_strtoul_scaled(buf, &timeout, 4); + if (rv) + return rv; + + /* just to make sure we don't overflow... */ + if (timeout >= LONG_MAX / HZ) + return -EINVAL; + + timeout = timeout * HZ / 10000; + + if (timeout >= MAX_SCHEDULE_TIMEOUT) + timeout = MAX_SCHEDULE_TIMEOUT-1; + if (timeout < 1) + timeout = 1; + mddev->bitmap_info.daemon_sleep = timeout; + if (mddev->thread) { + /* if thread->timeout is MAX_SCHEDULE_TIMEOUT, then + * the bitmap is all clean and we don't need to + * adjust the timeout right now + */ + if (mddev->thread->timeout < MAX_SCHEDULE_TIMEOUT) { + mddev->thread->timeout = timeout; + md_wakeup_thread(mddev->thread); + } + } + return len; +} + +static struct md_sysfs_entry bitmap_timeout = +__ATTR(time_base, S_IRUGO|S_IWUSR, timeout_show, timeout_store); + +static ssize_t +backlog_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.max_write_behind); +} + +static ssize_t +backlog_store(struct mddev *mddev, const char *buf, size_t len) +{ + unsigned long backlog; + int rv = kstrtoul(buf, 10, &backlog); + if (rv) + return rv; + if (backlog > COUNTER_MAX) + return -EINVAL; + mddev->bitmap_info.max_write_behind = backlog; + return len; +} + +static struct md_sysfs_entry bitmap_backlog = +__ATTR(backlog, S_IRUGO|S_IWUSR, backlog_show, backlog_store); + +static ssize_t +chunksize_show(struct mddev *mddev, char *page) +{ + return sprintf(page, "%lu\n", mddev->bitmap_info.chunksize); +} + +static ssize_t +chunksize_store(struct mddev *mddev, const char *buf, size_t len) +{ + /* Can only be changed when no bitmap is active */ + int rv; + unsigned long csize; + if (mddev->bitmap) + return -EBUSY; + rv = kstrtoul(buf, 10, &csize); + if (rv) + return rv; + if (csize < 512 || + !is_power_of_2(csize)) + return -EINVAL; + mddev->bitmap_info.chunksize = csize; + return len; +} + +static struct md_sysfs_entry bitmap_chunksize = +__ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store); + +static ssize_t metadata_show(struct mddev *mddev, char *page) +{ + if (mddev_is_clustered(mddev)) + return sprintf(page, "clustered\n"); + return sprintf(page, "%s\n", (mddev->bitmap_info.external + ? "external" : "internal")); +} + +static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap || + mddev->bitmap_info.file || + mddev->bitmap_info.offset) + return -EBUSY; + if (strncmp(buf, "external", 8) == 0) + mddev->bitmap_info.external = 1; + else if ((strncmp(buf, "internal", 8) == 0) || + (strncmp(buf, "clustered", 9) == 0)) + mddev->bitmap_info.external = 0; + else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_metadata = +__ATTR(metadata, S_IRUGO|S_IWUSR, metadata_show, metadata_store); + +static ssize_t can_clear_show(struct mddev *mddev, char *page) +{ + int len; + spin_lock(&mddev->lock); + if (mddev->bitmap) + len = sprintf(page, "%s\n", (mddev->bitmap->need_sync ? + "false" : "true")); + else + len = sprintf(page, "\n"); + spin_unlock(&mddev->lock); + return len; +} + +static ssize_t can_clear_store(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap == NULL) + return -ENOENT; + if (strncmp(buf, "false", 5) == 0) + mddev->bitmap->need_sync = 1; + else if (strncmp(buf, "true", 4) == 0) { + if (mddev->degraded) + return -EBUSY; + mddev->bitmap->need_sync = 0; + } else + return -EINVAL; + return len; +} + +static struct md_sysfs_entry bitmap_can_clear = +__ATTR(can_clear, S_IRUGO|S_IWUSR, can_clear_show, can_clear_store); + +static ssize_t +behind_writes_used_show(struct mddev *mddev, char *page) +{ + ssize_t ret; + spin_lock(&mddev->lock); + if (mddev->bitmap == NULL) + ret = sprintf(page, "0\n"); + else + ret = sprintf(page, "%lu\n", + mddev->bitmap->behind_writes_used); + spin_unlock(&mddev->lock); + return ret; +} + +static ssize_t +behind_writes_used_reset(struct mddev *mddev, const char *buf, size_t len) +{ + if (mddev->bitmap) + mddev->bitmap->behind_writes_used = 0; + return len; +} + +static struct md_sysfs_entry max_backlog_used = +__ATTR(max_backlog_used, S_IRUGO | S_IWUSR, + behind_writes_used_show, behind_writes_used_reset); + +static struct attribute *md_bitmap_attrs[] = { + &bitmap_location.attr, + &bitmap_space.attr, + &bitmap_timeout.attr, + &bitmap_backlog.attr, + &bitmap_chunksize.attr, + &bitmap_metadata.attr, + &bitmap_can_clear.attr, + &max_backlog_used.attr, + NULL +}; +struct attribute_group md_bitmap_group = { + .name = "bitmap", + .attrs = md_bitmap_attrs, +}; + diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h new file mode 100644 index 000000000000..d15721ac07a6 --- /dev/null +++ b/drivers/md/md-bitmap.h @@ -0,0 +1,277 @@ +/* + * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + * Version 5 is currently set only for clustered devices + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_CLUSTERED 5 +#define BITMAP_MAJOR_HOSTENDIAN 3 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared as well, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the fields is + * 0, we first set the disk bit and set the counter to 1. + * + * If the counter is 0, the on-disk bit is clear and the stripe is clean + * Anything that dirties the stripe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep find the counter at 2, it is decremented to 1. + * If the sweep find the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SHIFT 9 + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_STALE = 1, /* the bitmap file is out of date or had -EIO */ + BITMAP_WRITE_ERROR = 2, /* A write error has occurred */ + BITMAP_HOSTENDIAN =15, +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __le32 magic; /* 0 BITMAP_MAGIC */ + __le32 version; /* 4 the bitmap major for now, could change... */ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __le64 events; /* 24 event counter for the bitmap (1)*/ + __le64 events_cleared;/*32 event counter when last bit cleared (2) */ + __le64 sync_size; /* 40 the size of the md device's sync range(3) */ + __le32 state; /* 48 bitmap state information */ + __le32 chunksize; /* 52 the bitmap chunk size in bytes */ + __le32 daemon_sleep; /* 56 seconds between disk flushes */ + __le32 write_behind; /* 60 number of outstanding write-behind writes */ + __le32 sectors_reserved; /* 64 number of 512-byte sectors that are + * reserved for the bitmap. */ + __le32 nodes; /* 68 the maximum number of nodes in cluster. */ + __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ + __u8 pad[256 - 136]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. + */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be alloced), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked:1; + /* + * If any counter in this page is '1' or '2' - and so could be + * cleared then that page is marked as 'pending' + */ + unsigned int pending:1; + /* + * count of dirty bits on the page + */ + unsigned int count:30; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + + struct bitmap_counts { + spinlock_t lock; + struct bitmap_page *bp; + unsigned long pages; /* total number of pages + * in the bitmap */ + unsigned long missing_pages; /* number of pages + * not yet allocated */ + unsigned long chunkshift; /* chunksize = 2^chunkshift + * (for bitops) */ + unsigned long chunks; /* Total number of data + * chunks for the array */ + } counts; + + struct mddev *mddev; /* the md device that the bitmap is for */ + + __u64 events_cleared; + int need_sync; + + struct bitmap_storage { + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap + * file superblock */ + struct page **filemap; /* list of cache pages for + * the file */ + unsigned long *filemap_attr; /* attributes associated + * w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file*/ + unsigned long bytes; /* total bytes in the bitmap */ + } storage; + + unsigned long flags; + + int allclean; + + atomic_t behind_writes; + unsigned long behind_writes_used; /* highest actual value at runtime */ + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + unsigned long daemon_lastrun; /* jiffies of last run */ + unsigned long last_end_sync; /* when we lasted called end_sync to + * update bitmap with resync progress */ + + atomic_t pending_writes; /* pending writes to the bitmap file */ + wait_queue_head_t write_wait; + wait_queue_head_t overflow_wait; + wait_queue_head_t behind_wait; + + struct kernfs_node *sysfs_can_clear; + int cluster_slot; /* Slot offset for clustered env */ +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +struct bitmap *bitmap_create(struct mddev *mddev, int slot); +int bitmap_load(struct mddev *mddev); +void bitmap_flush(struct mddev *mddev); +void bitmap_destroy(struct mddev *mddev); + +void bitmap_print_sb(struct bitmap *bitmap); +void bitmap_update_sb(struct bitmap *bitmap); +void bitmap_status(struct seq_file *seq, struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); +void bitmap_write_all(struct bitmap *bitmap); + +void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e); + +/* these are exported */ +int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int behind); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, + unsigned long sectors, int success, int behind); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); +void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force); +void bitmap_sync_with_cluster(struct mddev *mddev, + sector_t old_lo, sector_t old_hi, + sector_t new_lo, sector_t new_hi); + +void bitmap_unplug(struct bitmap *bitmap); +void bitmap_daemon_work(struct mddev *mddev); + +int bitmap_resize(struct bitmap *bitmap, sector_t blocks, + int chunksize, int init); +struct bitmap *get_bitmap_from_slot(struct mddev *mddev, int slot); +int bitmap_copy_from_slot(struct mddev *mddev, int slot, + sector_t *lo, sector_t *hi, bool clear_bits); +void bitmap_free(struct bitmap *bitmap); +void bitmap_wait_behind_writes(struct mddev *mddev); +#endif + +#endif diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index bf41492a2cb0..bc81ecc24c96 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -15,7 +15,7 @@ #include #include #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #define LVB_SIZE 64 diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c new file mode 100644 index 000000000000..38264b38420f --- /dev/null +++ b/drivers/md/md-faulty.c @@ -0,0 +1,372 @@ +/* + * faulty.c : Multiple Devices driver for Linux + * + * Copyright (C) 2004 Neil Brown + * + * fautly-device-simulator personality for md + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +/* + * The "faulty" personality causes some requests to fail. + * + * Possible failure modes are: + * reads fail "randomly" but succeed on retry + * writes fail "randomly" but succeed on retry + * reads for some address fail and then persist until a write + * reads for some address fail and then persist irrespective of write + * writes for some address fail and persist + * all writes fail + * + * Different modes can be active at a time, but only + * one can be set at array creation. Others can be added later. + * A mode can be one-shot or recurrent with the recurrence being + * once in every N requests. + * The bottom 5 bits of the "layout" indicate the mode. The + * remainder indicate a period, or 0 for one-shot. + * + * There is an implementation limit on the number of concurrently + * persisting-faulty blocks. When a new fault is requested that would + * exceed the limit, it is ignored. + * All current faults can be clear using a layout of "0". + * + * Requests are always sent to the device. If they are to fail, + * we clone the bio and insert a new b_end_io into the chain. + */ + +#define WriteTransient 0 +#define ReadTransient 1 +#define WritePersistent 2 +#define ReadPersistent 3 +#define WriteAll 4 /* doesn't go to device */ +#define ReadFixable 5 +#define Modes 6 + +#define ClearErrors 31 +#define ClearFaults 30 + +#define AllPersist 100 /* internal use only */ +#define NoPersist 101 + +#define ModeMask 0x1f +#define ModeShift 5 + +#define MaxFault 50 +#include +#include +#include +#include +#include "md.h" +#include + + +static void faulty_fail(struct bio *bio) +{ + struct bio *b = bio->bi_private; + + b->bi_iter.bi_size = bio->bi_iter.bi_size; + b->bi_iter.bi_sector = bio->bi_iter.bi_sector; + + bio_put(bio); + + bio_io_error(b); +} + +struct faulty_conf { + int period[Modes]; + atomic_t counters[Modes]; + sector_t faults[MaxFault]; + int modes[MaxFault]; + int nfaults; + struct md_rdev *rdev; +}; + +static int check_mode(struct faulty_conf *conf, int mode) +{ + if (conf->period[mode] == 0 && + atomic_read(&conf->counters[mode]) <= 0) + return 0; /* no failure, no decrement */ + + + if (atomic_dec_and_test(&conf->counters[mode])) { + if (conf->period[mode]) + atomic_set(&conf->counters[mode], conf->period[mode]); + return 1; + } + return 0; +} + +static int check_sector(struct faulty_conf *conf, sector_t start, sector_t end, int dir) +{ + /* If we find a ReadFixable sector, we fix it ... */ + int i; + for (i=0; infaults; i++) + if (conf->faults[i] >= start && + conf->faults[i] < end) { + /* found it ... */ + switch (conf->modes[i] * 2 + dir) { + case WritePersistent*2+WRITE: return 1; + case ReadPersistent*2+READ: return 1; + case ReadFixable*2+READ: return 1; + case ReadFixable*2+WRITE: + conf->modes[i] = NoPersist; + return 0; + case AllPersist*2+READ: + case AllPersist*2+WRITE: return 1; + default: + return 0; + } + } + return 0; +} + +static void add_sector(struct faulty_conf *conf, sector_t start, int mode) +{ + int i; + int n = conf->nfaults; + for (i=0; infaults; i++) + if (conf->faults[i] == start) { + switch(mode) { + case NoPersist: conf->modes[i] = mode; return; + case WritePersistent: + if (conf->modes[i] == ReadPersistent || + conf->modes[i] == ReadFixable) + conf->modes[i] = AllPersist; + else + conf->modes[i] = WritePersistent; + return; + case ReadPersistent: + if (conf->modes[i] == WritePersistent) + conf->modes[i] = AllPersist; + else + conf->modes[i] = ReadPersistent; + return; + case ReadFixable: + if (conf->modes[i] == WritePersistent || + conf->modes[i] == ReadPersistent) + conf->modes[i] = AllPersist; + else + conf->modes[i] = ReadFixable; + return; + } + } else if (conf->modes[i] == NoPersist) + n = i; + + if (n >= MaxFault) + return; + conf->faults[n] = start; + conf->modes[n] = mode; + if (conf->nfaults == n) + conf->nfaults = n+1; +} + +static bool faulty_make_request(struct mddev *mddev, struct bio *bio) +{ + struct faulty_conf *conf = mddev->private; + int failit = 0; + + if (bio_data_dir(bio) == WRITE) { + /* write request */ + if (atomic_read(&conf->counters[WriteAll])) { + /* special case - don't decrement, don't generic_make_request, + * just fail immediately + */ + bio_io_error(bio); + return true; + } + + if (check_sector(conf, bio->bi_iter.bi_sector, + bio_end_sector(bio), WRITE)) + failit = 1; + if (check_mode(conf, WritePersistent)) { + add_sector(conf, bio->bi_iter.bi_sector, + WritePersistent); + failit = 1; + } + if (check_mode(conf, WriteTransient)) + failit = 1; + } else { + /* read request */ + if (check_sector(conf, bio->bi_iter.bi_sector, + bio_end_sector(bio), READ)) + failit = 1; + if (check_mode(conf, ReadTransient)) + failit = 1; + if (check_mode(conf, ReadPersistent)) { + add_sector(conf, bio->bi_iter.bi_sector, + ReadPersistent); + failit = 1; + } + if (check_mode(conf, ReadFixable)) { + add_sector(conf, bio->bi_iter.bi_sector, + ReadFixable); + failit = 1; + } + } + if (failit) { + struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + + bio_set_dev(b, conf->rdev->bdev); + b->bi_private = bio; + b->bi_end_io = faulty_fail; + bio = b; + } else + bio_set_dev(bio, conf->rdev->bdev); + + generic_make_request(bio); + return true; +} + +static void faulty_status(struct seq_file *seq, struct mddev *mddev) +{ + struct faulty_conf *conf = mddev->private; + int n; + + if ((n=atomic_read(&conf->counters[WriteTransient])) != 0) + seq_printf(seq, " WriteTransient=%d(%d)", + n, conf->period[WriteTransient]); + + if ((n=atomic_read(&conf->counters[ReadTransient])) != 0) + seq_printf(seq, " ReadTransient=%d(%d)", + n, conf->period[ReadTransient]); + + if ((n=atomic_read(&conf->counters[WritePersistent])) != 0) + seq_printf(seq, " WritePersistent=%d(%d)", + n, conf->period[WritePersistent]); + + if ((n=atomic_read(&conf->counters[ReadPersistent])) != 0) + seq_printf(seq, " ReadPersistent=%d(%d)", + n, conf->period[ReadPersistent]); + + + if ((n=atomic_read(&conf->counters[ReadFixable])) != 0) + seq_printf(seq, " ReadFixable=%d(%d)", + n, conf->period[ReadFixable]); + + if ((n=atomic_read(&conf->counters[WriteAll])) != 0) + seq_printf(seq, " WriteAll"); + + seq_printf(seq, " nfaults=%d", conf->nfaults); +} + + +static int faulty_reshape(struct mddev *mddev) +{ + int mode = mddev->new_layout & ModeMask; + int count = mddev->new_layout >> ModeShift; + struct faulty_conf *conf = mddev->private; + + if (mddev->new_layout < 0) + return 0; + + /* new layout */ + if (mode == ClearFaults) + conf->nfaults = 0; + else if (mode == ClearErrors) { + int i; + for (i=0 ; i < Modes ; i++) { + conf->period[i] = 0; + atomic_set(&conf->counters[i], 0); + } + } else if (mode < Modes) { + conf->period[mode] = count; + if (!count) count++; + atomic_set(&conf->counters[mode], count); + } else + return -EINVAL; + mddev->new_layout = -1; + mddev->layout = -1; /* makes sure further changes come through */ + return 0; +} + +static sector_t faulty_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(raid_disks, + "%s does not support generic reshape\n", __func__); + + if (sectors == 0) + return mddev->dev_sectors; + + return sectors; +} + +static int faulty_run(struct mddev *mddev) +{ + struct md_rdev *rdev; + int i; + struct faulty_conf *conf; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + + conf = kmalloc(sizeof(*conf), GFP_KERNEL); + if (!conf) + return -ENOMEM; + + for (i=0; icounters[i], 0); + conf->period[i] = 0; + } + conf->nfaults = 0; + + rdev_for_each(rdev, mddev) { + conf->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + } + + md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); + mddev->private = conf; + + faulty_reshape(mddev); + + return 0; +} + +static void faulty_free(struct mddev *mddev, void *priv) +{ + struct faulty_conf *conf = priv; + + kfree(conf); +} + +static struct md_personality faulty_personality = +{ + .name = "faulty", + .level = LEVEL_FAULTY, + .owner = THIS_MODULE, + .make_request = faulty_make_request, + .run = faulty_run, + .free = faulty_free, + .status = faulty_status, + .check_reshape = faulty_reshape, + .size = faulty_size, +}; + +static int __init raid_init(void) +{ + return register_md_personality(&faulty_personality); +} + +static void raid_exit(void) +{ + unregister_md_personality(&faulty_personality); +} + +module_init(raid_init); +module_exit(raid_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Fault injection personality for MD"); +MODULE_ALIAS("md-personality-10"); /* faulty */ +MODULE_ALIAS("md-faulty"); +MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c new file mode 100644 index 000000000000..773fc70dced7 --- /dev/null +++ b/drivers/md/md-linear.c @@ -0,0 +1,348 @@ +/* + linear.c : Multiple Devices driver for Linux + Copyright (C) 1994-96 Marc ZYNGIER + or + + + Linear mode management functions. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#include +#include +#include +#include +#include +#include +#include "md.h" +#include "md-linear.h" + +/* + * find which device holds a particular offset + */ +static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) +{ + int lo, mid, hi; + struct linear_conf *conf; + + lo = 0; + hi = mddev->raid_disks - 1; + conf = mddev->private; + + /* + * Binary Search + */ + + while (hi > lo) { + + mid = (hi + lo) / 2; + if (sector < conf->disks[mid].end_sector) + hi = mid; + else + lo = mid + 1; + } + + return conf->disks + lo; +} + +/* + * In linear_congested() conf->raid_disks is used as a copy of + * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks + * and conf->disks[] are created in linear_conf(), they are always + * consitent with each other, but mddev->raid_disks does not. + */ +static int linear_congested(struct mddev *mddev, int bits) +{ + struct linear_conf *conf; + int i, ret = 0; + + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + + for (i = 0; i < conf->raid_disks && !ret ; i++) { + struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); + ret |= bdi_congested(q->backing_dev_info, bits); + } + + rcu_read_unlock(); + return ret; +} + +static sector_t linear_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + struct linear_conf *conf; + sector_t array_sectors; + + conf = mddev->private; + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + array_sectors = conf->array_sectors; + + return array_sectors; +} + +static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) +{ + struct linear_conf *conf; + struct md_rdev *rdev; + int i, cnt; + bool discard_supported = false; + + conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info), + GFP_KERNEL); + if (!conf) + return NULL; + + cnt = 0; + conf->array_sectors = 0; + + rdev_for_each(rdev, mddev) { + int j = rdev->raid_disk; + struct dev_info *disk = conf->disks + j; + sector_t sectors; + + if (j < 0 || j >= raid_disks || disk->rdev) { + pr_warn("md/linear:%s: disk numbering problem. Aborting!\n", + mdname(mddev)); + goto out; + } + + disk->rdev = rdev; + if (mddev->chunk_sectors) { + sectors = rdev->sectors; + sector_div(sectors, mddev->chunk_sectors); + rdev->sectors = sectors * mddev->chunk_sectors; + } + + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + conf->array_sectors += rdev->sectors; + cnt++; + + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; + } + if (cnt != raid_disks) { + pr_warn("md/linear:%s: not enough drives present. Aborting!\n", + mdname(mddev)); + goto out; + } + + if (!discard_supported) + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + + /* + * Here we calculate the device offsets. + */ + conf->disks[0].end_sector = conf->disks[0].rdev->sectors; + + for (i = 1; i < raid_disks; i++) + conf->disks[i].end_sector = + conf->disks[i-1].end_sector + + conf->disks[i].rdev->sectors; + + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + + return conf; + +out: + kfree(conf); + return NULL; +} + +static int linear_run (struct mddev *mddev) +{ + struct linear_conf *conf; + int ret; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + conf = linear_conf(mddev, mddev->raid_disks); + + if (!conf) + return 1; + mddev->private = conf; + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); + + ret = md_integrity_register(mddev); + if (ret) { + kfree(conf); + mddev->private = NULL; + } + return ret; +} + +static int linear_add(struct mddev *mddev, struct md_rdev *rdev) +{ + /* Adding a drive to a linear array allows the array to grow. + * It is permitted if the new drive has a matching superblock + * already on it, with raid_disk equal to raid_disks. + * It is achieved by creating a new linear_private_data structure + * and swapping it in in-place of the current one. + * The current one is never freed until the array is stopped. + * This avoids races. + */ + struct linear_conf *newconf, *oldconf; + + if (rdev->saved_raid_disk != mddev->raid_disks) + return -EINVAL; + + rdev->raid_disk = rdev->saved_raid_disk; + rdev->saved_raid_disk = -1; + + newconf = linear_conf(mddev,mddev->raid_disks+1); + + if (!newconf) + return -ENOMEM; + + /* newconf->raid_disks already keeps a copy of * the increased + * value of mddev->raid_disks, WARN_ONCE() is just used to make + * sure of this. It is possible that oldconf is still referenced + * in linear_congested(), therefore kfree_rcu() is used to free + * oldconf until no one uses it anymore. + */ + mddev_suspend(mddev); + oldconf = rcu_dereference_protected(mddev->private, + lockdep_is_held(&mddev->reconfig_mutex)); + mddev->raid_disks++; + WARN_ONCE(mddev->raid_disks != newconf->raid_disks, + "copied raid_disks doesn't match mddev->raid_disks"); + rcu_assign_pointer(mddev->private, newconf); + md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); + set_capacity(mddev->gendisk, mddev->array_sectors); + mddev_resume(mddev); + revalidate_disk(mddev->gendisk); + kfree_rcu(oldconf, rcu); + return 0; +} + +static void linear_free(struct mddev *mddev, void *priv) +{ + struct linear_conf *conf = priv; + + kfree(conf); +} + +static bool linear_make_request(struct mddev *mddev, struct bio *bio) +{ + char b[BDEVNAME_SIZE]; + struct dev_info *tmp_dev; + sector_t start_sector, end_sector, data_offset; + sector_t bio_sector = bio->bi_iter.bi_sector; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { + md_flush_request(mddev, bio); + return true; + } + + tmp_dev = which_dev(mddev, bio_sector); + start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors; + end_sector = tmp_dev->end_sector; + data_offset = tmp_dev->rdev->data_offset; + + if (unlikely(bio_sector >= end_sector || + bio_sector < start_sector)) + goto out_of_bounds; + + if (unlikely(bio_end_sector(bio) > end_sector)) { + /* This bio crosses a device boundary, so we have to split it */ + struct bio *split = bio_split(bio, end_sector - bio_sector, + GFP_NOIO, mddev->bio_set); + bio_chain(split, bio); + generic_make_request(bio); + bio = split; + } + + bio_set_dev(bio, tmp_dev->rdev->bdev); + bio->bi_iter.bi_sector = bio->bi_iter.bi_sector - + start_sector + data_offset; + + if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && + !blk_queue_discard(bio->bi_disk->queue))) { + /* Just ignore it */ + bio_endio(bio); + } else { + if (mddev->gendisk) + trace_block_bio_remap(bio->bi_disk->queue, + bio, disk_devt(mddev->gendisk), + bio_sector); + mddev_check_writesame(mddev, bio); + mddev_check_write_zeroes(mddev, bio); + generic_make_request(bio); + } + return true; + +out_of_bounds: + pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n", + mdname(mddev), + (unsigned long long)bio->bi_iter.bi_sector, + bdevname(tmp_dev->rdev->bdev, b), + (unsigned long long)tmp_dev->rdev->sectors, + (unsigned long long)start_sector); + bio_io_error(bio); + return true; +} + +static void linear_status (struct seq_file *seq, struct mddev *mddev) +{ + seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2); +} + +static void linear_quiesce(struct mddev *mddev, int state) +{ +} + +static struct md_personality linear_personality = +{ + .name = "linear", + .level = LEVEL_LINEAR, + .owner = THIS_MODULE, + .make_request = linear_make_request, + .run = linear_run, + .free = linear_free, + .status = linear_status, + .hot_add_disk = linear_add, + .size = linear_size, + .quiesce = linear_quiesce, + .congested = linear_congested, +}; + +static int __init linear_init (void) +{ + return register_md_personality (&linear_personality); +} + +static void linear_exit (void) +{ + unregister_md_personality (&linear_personality); +} + +module_init(linear_init); +module_exit(linear_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Linear device concatenation personality for MD"); +MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ +MODULE_ALIAS("md-linear"); +MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h new file mode 100644 index 000000000000..8d392e6098b3 --- /dev/null +++ b/drivers/md/md-linear.h @@ -0,0 +1,16 @@ +#ifndef _LINEAR_H +#define _LINEAR_H + +struct dev_info { + struct md_rdev *rdev; + sector_t end_sector; +}; + +struct linear_conf +{ + struct rcu_head rcu; + sector_t array_sectors; + int raid_disks; /* a copy of mddev->raid_disks */ + struct dev_info disks[0]; +}; +#endif diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c new file mode 100644 index 000000000000..5c70176fa24d --- /dev/null +++ b/drivers/md/md-multipath.c @@ -0,0 +1,509 @@ +/* + * multipath.c : Multiple Devices driver for Linux + * + * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat + * + * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman + * + * MULTIPATH management functions. + * + * derived from raid1.c. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * You should have received a copy of the GNU General Public License + * (for example /usr/src/linux/COPYING); if not, write to the Free + * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include "md.h" +#include "md-multipath.h" + +#define MAX_WORK_PER_DISK 128 + +#define NR_RESERVED_BUFS 32 + +static int multipath_map (struct mpconf *conf) +{ + int i, disks = conf->raid_disks; + + /* + * Later we do read balancing on the read side + * now we use the first available disk. + */ + + rcu_read_lock(); + for (i = 0; i < disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); + if (rdev && test_bit(In_sync, &rdev->flags) && + !test_bit(Faulty, &rdev->flags)) { + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + return i; + } + } + rcu_read_unlock(); + + pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n"); + return (-1); +} + +static void multipath_reschedule_retry (struct multipath_bh *mp_bh) +{ + unsigned long flags; + struct mddev *mddev = mp_bh->mddev; + struct mpconf *conf = mddev->private; + + spin_lock_irqsave(&conf->device_lock, flags); + list_add(&mp_bh->retry_list, &conf->retry_list); + spin_unlock_irqrestore(&conf->device_lock, flags); + md_wakeup_thread(mddev->thread); +} + +/* + * multipath_end_bh_io() is called when we have finished servicing a multipathed + * operation and are ready to return a success/failure code to the buffer + * cache layer. + */ +static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status) +{ + struct bio *bio = mp_bh->master_bio; + struct mpconf *conf = mp_bh->mddev->private; + + bio->bi_status = status; + bio_endio(bio); + mempool_free(mp_bh, conf->pool); +} + +static void multipath_end_request(struct bio *bio) +{ + struct multipath_bh *mp_bh = bio->bi_private; + struct mpconf *conf = mp_bh->mddev->private; + struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; + + if (!bio->bi_status) + multipath_end_bh_io(mp_bh, 0); + else if (!(bio->bi_opf & REQ_RAHEAD)) { + /* + * oops, IO error: + */ + char b[BDEVNAME_SIZE]; + md_error (mp_bh->mddev, rdev); + pr_info("multipath: %s: rescheduling sector %llu\n", + bdevname(rdev->bdev,b), + (unsigned long long)bio->bi_iter.bi_sector); + multipath_reschedule_retry(mp_bh); + } else + multipath_end_bh_io(mp_bh, bio->bi_status); + rdev_dec_pending(rdev, conf->mddev); +} + +static bool multipath_make_request(struct mddev *mddev, struct bio * bio) +{ + struct mpconf *conf = mddev->private; + struct multipath_bh * mp_bh; + struct multipath_info *multipath; + + if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { + md_flush_request(mddev, bio); + return true; + } + + mp_bh = mempool_alloc(conf->pool, GFP_NOIO); + + mp_bh->master_bio = bio; + mp_bh->mddev = mddev; + + mp_bh->path = multipath_map(conf); + if (mp_bh->path < 0) { + bio_io_error(bio); + mempool_free(mp_bh, conf->pool); + return true; + } + multipath = conf->multipaths + mp_bh->path; + + bio_init(&mp_bh->bio, NULL, 0); + __bio_clone_fast(&mp_bh->bio, bio); + + mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; + bio_set_dev(&mp_bh->bio, multipath->rdev->bdev); + mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; + mp_bh->bio.bi_end_io = multipath_end_request; + mp_bh->bio.bi_private = mp_bh; + mddev_check_writesame(mddev, &mp_bh->bio); + mddev_check_write_zeroes(mddev, &mp_bh->bio); + generic_make_request(&mp_bh->bio); + return true; +} + +static void multipath_status(struct seq_file *seq, struct mddev *mddev) +{ + struct mpconf *conf = mddev->private; + int i; + + seq_printf (seq, " [%d/%d] [", conf->raid_disks, + conf->raid_disks - mddev->degraded); + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); + seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); + } + rcu_read_unlock(); + seq_printf (seq, "]"); +} + +static int multipath_congested(struct mddev *mddev, int bits) +{ + struct mpconf *conf = mddev->private; + int i, ret = 0; + + rcu_read_lock(); + for (i = 0; i < mddev->raid_disks ; i++) { + struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); + if (rdev && !test_bit(Faulty, &rdev->flags)) { + struct request_queue *q = bdev_get_queue(rdev->bdev); + + ret |= bdi_congested(q->backing_dev_info, bits); + /* Just like multipath_map, we just check the + * first available device + */ + break; + } + } + rcu_read_unlock(); + return ret; +} + +/* + * Careful, this can execute in IRQ contexts as well! + */ +static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) +{ + struct mpconf *conf = mddev->private; + char b[BDEVNAME_SIZE]; + + if (conf->raid_disks - mddev->degraded <= 1) { + /* + * Uh oh, we can do nothing if this is our last path, but + * first check if this is a queued request for a device + * which has just failed. + */ + pr_warn("multipath: only one IO path left and IO error.\n"); + /* leave it active... it's all we have */ + return; + } + /* + * Mark disk as unusable + */ + if (test_and_clear_bit(In_sync, &rdev->flags)) { + unsigned long flags; + spin_lock_irqsave(&conf->device_lock, flags); + mddev->degraded++; + spin_unlock_irqrestore(&conf->device_lock, flags); + } + set_bit(Faulty, &rdev->flags); + set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); + pr_err("multipath: IO failure on %s, disabling IO path.\n" + "multipath: Operation continuing on %d IO paths.\n", + bdevname(rdev->bdev, b), + conf->raid_disks - mddev->degraded); +} + +static void print_multipath_conf (struct mpconf *conf) +{ + int i; + struct multipath_info *tmp; + + pr_debug("MULTIPATH conf printout:\n"); + if (!conf) { + pr_debug("(conf==NULL)\n"); + return; + } + pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, + conf->raid_disks); + + for (i = 0; i < conf->raid_disks; i++) { + char b[BDEVNAME_SIZE]; + tmp = conf->multipaths + i; + if (tmp->rdev) + pr_debug(" disk%d, o:%d, dev:%s\n", + i,!test_bit(Faulty, &tmp->rdev->flags), + bdevname(tmp->rdev->bdev,b)); + } +} + +static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mpconf *conf = mddev->private; + struct request_queue *q; + int err = -EEXIST; + int path; + struct multipath_info *p; + int first = 0; + int last = mddev->raid_disks - 1; + + if (rdev->raid_disk >= 0) + first = last = rdev->raid_disk; + + print_multipath_conf(conf); + + for (path = first; path <= last; path++) + if ((p=conf->multipaths+path)->rdev == NULL) { + q = rdev->bdev->bd_disk->queue; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + err = md_integrity_add_rdev(rdev, mddev); + if (err) + break; + spin_lock_irq(&conf->device_lock); + mddev->degraded--; + rdev->raid_disk = path; + set_bit(In_sync, &rdev->flags); + spin_unlock_irq(&conf->device_lock); + rcu_assign_pointer(p->rdev, rdev); + err = 0; + break; + } + + print_multipath_conf(conf); + + return err; +} + +static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) +{ + struct mpconf *conf = mddev->private; + int err = 0; + int number = rdev->raid_disk; + struct multipath_info *p = conf->multipaths + number; + + print_multipath_conf(conf); + + if (rdev == p->rdev) { + if (test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) { + pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number); + err = -EBUSY; + goto abort; + } + p->rdev = NULL; + if (!test_bit(RemoveSynchronized, &rdev->flags)) { + synchronize_rcu(); + if (atomic_read(&rdev->nr_pending)) { + /* lost the race, try later */ + err = -EBUSY; + p->rdev = rdev; + goto abort; + } + } + err = md_integrity_register(mddev); + } +abort: + + print_multipath_conf(conf); + return err; +} + +/* + * This is a kernel thread which: + * + * 1. Retries failed read operations on working multipaths. + * 2. Updates the raid superblock when problems encounter. + * 3. Performs writes following reads for array syncronising. + */ + +static void multipathd(struct md_thread *thread) +{ + struct mddev *mddev = thread->mddev; + struct multipath_bh *mp_bh; + struct bio *bio; + unsigned long flags; + struct mpconf *conf = mddev->private; + struct list_head *head = &conf->retry_list; + + md_check_recovery(mddev); + for (;;) { + char b[BDEVNAME_SIZE]; + spin_lock_irqsave(&conf->device_lock, flags); + if (list_empty(head)) + break; + mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); + list_del(head->prev); + spin_unlock_irqrestore(&conf->device_lock, flags); + + bio = &mp_bh->bio; + bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; + + if ((mp_bh->path = multipath_map (conf))<0) { + pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", + bio_devname(bio, b), + (unsigned long long)bio->bi_iter.bi_sector); + multipath_end_bh_io(mp_bh, BLK_STS_IOERR); + } else { + pr_err("multipath: %s: redirecting sector %llu to another IO path\n", + bio_devname(bio, b), + (unsigned long long)bio->bi_iter.bi_sector); + *bio = *(mp_bh->master_bio); + bio->bi_iter.bi_sector += + conf->multipaths[mp_bh->path].rdev->data_offset; + bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev); + bio->bi_opf |= REQ_FAILFAST_TRANSPORT; + bio->bi_end_io = multipath_end_request; + bio->bi_private = mp_bh; + generic_make_request(bio); + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); +} + +static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks) +{ + WARN_ONCE(sectors || raid_disks, + "%s does not support generic reshape\n", __func__); + + return mddev->dev_sectors; +} + +static int multipath_run (struct mddev *mddev) +{ + struct mpconf *conf; + int disk_idx; + struct multipath_info *disk; + struct md_rdev *rdev; + int working_disks; + + if (md_check_no_bitmap(mddev)) + return -EINVAL; + + if (mddev->level != LEVEL_MULTIPATH) { + pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n", + mdname(mddev), mddev->level); + goto out; + } + /* + * copy the already verified devices into our private MULTIPATH + * bookkeeping area. [whatever we allocate in multipath_run(), + * should be freed in multipath_free()] + */ + + conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); + mddev->private = conf; + if (!conf) + goto out; + + conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, + GFP_KERNEL); + if (!conf->multipaths) + goto out_free_conf; + + working_disks = 0; + rdev_for_each(rdev, mddev) { + disk_idx = rdev->raid_disk; + if (disk_idx < 0 || + disk_idx >= mddev->raid_disks) + continue; + + disk = conf->multipaths + disk_idx; + disk->rdev = rdev; + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); + + if (!test_bit(Faulty, &rdev->flags)) + working_disks++; + } + + conf->raid_disks = mddev->raid_disks; + conf->mddev = mddev; + spin_lock_init(&conf->device_lock); + INIT_LIST_HEAD(&conf->retry_list); + + if (!working_disks) { + pr_warn("multipath: no operational IO paths for %s\n", + mdname(mddev)); + goto out_free_conf; + } + mddev->degraded = conf->raid_disks - working_disks; + + conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, + sizeof(struct multipath_bh)); + if (conf->pool == NULL) + goto out_free_conf; + + mddev->thread = md_register_thread(multipathd, mddev, + "multipath"); + if (!mddev->thread) + goto out_free_conf; + + pr_info("multipath: array %s active with %d out of %d IO paths\n", + mdname(mddev), conf->raid_disks - mddev->degraded, + mddev->raid_disks); + /* + * Ok, everything is just fine now + */ + md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); + + if (md_integrity_register(mddev)) + goto out_free_conf; + + return 0; + +out_free_conf: + mempool_destroy(conf->pool); + kfree(conf->multipaths); + kfree(conf); + mddev->private = NULL; +out: + return -EIO; +} + +static void multipath_free(struct mddev *mddev, void *priv) +{ + struct mpconf *conf = priv; + + mempool_destroy(conf->pool); + kfree(conf->multipaths); + kfree(conf); +} + +static struct md_personality multipath_personality = +{ + .name = "multipath", + .level = LEVEL_MULTIPATH, + .owner = THIS_MODULE, + .make_request = multipath_make_request, + .run = multipath_run, + .free = multipath_free, + .status = multipath_status, + .error_handler = multipath_error, + .hot_add_disk = multipath_add_disk, + .hot_remove_disk= multipath_remove_disk, + .size = multipath_size, + .congested = multipath_congested, +}; + +static int __init multipath_init (void) +{ + return register_md_personality (&multipath_personality); +} + +static void __exit multipath_exit (void) +{ + unregister_md_personality (&multipath_personality); +} + +module_init(multipath_init); +module_exit(multipath_exit); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("simple multi-path personality for MD"); +MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ +MODULE_ALIAS("md-multipath"); +MODULE_ALIAS("md-level--4"); diff --git a/drivers/md/md-multipath.h b/drivers/md/md-multipath.h new file mode 100644 index 000000000000..717c60f62898 --- /dev/null +++ b/drivers/md/md-multipath.h @@ -0,0 +1,31 @@ +#ifndef _MULTIPATH_H +#define _MULTIPATH_H + +struct multipath_info { + struct md_rdev *rdev; +}; + +struct mpconf { + struct mddev *mddev; + struct multipath_info *multipaths; + int raid_disks; + spinlock_t device_lock; + struct list_head retry_list; + + mempool_t *pool; +}; + +/* + * this is our 'private' 'collective' MULTIPATH buffer head. + * it contains information about what kind of IO operations were started + * for this MULTIPATH operation, and about their status: + */ + +struct multipath_bh { + struct mddev *mddev; + struct bio *master_bio; + struct bio bio; + int path; + struct list_head retry_list; +}; +#endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 707471e3cb01..97afb28c6f51 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -69,7 +69,7 @@ #include #include "md.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "md-cluster.h" #ifndef MODULE diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c deleted file mode 100644 index b68e0666b9b0..000000000000 --- a/drivers/md/multipath.c +++ /dev/null @@ -1,509 +0,0 @@ -/* - * multipath.c : Multiple Devices driver for Linux - * - * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat - * - * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman - * - * MULTIPATH management functions. - * - * derived from raid1.c. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2, or (at your option) - * any later version. - * - * You should have received a copy of the GNU General Public License - * (for example /usr/src/linux/COPYING); if not, write to the Free - * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include "md.h" -#include "multipath.h" - -#define MAX_WORK_PER_DISK 128 - -#define NR_RESERVED_BUFS 32 - -static int multipath_map (struct mpconf *conf) -{ - int i, disks = conf->raid_disks; - - /* - * Later we do read balancing on the read side - * now we use the first available disk. - */ - - rcu_read_lock(); - for (i = 0; i < disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && test_bit(In_sync, &rdev->flags) && - !test_bit(Faulty, &rdev->flags)) { - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - return i; - } - } - rcu_read_unlock(); - - pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n"); - return (-1); -} - -static void multipath_reschedule_retry (struct multipath_bh *mp_bh) -{ - unsigned long flags; - struct mddev *mddev = mp_bh->mddev; - struct mpconf *conf = mddev->private; - - spin_lock_irqsave(&conf->device_lock, flags); - list_add(&mp_bh->retry_list, &conf->retry_list); - spin_unlock_irqrestore(&conf->device_lock, flags); - md_wakeup_thread(mddev->thread); -} - -/* - * multipath_end_bh_io() is called when we have finished servicing a multipathed - * operation and are ready to return a success/failure code to the buffer - * cache layer. - */ -static void multipath_end_bh_io(struct multipath_bh *mp_bh, blk_status_t status) -{ - struct bio *bio = mp_bh->master_bio; - struct mpconf *conf = mp_bh->mddev->private; - - bio->bi_status = status; - bio_endio(bio); - mempool_free(mp_bh, conf->pool); -} - -static void multipath_end_request(struct bio *bio) -{ - struct multipath_bh *mp_bh = bio->bi_private; - struct mpconf *conf = mp_bh->mddev->private; - struct md_rdev *rdev = conf->multipaths[mp_bh->path].rdev; - - if (!bio->bi_status) - multipath_end_bh_io(mp_bh, 0); - else if (!(bio->bi_opf & REQ_RAHEAD)) { - /* - * oops, IO error: - */ - char b[BDEVNAME_SIZE]; - md_error (mp_bh->mddev, rdev); - pr_info("multipath: %s: rescheduling sector %llu\n", - bdevname(rdev->bdev,b), - (unsigned long long)bio->bi_iter.bi_sector); - multipath_reschedule_retry(mp_bh); - } else - multipath_end_bh_io(mp_bh, bio->bi_status); - rdev_dec_pending(rdev, conf->mddev); -} - -static bool multipath_make_request(struct mddev *mddev, struct bio * bio) -{ - struct mpconf *conf = mddev->private; - struct multipath_bh * mp_bh; - struct multipath_info *multipath; - - if (unlikely(bio->bi_opf & REQ_PREFLUSH)) { - md_flush_request(mddev, bio); - return true; - } - - mp_bh = mempool_alloc(conf->pool, GFP_NOIO); - - mp_bh->master_bio = bio; - mp_bh->mddev = mddev; - - mp_bh->path = multipath_map(conf); - if (mp_bh->path < 0) { - bio_io_error(bio); - mempool_free(mp_bh, conf->pool); - return true; - } - multipath = conf->multipaths + mp_bh->path; - - bio_init(&mp_bh->bio, NULL, 0); - __bio_clone_fast(&mp_bh->bio, bio); - - mp_bh->bio.bi_iter.bi_sector += multipath->rdev->data_offset; - bio_set_dev(&mp_bh->bio, multipath->rdev->bdev); - mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; - mp_bh->bio.bi_end_io = multipath_end_request; - mp_bh->bio.bi_private = mp_bh; - mddev_check_writesame(mddev, &mp_bh->bio); - mddev_check_write_zeroes(mddev, &mp_bh->bio); - generic_make_request(&mp_bh->bio); - return true; -} - -static void multipath_status(struct seq_file *seq, struct mddev *mddev) -{ - struct mpconf *conf = mddev->private; - int i; - - seq_printf (seq, " [%d/%d] [", conf->raid_disks, - conf->raid_disks - mddev->degraded); - rcu_read_lock(); - for (i = 0; i < conf->raid_disks; i++) { - struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); - } - rcu_read_unlock(); - seq_printf (seq, "]"); -} - -static int multipath_congested(struct mddev *mddev, int bits) -{ - struct mpconf *conf = mddev->private; - int i, ret = 0; - - rcu_read_lock(); - for (i = 0; i < mddev->raid_disks ; i++) { - struct md_rdev *rdev = rcu_dereference(conf->multipaths[i].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = bdev_get_queue(rdev->bdev); - - ret |= bdi_congested(q->backing_dev_info, bits); - /* Just like multipath_map, we just check the - * first available device - */ - break; - } - } - rcu_read_unlock(); - return ret; -} - -/* - * Careful, this can execute in IRQ contexts as well! - */ -static void multipath_error (struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - char b[BDEVNAME_SIZE]; - - if (conf->raid_disks - mddev->degraded <= 1) { - /* - * Uh oh, we can do nothing if this is our last path, but - * first check if this is a queued request for a device - * which has just failed. - */ - pr_warn("multipath: only one IO path left and IO error.\n"); - /* leave it active... it's all we have */ - return; - } - /* - * Mark disk as unusable - */ - if (test_and_clear_bit(In_sync, &rdev->flags)) { - unsigned long flags; - spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded++; - spin_unlock_irqrestore(&conf->device_lock, flags); - } - set_bit(Faulty, &rdev->flags); - set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - pr_err("multipath: IO failure on %s, disabling IO path.\n" - "multipath: Operation continuing on %d IO paths.\n", - bdevname(rdev->bdev, b), - conf->raid_disks - mddev->degraded); -} - -static void print_multipath_conf (struct mpconf *conf) -{ - int i; - struct multipath_info *tmp; - - pr_debug("MULTIPATH conf printout:\n"); - if (!conf) { - pr_debug("(conf==NULL)\n"); - return; - } - pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded, - conf->raid_disks); - - for (i = 0; i < conf->raid_disks; i++) { - char b[BDEVNAME_SIZE]; - tmp = conf->multipaths + i; - if (tmp->rdev) - pr_debug(" disk%d, o:%d, dev:%s\n", - i,!test_bit(Faulty, &tmp->rdev->flags), - bdevname(tmp->rdev->bdev,b)); - } -} - -static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - struct request_queue *q; - int err = -EEXIST; - int path; - struct multipath_info *p; - int first = 0; - int last = mddev->raid_disks - 1; - - if (rdev->raid_disk >= 0) - first = last = rdev->raid_disk; - - print_multipath_conf(conf); - - for (path = first; path <= last; path++) - if ((p=conf->multipaths+path)->rdev == NULL) { - q = rdev->bdev->bd_disk->queue; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - err = md_integrity_add_rdev(rdev, mddev); - if (err) - break; - spin_lock_irq(&conf->device_lock); - mddev->degraded--; - rdev->raid_disk = path; - set_bit(In_sync, &rdev->flags); - spin_unlock_irq(&conf->device_lock); - rcu_assign_pointer(p->rdev, rdev); - err = 0; - break; - } - - print_multipath_conf(conf); - - return err; -} - -static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev) -{ - struct mpconf *conf = mddev->private; - int err = 0; - int number = rdev->raid_disk; - struct multipath_info *p = conf->multipaths + number; - - print_multipath_conf(conf); - - if (rdev == p->rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { - pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number); - err = -EBUSY; - goto abort; - } - p->rdev = NULL; - if (!test_bit(RemoveSynchronized, &rdev->flags)) { - synchronize_rcu(); - if (atomic_read(&rdev->nr_pending)) { - /* lost the race, try later */ - err = -EBUSY; - p->rdev = rdev; - goto abort; - } - } - err = md_integrity_register(mddev); - } -abort: - - print_multipath_conf(conf); - return err; -} - -/* - * This is a kernel thread which: - * - * 1. Retries failed read operations on working multipaths. - * 2. Updates the raid superblock when problems encounter. - * 3. Performs writes following reads for array syncronising. - */ - -static void multipathd(struct md_thread *thread) -{ - struct mddev *mddev = thread->mddev; - struct multipath_bh *mp_bh; - struct bio *bio; - unsigned long flags; - struct mpconf *conf = mddev->private; - struct list_head *head = &conf->retry_list; - - md_check_recovery(mddev); - for (;;) { - char b[BDEVNAME_SIZE]; - spin_lock_irqsave(&conf->device_lock, flags); - if (list_empty(head)) - break; - mp_bh = list_entry(head->prev, struct multipath_bh, retry_list); - list_del(head->prev); - spin_unlock_irqrestore(&conf->device_lock, flags); - - bio = &mp_bh->bio; - bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector; - - if ((mp_bh->path = multipath_map (conf))<0) { - pr_err("multipath: %s: unrecoverable IO read error for block %llu\n", - bio_devname(bio, b), - (unsigned long long)bio->bi_iter.bi_sector); - multipath_end_bh_io(mp_bh, BLK_STS_IOERR); - } else { - pr_err("multipath: %s: redirecting sector %llu to another IO path\n", - bio_devname(bio, b), - (unsigned long long)bio->bi_iter.bi_sector); - *bio = *(mp_bh->master_bio); - bio->bi_iter.bi_sector += - conf->multipaths[mp_bh->path].rdev->data_offset; - bio_set_dev(bio, conf->multipaths[mp_bh->path].rdev->bdev); - bio->bi_opf |= REQ_FAILFAST_TRANSPORT; - bio->bi_end_io = multipath_end_request; - bio->bi_private = mp_bh; - generic_make_request(bio); - } - } - spin_unlock_irqrestore(&conf->device_lock, flags); -} - -static sector_t multipath_size(struct mddev *mddev, sector_t sectors, int raid_disks) -{ - WARN_ONCE(sectors || raid_disks, - "%s does not support generic reshape\n", __func__); - - return mddev->dev_sectors; -} - -static int multipath_run (struct mddev *mddev) -{ - struct mpconf *conf; - int disk_idx; - struct multipath_info *disk; - struct md_rdev *rdev; - int working_disks; - - if (md_check_no_bitmap(mddev)) - return -EINVAL; - - if (mddev->level != LEVEL_MULTIPATH) { - pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n", - mdname(mddev), mddev->level); - goto out; - } - /* - * copy the already verified devices into our private MULTIPATH - * bookkeeping area. [whatever we allocate in multipath_run(), - * should be freed in multipath_free()] - */ - - conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL); - mddev->private = conf; - if (!conf) - goto out; - - conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks, - GFP_KERNEL); - if (!conf->multipaths) - goto out_free_conf; - - working_disks = 0; - rdev_for_each(rdev, mddev) { - disk_idx = rdev->raid_disk; - if (disk_idx < 0 || - disk_idx >= mddev->raid_disks) - continue; - - disk = conf->multipaths + disk_idx; - disk->rdev = rdev; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - - if (!test_bit(Faulty, &rdev->flags)) - working_disks++; - } - - conf->raid_disks = mddev->raid_disks; - conf->mddev = mddev; - spin_lock_init(&conf->device_lock); - INIT_LIST_HEAD(&conf->retry_list); - - if (!working_disks) { - pr_warn("multipath: no operational IO paths for %s\n", - mdname(mddev)); - goto out_free_conf; - } - mddev->degraded = conf->raid_disks - working_disks; - - conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS, - sizeof(struct multipath_bh)); - if (conf->pool == NULL) - goto out_free_conf; - - mddev->thread = md_register_thread(multipathd, mddev, - "multipath"); - if (!mddev->thread) - goto out_free_conf; - - pr_info("multipath: array %s active with %d out of %d IO paths\n", - mdname(mddev), conf->raid_disks - mddev->degraded, - mddev->raid_disks); - /* - * Ok, everything is just fine now - */ - md_set_array_sectors(mddev, multipath_size(mddev, 0, 0)); - - if (md_integrity_register(mddev)) - goto out_free_conf; - - return 0; - -out_free_conf: - mempool_destroy(conf->pool); - kfree(conf->multipaths); - kfree(conf); - mddev->private = NULL; -out: - return -EIO; -} - -static void multipath_free(struct mddev *mddev, void *priv) -{ - struct mpconf *conf = priv; - - mempool_destroy(conf->pool); - kfree(conf->multipaths); - kfree(conf); -} - -static struct md_personality multipath_personality = -{ - .name = "multipath", - .level = LEVEL_MULTIPATH, - .owner = THIS_MODULE, - .make_request = multipath_make_request, - .run = multipath_run, - .free = multipath_free, - .status = multipath_status, - .error_handler = multipath_error, - .hot_add_disk = multipath_add_disk, - .hot_remove_disk= multipath_remove_disk, - .size = multipath_size, - .congested = multipath_congested, -}; - -static int __init multipath_init (void) -{ - return register_md_personality (&multipath_personality); -} - -static void __exit multipath_exit (void) -{ - unregister_md_personality (&multipath_personality); -} - -module_init(multipath_init); -module_exit(multipath_exit); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("simple multi-path personality for MD"); -MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ -MODULE_ALIAS("md-multipath"); -MODULE_ALIAS("md-level--4"); diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h deleted file mode 100644 index 717c60f62898..000000000000 --- a/drivers/md/multipath.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _MULTIPATH_H -#define _MULTIPATH_H - -struct multipath_info { - struct md_rdev *rdev; -}; - -struct mpconf { - struct mddev *mddev; - struct multipath_info *multipaths; - int raid_disks; - spinlock_t device_lock; - struct list_head retry_list; - - mempool_t *pool; -}; - -/* - * this is our 'private' 'collective' MULTIPATH buffer head. - * it contains information about what kind of IO operations were started - * for this MULTIPATH operation, and about their status: - */ - -struct multipath_bh { - struct mddev *mddev; - struct bio *master_bio; - struct bio bio; - int path; - struct list_head retry_list; -}; -#endif diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 35264ad0ec70..efdabd3040e7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -43,7 +43,7 @@ #include "md.h" #include "raid1.h" -#include "bitmap.h" +#include "md-bitmap.h" #define UNSUPPORTED_MDDEV_FLAGS \ ((1L << MD_HAS_JOURNAL) | \ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 950fbefbedbb..862cbd162e1c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -29,7 +29,7 @@ #include "md.h" #include "raid10.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" /* * RAID10 provides a combination of RAID0 and RAID1 functionality. diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 0b7406ac8ce1..2b450eee21fa 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -23,7 +23,7 @@ #include #include "md.h" #include "raid5.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" /* diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 928e24a07133..10c0d87074f0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -63,7 +63,7 @@ #include "md.h" #include "raid5.h" #include "raid0.h" -#include "bitmap.h" +#include "md-bitmap.h" #include "raid5-log.h" #define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED) -- cgit v1.2.1 From a0e764c54382be8da96f83bcecc9cf26de3846dc Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 11 Oct 2017 11:46:54 +0100 Subject: md: raid10: remove a couple of redundant variables and initializations Variables dev and bio_last_sector are assigned values that are never read and hence these are redundant variables and can be removed. Also remove the duplicated initialization of sectors, the latter assignment is identical to the first and can be removed. Cleans up 3 clang build warnings: Value stored to 'dev' is never read Value stored to 'bio_last_sector' is never read Value stored to 'sectors' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 862cbd162e1c..b0de5b5ee689 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -383,12 +383,11 @@ static void raid10_end_read_request(struct bio *bio) { int uptodate = !bio->bi_status; struct r10bio *r10_bio = bio->bi_private; - int slot, dev; + int slot; struct md_rdev *rdev; struct r10conf *conf = r10_bio->mddev->private; slot = r10_bio->read_slot; - dev = r10_bio->devs[slot].devnum; rdev = r10_bio->devs[slot].rdev; /* * this branch is our 'one mirror IO has finished' event handler: @@ -748,7 +747,6 @@ static struct md_rdev *read_balance(struct r10conf *conf, raid10_find_phys(conf, r10_bio); rcu_read_lock(); - sectors = r10_bio->sectors; best_slot = -1; best_rdev = NULL; best_dist = MaxSector; @@ -2575,7 +2573,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) struct bio *bio; struct r10conf *conf = mddev->private; struct md_rdev *rdev = r10_bio->devs[slot].rdev; - sector_t bio_last_sector; /* we got a read error. Maybe the drive is bad. Maybe just * the block and we can fix it. @@ -2586,7 +2583,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) * frozen. */ bio = r10_bio->devs[slot].bio; - bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors; bio_put(bio); r10_bio->devs[slot].bio = NULL; -- cgit v1.2.1 From 235b6003fb28f0dd8e7ed8fbdb088bb548291766 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 16:18:36 +1100 Subject: raid5: Set R5_Expanded on parity devices as well as data. When reshaping a fully degraded raid5/raid6 to a larger nubmer of devices, the new device(s) are not in-sync and so that can make the newly grown stripe appear to be "failed". To avoid this, we set the R5_Expanded flag to say "Even though this device is not fully in-sync, this block is safe so don't treat the device as failed for this stripe". This flag is set for data devices, not not for parity devices. Consequently, if you have a RAID6 with two devices that are partly recovered and a spare, and start a reshape to include the spare, then when the reshape gets past the point where the recovery was up to, it will think the stripes are failed and will get into an infinite loop, failing to make progress. So when contructing parity on an EXPAND_READY stripe, set R5_Expanded. Reported-by: Curt Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid5.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 10c0d87074f0..a21dbd22a2fb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1818,8 +1818,11 @@ static void ops_complete_reconstruct(void *stripe_head_ref) struct r5dev *dev = &sh->dev[i]; if (dev->written || i == pd_idx || i == qd_idx) { - if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) + if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) { set_bit(R5_UPTODATE, &dev->flags); + if (test_bit(STRIPE_EXPAND_READY, &sh->state)) + set_bit(R5_Expanded, &dev->flags); + } if (fua) set_bit(R5_WantFUA, &dev->flags); if (sync) -- cgit v1.2.1 From 230b55fa8d64007339319539f8f8e68114d08529 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 14:24:09 +1100 Subject: md: forbid a RAID5 from having both a bitmap and a journal. Having both a bitmap and a journal is pointless. Attempting to do so can corrupt the bitmap if the journal replay happens before the bitmap is initialized. Rather than try to avoid this corruption, simply refuse to allow arrays with both a bitmap and a journal. So: - if raid5_run sees both are present, fail. - if adding a bitmap finds a journal is present, fail - if adding a journal finds a bitmap is present, fail. Cc: stable@vger.kernel.org (4.10+) Signed-off-by: NeilBrown Tested-by: Joshua Kinard Acked-by: Joshua Kinard Signed-off-by: Shaohua Li --- drivers/md/md-bitmap.c | 6 ++++++ drivers/md/md.c | 2 +- drivers/md/raid5.c | 7 +++++++ 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index b843b53b0f65..d1b3b60669ea 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1816,6 +1816,12 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot) BUG_ON(file && mddev->bitmap_info.offset); + if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) { + pr_notice("md/raid:%s: array with journal cannot have bitmap\n", + mdname(mddev)); + return ERR_PTR(-EBUSY); + } + bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL); if (!bitmap) return ERR_PTR(-ENOMEM); diff --git a/drivers/md/md.c b/drivers/md/md.c index 97afb28c6f51..6f25e3f1a1cf 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6362,7 +6362,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) break; } } - if (has_journal) { + if (has_journal || mddev->bitmap) { export_rdev(rdev); return -EBUSY; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a21dbd22a2fb..a8732955f130 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7159,6 +7159,13 @@ static int raid5_run(struct mddev *mddev) min_offset_diff = diff; } + if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) && + (mddev->bitmap_info.offset || mddev->bitmap_info.file)) { + pr_notice("md/raid:%s: array cannot have both journal and bitmap\n", + mdname(mddev)); + return -EINVAL; + } + if (mddev->reshape_position != MaxSector) { /* Check that we can continue the reshape. * Difficulties arise if the stripe we would write to -- cgit v1.2.1 From 4d5324f760aacaefeb721b172aa14bf66045c332 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 19 Oct 2017 12:17:16 +1100 Subject: md: always hold reconfig_mutex when calling mddev_suspend() Most often mddev_suspend() is called with reconfig_mutex held. Make this a requirement in preparation a subsequent patch. Also require reconfig_mutex to be held for mddev_resume(), partly for symmetry and partly to guarantee no races with incr/decr of mddev->suspend. Taking the mutex in r5c_disable_writeback_async() is a little tricky as this is called from a work queue via log->disable_writeback_work, and flush_work() is called on that while holding ->reconfig_mutex. If the work item hasn't run before flush_work() is called, the work function will not be able to get the mutex. So we use mddev_trylock() inside the wait_event() call, and have that abort when conf->log is set to NULL, which happens before flush_work() is called. We wait in mddev->sb_wait and ensure this is woken when any of the conditions change. This requires waking mddev->sb_wait in mddev_unlock(). This is only like to trigger extra wake_ups of threads that needn't be woken when metadata is being written, and that doesn't happen often enough that the cost would be noticeable. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/dm-raid.c | 10 ++++++++-- drivers/md/md.c | 3 +++ drivers/md/raid5-cache.c | 18 +++++++++++++----- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 252770696a05..8b1d93114f40 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3628,8 +3628,11 @@ static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; - if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) + if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(&rs->md); mddev_suspend(&rs->md); + mddev_unlock(&rs->md); + } rs->md.ro = 1; } @@ -3886,8 +3889,11 @@ static void raid_resume(struct dm_target *ti) if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS)) clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) + if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { + mddev_lock_nointr(mddev); mddev_resume(mddev); + mddev_unlock(mddev); + } } static struct target_type raid_target = { diff --git a/drivers/md/md.c b/drivers/md/md.c index 6f25e3f1a1cf..9767bb33df56 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -344,6 +344,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio) void mddev_suspend(struct mddev *mddev) { WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->suspended++) return; synchronize_rcu(); @@ -357,6 +358,7 @@ EXPORT_SYMBOL_GPL(mddev_suspend); void mddev_resume(struct mddev *mddev) { + lockdep_assert_held(&mddev->reconfig_mutex); if (--mddev->suspended) return; wake_up(&mddev->sb_wait); @@ -663,6 +665,7 @@ void mddev_unlock(struct mddev *mddev) */ spin_lock(&pers_lock); md_wakeup_thread(mddev->thread); + wake_up(&mddev->sb_wait); spin_unlock(&pers_lock); } EXPORT_SYMBOL_GPL(mddev_unlock); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 2b450eee21fa..59af7cf35092 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -693,6 +693,8 @@ static void r5c_disable_writeback_async(struct work_struct *work) struct r5l_log *log = container_of(work, struct r5l_log, disable_writeback_work); struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + int locked = 0; if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; @@ -701,11 +703,15 @@ static void r5c_disable_writeback_async(struct work_struct *work) /* wait superblock change before suspend */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); - - mddev_suspend(mddev); - log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; - mddev_resume(mddev); + conf->log == NULL || + (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) && + (locked = mddev_trylock(mddev)))); + if (locked) { + mddev_suspend(mddev); + log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH; + mddev_resume(mddev); + mddev_unlock(mddev); + } } static void r5l_submit_current_io(struct r5l_log *log) @@ -3165,6 +3171,8 @@ void r5l_exit_log(struct r5conf *conf) conf->log = NULL; synchronize_rcu(); + /* Ensure disable_writeback_work wakes up and exits */ + wake_up(&conf->mddev->sb_wait); flush_work(&log->disable_writeback_work); md_unregister_thread(&log->reclaim_thread); mempool_destroy(log->meta_pool); -- cgit v1.2.1 From 52a0d49de3d592a3118e13f35985e3d99eaf43df Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: md: don't call bitmap_create() while array is quiesced. bitmap_create() allocates memory with GFP_KERNEL and so can wait for IO. If called while the array is quiesced, it could wait indefinitely for write out to the array - deadlock. So call bitmap_create() before quiescing the array. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 9767bb33df56..2cb49f639809 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6621,22 +6621,26 @@ static int set_bitmap_file(struct mddev *mddev, int fd) return -ENOENT; /* cannot remove what isn't there */ err = 0; if (mddev->pers) { - mddev->pers->quiesce(mddev, 1); if (fd >= 0) { struct bitmap *bitmap; bitmap = bitmap_create(mddev, -1); + mddev->pers->quiesce(mddev, 1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); } else err = PTR_ERR(bitmap); - } - if (fd < 0 || err) { + if (err) { + bitmap_destroy(mddev); + fd = -1; + } + mddev->pers->quiesce(mddev, 0); + } else if (fd < 0) { + mddev->pers->quiesce(mddev, 1); bitmap_destroy(mddev); - fd = -1; /* make sure to put the file */ + mddev->pers->quiesce(mddev, 0); } - mddev->pers->quiesce(mddev, 0); } if (fd < 0) { struct file *f = mddev->bitmap_info.file; @@ -6920,8 +6924,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.default_offset; mddev->bitmap_info.space = mddev->bitmap_info.default_space; - mddev->pers->quiesce(mddev, 1); bitmap = bitmap_create(mddev, -1); + mddev->pers->quiesce(mddev, 1); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev); -- cgit v1.2.1 From b3143b9a38d5039bcd1f2d1c94039651bfba8043 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: md: move suspend_hi/lo handling into core md code responding to ->suspend_lo and ->suspend_hi is similar to responding to ->suspended. It is best to wait in the common core code without incrementing ->active_io. This allows mddev_suspend()/mddev_resume() to work while requests are waiting for suspend_lo/hi to change. This is will be important after a subsequent patch which uses mddev_suspend() to synchronize updating for suspend_lo/hi. So move the code for testing suspend_lo/hi out of raid1.c and raid5.c, and place it in md.c Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 29 +++++++++++++++++++++++------ drivers/md/raid1.c | 14 +++++--------- drivers/md/raid5.c | 22 ---------------------- 3 files changed, 28 insertions(+), 37 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 2cb49f639809..68de2a6ee29a 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -266,16 +266,31 @@ static DEFINE_SPINLOCK(all_mddevs_lock); * call has finished, the bio has been linked into some internal structure * and so is visible to ->quiesce(), so we don't need the refcount any more. */ +static bool is_suspended(struct mddev *mddev, struct bio *bio) +{ + if (mddev->suspended) + return true; + if (bio_data_dir(bio) != WRITE) + return false; + if (mddev->suspend_lo >= mddev->suspend_hi) + return false; + if (bio->bi_iter.bi_sector >= mddev->suspend_hi) + return false; + if (bio_end_sector(bio) < mddev->suspend_lo) + return false; + return true; +} + void md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: rcu_read_lock(); - if (mddev->suspended) { + if (is_suspended(mddev, bio)) { DEFINE_WAIT(__wait); for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, TASK_UNINTERRUPTIBLE); - if (!mddev->suspended) + if (!is_suspended(mddev, bio)) break; rcu_read_unlock(); schedule(); @@ -4845,10 +4860,11 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; old = mddev->suspend_lo; mddev->suspend_lo = new; - if (new >= old) + if (new >= old) { /* Shrinking suspended region */ + wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 2); - else { + } else { /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); @@ -4888,10 +4904,11 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) goto unlock; old = mddev->suspend_hi; mddev->suspend_hi = new; - if (new <= old) + if (new <= old) { /* Shrinking suspended region */ + wake_up(&mddev->sb_wait); mddev->pers->quiesce(mddev, 2); - else { + } else { /* Expanding suspended region - need to wait */ mddev->pers->quiesce(mddev, 1); mddev->pers->quiesce(mddev, 0); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index efdabd3040e7..fb56ef79a1c3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1310,11 +1310,9 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ - if ((bio_end_sector(bio) > mddev->suspend_lo && - bio->bi_iter.bi_sector < mddev->suspend_hi) || - (mddev_is_clustered(mddev) && + if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, - bio->bi_iter.bi_sector, bio_end_sector(bio)))) { + bio->bi_iter.bi_sector, bio_end_sector(bio))) { /* * As the suspend_* range is controlled by userspace, we want @@ -1325,12 +1323,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, sigset_t full, old; prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); - if ((bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_iter.bi_sector >= mddev->suspend_hi) && - (!mddev_is_clustered(mddev) || - !md_cluster_ops->area_resyncing(mddev, WRITE, + if (!mddev_is_clustered(mddev) || + !md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, - bio_end_sector(bio)))) + bio_end_sector(bio))) break; sigfillset(&full); sigprocmask(SIG_BLOCK, &full, &old); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a8732955f130..354a969f50a6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5685,28 +5685,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) goto retry; } - if (rw == WRITE && - logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - raid5_release_stripe(sh); - /* As the suspend_* range is controlled by - * userspace, we want an interruptible - * wait. - */ - prepare_to_wait(&conf->wait_for_overlap, - &w, TASK_INTERRUPTIBLE); - if (logical_sector >= mddev->suspend_lo && - logical_sector < mddev->suspend_hi) { - sigset_t full, old; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); - schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); - do_prepare = true; - } - goto retry; - } - if (test_bit(STRIPE_EXPANDING, &sh->state) || !add_stripe_bio(sh, bi, dd_idx, rw, previous)) { /* Stripe is busy expanding or -- cgit v1.2.1 From 9e1cc0a54556a6c63dc0cfb7cd7d60d43337bba6 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: md: use mddev_suspend/resume instead of ->quiesce() mddev_suspend() is a more general interface than calling ->quiesce() and is so more extensible. A future patch will make use of this. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 68de2a6ee29a..5bd4f18763bd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4866,8 +4866,8 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 2); } else { /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); + mddev_suspend(mddev); + mddev_resume(mddev); } err = 0; unlock: @@ -4910,8 +4910,8 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) mddev->pers->quiesce(mddev, 2); } else { /* Expanding suspended region - need to wait */ - mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); + mddev_suspend(mddev); + mddev_resume(mddev); } err = 0; unlock: @@ -6642,7 +6642,7 @@ static int set_bitmap_file(struct mddev *mddev, int fd) struct bitmap *bitmap; bitmap = bitmap_create(mddev, -1); - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; err = bitmap_load(mddev); @@ -6652,11 +6652,11 @@ static int set_bitmap_file(struct mddev *mddev, int fd) bitmap_destroy(mddev); fd = -1; } - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else if (fd < 0) { - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } } if (fd < 0) { @@ -6942,7 +6942,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.space = mddev->bitmap_info.default_space; bitmap = bitmap_create(mddev, -1); - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); if (!IS_ERR(bitmap)) { mddev->bitmap = bitmap; rv = bitmap_load(mddev); @@ -6950,7 +6950,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) rv = PTR_ERR(bitmap); if (rv) bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); } else { /* remove the bitmap */ if (!mddev->bitmap) { @@ -6973,9 +6973,9 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); } - mddev->pers->quiesce(mddev, 1); + mddev_suspend(mddev); bitmap_destroy(mddev); - mddev->pers->quiesce(mddev, 0); + mddev_resume(mddev); mddev->bitmap_info.offset = 0; } } -- cgit v1.2.1 From 35bfc52187f6df8779d0f1cebdb52b7f797baf4e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 13:46:43 +1100 Subject: md: allow metadata update while suspending. There are various deadlocks that can occur when a thread holds reconfig_mutex and calls ->quiesce(mddev, 1). As some write request block waiting for metadata to be updated (e.g. to record device failure), and as the md thread updates the metadata while the reconfig mutex is held, holding the mutex can stop write requests completing, and this prevents ->quiesce(mddev, 1) from completing. ->quiesce() is now usually called from mddev_suspend(), and it is always called with reconfig_mutex held. So at this time it is safe for the thread to update metadata without explicitly taking the lock. So add 2 new flags, one which says the unlocked updates is allowed, and one which ways it is happening. Then allow it while the quiesce completes, and then wait for it to finish. Reported-and-tested-by: Xiao Ni Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 14 ++++++++++++++ drivers/md/md.h | 6 ++++++ 2 files changed, 20 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5bd4f18763bd..9155f00dca20 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -364,8 +364,12 @@ void mddev_suspend(struct mddev *mddev) return; synchronize_rcu(); wake_up(&mddev->sb_wait); + set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags); + smp_mb__after_atomic(); wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); mddev->pers->quiesce(mddev, 1); + clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags); + wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags)); del_timer_sync(&mddev->safemode_timer); } @@ -8838,6 +8842,16 @@ void md_check_recovery(struct mddev *mddev) unlock: wake_up(&mddev->sb_wait); mddev_unlock(mddev); + } else if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) { + /* Write superblock - thread that called mddev_suspend() + * holds reconfig_mutex for us. + */ + set_bit(MD_UPDATING_SB, &mddev->flags); + smp_mb__after_atomic(); + if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags)) + md_update_sb(mddev, 0); + clear_bit_unlock(MD_UPDATING_SB, &mddev->flags); + wake_up(&mddev->sb_wait); } } EXPORT_SYMBOL(md_check_recovery); diff --git a/drivers/md/md.h b/drivers/md/md.h index d8287d3cd1bf..03fc641e5da1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -237,6 +237,12 @@ enum mddev_flags { */ MD_HAS_PPL, /* The raid array has PPL feature set */ MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */ + MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update + * the metadata without taking reconfig_mutex. + */ + MD_UPDATING_SB, /* md_check_recovery is updating the metadata + * without explicitly holding reconfig_mutex. + */ }; enum mddev_sb_flags { -- cgit v1.2.1 From b03e0ccb5ab9df3efbe51c87843a1ffbecbafa1f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 19 Oct 2017 12:49:15 +1100 Subject: md: remove special meaning of ->quiesce(.., 2) The '2' argument means "wake up anything that is waiting". This is an inelegant part of the design and was added to help support management of suspend_lo/suspend_hi setting. Now that suspend_lo/hi is managed in mddev_suspend/resume, that need is gone. These is still a couple of places where we call 'quiesce' with an argument of '2', but they can safely be changed to call ->quiesce(.., 1); ->quiesce(.., 0) which achieve the same result at the small cost of pausing IO briefly. This removes a small "optimization" from suspend_{hi,lo}_store, but it isn't clear that optimization served a useful purpose. The code now is a lot clearer. Suggested-by: Shaohua Li Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 6 +++--- drivers/md/md.c | 34 ++++++++++------------------------ drivers/md/md.h | 9 ++++----- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 13 +++---------- drivers/md/raid10.c | 10 +++------- drivers/md/raid5-cache.c | 12 ++++++------ drivers/md/raid5-log.h | 2 +- drivers/md/raid5.c | 18 ++++++------------ 9 files changed, 37 insertions(+), 69 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index bc81ecc24c96..d0fd1bd8575c 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -442,10 +442,11 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) static void remove_suspend_info(struct mddev *mddev, int slot) { struct md_cluster_info *cinfo = mddev->cluster_info; + mddev->pers->quiesce(mddev, 1); spin_lock_irq(&cinfo->suspend_lock); __remove_suspend_info(cinfo, slot); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); } @@ -492,13 +493,12 @@ static void process_suspend_info(struct mddev *mddev, s->lo = lo; s->hi = hi; mddev->pers->quiesce(mddev, 1); - mddev->pers->quiesce(mddev, 0); spin_lock_irq(&cinfo->suspend_lock); /* Remove existing entry (if exists) before adding */ __remove_suspend_info(cinfo, slot); list_add(&s->list, &cinfo->suspend_list); spin_unlock_irq(&cinfo->suspend_lock); - mddev->pers->quiesce(mddev, 2); + mddev->pers->quiesce(mddev, 0); } static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9155f00dca20..d441b1d9846c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4846,7 +4846,7 @@ suspend_lo_show(struct mddev *mddev, char *page) static ssize_t suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4862,17 +4862,10 @@ suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) if (mddev->pers == NULL || mddev->pers->quiesce == NULL) goto unlock; - old = mddev->suspend_lo; + mddev_suspend(mddev); mddev->suspend_lo = new; - if (new >= old) { - /* Shrinking suspended region */ - wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 2); - } else { - /* Expanding suspended region - need to wait */ - mddev_suspend(mddev); - mddev_resume(mddev); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); @@ -4890,7 +4883,7 @@ suspend_hi_show(struct mddev *mddev, char *page) static ssize_t suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) { - unsigned long long old, new; + unsigned long long new; int err; err = kstrtoull(buf, 10, &new); @@ -4903,20 +4896,13 @@ suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) if (err) return err; err = -EINVAL; - if (mddev->pers == NULL || - mddev->pers->quiesce == NULL) + if (mddev->pers == NULL) goto unlock; - old = mddev->suspend_hi; + + mddev_suspend(mddev); mddev->suspend_hi = new; - if (new <= old) { - /* Shrinking suspended region */ - wake_up(&mddev->sb_wait); - mddev->pers->quiesce(mddev, 2); - } else { - /* Expanding suspended region - need to wait */ - mddev_suspend(mddev); - mddev_resume(mddev); - } + mddev_resume(mddev); + err = 0; unlock: mddev_unlock(mddev); diff --git a/drivers/md/md.h b/drivers/md/md.h index 03fc641e5da1..998b4ce1498f 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -544,12 +544,11 @@ struct md_personality int (*check_reshape) (struct mddev *mddev); int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); - /* quiesce moves between quiescence states - * 0 - fully active - * 1 - no new requests allowed - * others - reserved + /* quiesce suspends or resumes internal processing. + * 1 - stop new actions and wait for action io to complete + * 0 - return to normal behaviour */ - void (*quiesce) (struct mddev *mddev, int state); + void (*quiesce) (struct mddev *mddev, int quiesce); /* takeover is used to transition an array from one * personality to another. The new personality must be able * to handle the data in the current layout. diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 5a00fc118470..5ecba9eef441 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -768,7 +768,7 @@ static void *raid0_takeover(struct mddev *mddev) return ERR_PTR(-EINVAL); } -static void raid0_quiesce(struct mddev *mddev, int state) +static void raid0_quiesce(struct mddev *mddev, int quiesce) { } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fb56ef79a1c3..9428dfa7e9a0 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -3273,21 +3273,14 @@ static int raid1_reshape(struct mddev *mddev) return 0; } -static void raid1_quiesce(struct mddev *mddev, int state) +static void raid1_quiesce(struct mddev *mddev, int quiesce) { struct r1conf *conf = mddev->private; - switch(state) { - case 2: /* wake for suspend */ - wake_up(&conf->wait_barrier); - break; - case 1: + if (quiesce) freeze_array(conf, 0); - break; - case 0: + else unfreeze_array(conf); - break; - } } static void *raid1_takeover(struct mddev *mddev) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b0de5b5ee689..615f677ceb1a 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3828,18 +3828,14 @@ static void raid10_free(struct mddev *mddev, void *priv) kfree(conf); } -static void raid10_quiesce(struct mddev *mddev, int state) +static void raid10_quiesce(struct mddev *mddev, int quiesce) { struct r10conf *conf = mddev->private; - switch(state) { - case 1: + if (quiesce) raise_barrier(conf, 0); - break; - case 0: + else lower_barrier(conf); - break; - } } static int raid10_resize(struct mddev *mddev, sector_t sectors) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 59af7cf35092..037ed274807f 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -1589,21 +1589,21 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) md_wakeup_thread(log->reclaim_thread); } -void r5l_quiesce(struct r5l_log *log, int state) +void r5l_quiesce(struct r5l_log *log, int quiesce) { struct mddev *mddev; - if (!log || state == 2) + if (!log) return; - if (state == 0) - kthread_unpark(log->reclaim_thread->tsk); - else if (state == 1) { + + if (quiesce) { /* make sure r5l_write_super_and_discard_space exits */ mddev = log->rdev->mddev; wake_up(&mddev->sb_wait); kthread_park(log->reclaim_thread->tsk); r5l_wake_reclaim(log, MaxSector); r5l_do_reclaim(log); - } + } else + kthread_unpark(log->reclaim_thread->tsk); } bool r5l_log_disk_error(struct r5conf *conf) diff --git a/drivers/md/raid5-log.h b/drivers/md/raid5-log.h index 328d67aedda4..c3596a27a5a8 100644 --- a/drivers/md/raid5-log.h +++ b/drivers/md/raid5-log.h @@ -8,7 +8,7 @@ extern void r5l_write_stripe_run(struct r5l_log *log); extern void r5l_flush_stripe_to_raid(struct r5l_log *log); extern void r5l_stripe_write_finished(struct stripe_head *sh); extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio); -extern void r5l_quiesce(struct r5l_log *log, int state); +extern void r5l_quiesce(struct r5l_log *log, int quiesce); extern bool r5l_log_disk_error(struct r5conf *conf); extern bool r5c_is_writeback(struct r5l_log *log); extern int diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 354a969f50a6..17ffa1e44c84 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -8008,16 +8008,12 @@ static void raid5_finish_reshape(struct mddev *mddev) } } -static void raid5_quiesce(struct mddev *mddev, int state) +static void raid5_quiesce(struct mddev *mddev, int quiesce) { struct r5conf *conf = mddev->private; - switch(state) { - case 2: /* resume for a suspend */ - wake_up(&conf->wait_for_overlap); - break; - - case 1: /* stop all writes */ + if (quiesce) { + /* stop all writes */ lock_all_device_hash_locks_irq(conf); /* '2' tells resync/reshape to pause so that all * active stripes can drain @@ -8033,17 +8029,15 @@ static void raid5_quiesce(struct mddev *mddev, int state) unlock_all_device_hash_locks_irq(conf); /* allow reshape to continue */ wake_up(&conf->wait_for_overlap); - break; - - case 0: /* re-enable writes */ + } else { + /* re-enable writes */ lock_all_device_hash_locks_irq(conf); conf->quiesce = 0; wake_up(&conf->wait_for_quiescent); wake_up(&conf->wait_for_overlap); unlock_all_device_hash_locks_irq(conf); - break; } - r5l_quiesce(conf->log, state); + r5l_quiesce(conf->log, quiesce); } static void *raid45_takeover_raid0(struct mddev *mddev, int level) -- cgit v1.2.1 From ae89fd3de4793c0dc2ec7e9f26b58a357d74a6c7 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Wed, 18 Oct 2017 19:01:11 -0400 Subject: md: use TASK_IDLE instead of blocking signals Hi - I submit this patch for the next merge window: Some times ago, I made a patch f9c79bc05a2a that blocks signals around the schedule() calls in MD. The MD subsystem needs to do an uninterruptible sleep that is not accounted in load average - so we block signals and use interruptible sleep. The kernel has a special TASK_IDLE state for this purpose, so we can use it instead of blocking signals. This patch doesn't fix any bug, it just makes the code simpler. Signed-off-by: Mikulas Patocka Acked-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 7 +------ drivers/md/raid5.c | 1 - 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 9428dfa7e9a0..1f36473c79dc 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -37,7 +37,6 @@ #include #include #include -#include #include @@ -1320,18 +1319,14 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, */ DEFINE_WAIT(w); for (;;) { - sigset_t full, old; prepare_to_wait(&conf->wait_barrier, - &w, TASK_INTERRUPTIBLE); + &w, TASK_IDLE); if (!mddev_is_clustered(mddev) || !md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) break; - sigfillset(&full); - sigprocmask(SIG_BLOCK, &full, &old); schedule(); - sigprocmask(SIG_SETMASK, &old, NULL); } finish_wait(&conf->wait_barrier, &w); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 17ffa1e44c84..2a4b34941d86 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -55,7 +55,6 @@ #include #include #include -#include #include #include -- cgit v1.2.1 From f6eca2d43ed694ab8124dd24c88277f7eca93b7d Mon Sep 17 00:00:00 2001 From: Nate Dailey Date: Tue, 17 Oct 2017 08:17:03 -0400 Subject: raid1: prevent freeze_array/wait_all_barriers deadlock If freeze_array is attempted in the middle of close_sync/ wait_all_barriers, deadlock can occur. freeze_array will wait for nr_pending and nr_queued to line up. wait_all_barriers increments nr_pending for each barrier bucket, one at a time, but doesn't actually issue IO that could be counted in nr_queued. So freeze_array is blocked until wait_all_barriers completes and allow_all_barriers runs. At the same time, when _wait_barrier sees array_frozen == 1, it stops and waits for freeze_array to complete. Prevent the deadlock by making close_sync call _wait_barrier and _allow_barrier for one bucket at a time, instead of deferring the _allow_barrier calls until after all _wait_barriers are complete. Signed-off-by: Nate Dailey Fix: fd76863e37fe(RAID1: a new I/O barrier implementation to remove resync window) Reviewed-by: Coly Li Cc: stable@vger.kernel.org (v4.11) Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1f36473c79dc..038f5eb299ce 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -989,14 +989,6 @@ static void wait_barrier(struct r1conf *conf, sector_t sector_nr) _wait_barrier(conf, idx); } -static void wait_all_barriers(struct r1conf *conf) -{ - int idx; - - for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - _wait_barrier(conf, idx); -} - static void _allow_barrier(struct r1conf *conf, int idx) { atomic_dec(&conf->nr_pending[idx]); @@ -1010,14 +1002,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr) _allow_barrier(conf, idx); } -static void allow_all_barriers(struct r1conf *conf) -{ - int idx; - - for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) - _allow_barrier(conf, idx); -} - /* conf->resync_lock should be held */ static int get_unqueued_pending(struct r1conf *conf) { @@ -1645,8 +1629,12 @@ static void print_conf(struct r1conf *conf) static void close_sync(struct r1conf *conf) { - wait_all_barriers(conf); - allow_all_barriers(conf); + int idx; + + for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) { + _wait_barrier(conf, idx); + _allow_barrier(conf, idx); + } mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; -- cgit v1.2.1 From efa4b77b00b56138fb7e68d2fe8fd1b3c15cd503 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 18 Oct 2017 22:08:13 -0700 Subject: md: use lockdep_assert_held lockdep_assert_held is a better way to assert lock held, and it works for UP. Signed-off-by: Shaohua Li --- drivers/md/md.c | 4 ++-- drivers/md/md.h | 5 ----- drivers/md/raid5-cache.c | 12 ++++++------ 3 files changed, 8 insertions(+), 13 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index d441b1d9846c..5a0ec1d1a6e8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2335,7 +2335,7 @@ static void export_array(struct mddev *mddev) static bool set_in_sync(struct mddev *mddev) { - WARN_ON_ONCE(NR_CPUS != 1 && !spin_is_locked(&mddev->lock)); + lockdep_assert_held(&mddev->lock); if (!mddev->in_sync) { mddev->sync_checkers++; spin_unlock(&mddev->lock); @@ -6749,7 +6749,7 @@ static int set_array_info(struct mddev *mddev, mdu_array_info_t *info) void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) { - WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); + lockdep_assert_held(&mddev->reconfig_mutex); if (mddev->external_size) return; diff --git a/drivers/md/md.h b/drivers/md/md.h index 998b4ce1498f..7d6bcf0eba0c 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -500,11 +500,6 @@ static inline void mddev_lock_nointr(struct mddev *mddev) mutex_lock(&mddev->reconfig_mutex); } -static inline int mddev_is_locked(struct mddev *mddev) -{ - return mutex_is_locked(&mddev->reconfig_mutex); -} - static inline int mddev_trylock(struct mddev *mddev) { return mutex_trylock(&mddev->reconfig_mutex); diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 037ed274807f..f1c86d938502 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -539,7 +539,7 @@ static void r5l_log_run_stripes(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -555,7 +555,7 @@ static void r5l_move_to_end_ios(struct r5l_log *log) { struct r5l_io_unit *io, *next; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) { /* don't change list order */ @@ -1200,7 +1200,7 @@ static void r5l_run_no_mem_stripe(struct r5l_log *log) { struct stripe_head *sh; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); if (!list_empty(&log->no_mem_stripes)) { sh = list_first_entry(&log->no_mem_stripes, @@ -1216,7 +1216,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log) struct r5l_io_unit *io, *next; bool found = false; - assert_spin_locked(&log->io_list_lock); + lockdep_assert_held(&log->io_list_lock); list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) { /* don't change list order */ @@ -1388,7 +1388,7 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) * raid5_release_stripe() while holding conf->device_lock */ BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); - assert_spin_locked(&conf->device_lock); + lockdep_assert_held(&conf->device_lock); list_del_init(&sh->lru); atomic_inc(&sh->count); @@ -1415,7 +1415,7 @@ void r5c_flush_cache(struct r5conf *conf, int num) int count; struct stripe_head *sh, *next; - assert_spin_locked(&conf->device_lock); + lockdep_assert_held(&conf->device_lock); if (!conf->log) return; -- cgit v1.2.1 From d4098c7262a47f529765d89614484a957363d623 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Oct 2017 15:11:50 +0800 Subject: md-cluster/raid10: set "do_balance = 0" if area is resyncing Just like clustered raid1, it is impossible for cluster raid10 to choose the best device for read balance when the area of array is resyncing. Because we cannot trust the data to be the same on all devices at that time, so we choose just the first one to use, so set do_balance to 0. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 615f677ceb1a..61890231972e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -759,8 +759,11 @@ static struct md_rdev *read_balance(struct r10conf *conf, * the resync window. We take the first readable disk when * above the resync window. */ - if (conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) + if ((conf->mddev->recovery_cp < MaxSector + && (this_sector + sectors >= conf->next_resync)) || + (mddev_is_clustered(conf->mddev) && + md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, + this_sector + sectors))) do_balance = 0; for (slot = 0; slot < conf->copies ; slot++) { -- cgit v1.2.1 From cb8a7a7e1098e74d36378b992a6d012668ec10d9 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Oct 2017 15:11:51 +0800 Subject: md-cluster: Suspend writes in RAID10 if within range If there is a resync going on, all nodes must suspend writes to the range. This is recorded in suspend_info and suspend_list. If there is an I/O within the ranges of any of the suspend_info, area_resyncing will return 1. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 61890231972e..cc6a56a659a3 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1294,6 +1294,22 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, sector_t sectors; int max_sectors; + if ((mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, + bio_end_sector(bio)))) { + DEFINE_WAIT(w); + for (;;) { + prepare_to_wait(&conf->wait_barrier, + &w, TASK_IDLE); + if (!md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))) + break; + schedule(); + } + finish_wait(&conf->wait_barrier, &w); + } + /* * Register the new request and wait if the reconstruction * thread has put up a bar for new requests. -- cgit v1.2.1 From 8db87912c9a8771c53b98845cd5516ea63b22e1e Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Oct 2017 15:11:52 +0800 Subject: md-cluster: Use a small window for raid10 resync Suspending the entire device for resync could take too long. Resync in small chunks. cluster's resync window is maintained in r10conf as cluster_sync_low and cluster_sync_high, and processed in raid10's sync_request(). If the current resync is outside the cluster resync window: 1. Set the cluster_sync_low to curr_resync_completed. 2. Set cluster_sync_high to cluster_sync_low + stripe size. 3. Send a message to all nodes so they may add it in their suspension list. Note: We only support "near" raid10 so far, resync a far or offset raid10 array could have trouble. So raid10_run checks the layout of clustered raid10, it will refuse to run if the layout is not correct. With the "near" layout we process one stripe at a time progressing monotonically through the address space. So we can have a sliding window of whole-stripes which moves through the array suspending IO on other nodes, and both resync which uses array addresses and recovery which uses device addresses can stay within this window. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid10.c | 113 +++++++++++++++++++++++++++++++++++++++++++++++++++- drivers/md/raid10.h | 6 +++ 2 files changed, 118 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index cc6a56a659a3..b9edbc747a95 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data) kfree(r10_bio); } +#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) /* amount of memory to reserve for resync requests */ #define RESYNC_WINDOW (1024*1024) /* maximum number of concurrent requests, memory permitting */ #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) /* * When performing a resync, we need to read and compare, so @@ -2840,6 +2843,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf) return r10bio; } +/* + * Set cluster_sync_high since we need other nodes to add the + * range [cluster_sync_low, cluster_sync_high] to suspend list. + */ +static void raid10_set_cluster_sync_high(struct r10conf *conf) +{ + sector_t window_size; + int extra_chunk, chunks; + + /* + * First, here we define "stripe" as a unit which across + * all member devices one time, so we get chunks by use + * raid_disks / near_copies. Otherwise, if near_copies is + * close to raid_disks, then resync window could increases + * linearly with the increase of raid_disks, which means + * we will suspend a really large IO window while it is not + * necessary. If raid_disks is not divisible by near_copies, + * an extra chunk is needed to ensure the whole "stripe" is + * covered. + */ + + chunks = conf->geo.raid_disks / conf->geo.near_copies; + if (conf->geo.raid_disks % conf->geo.near_copies == 0) + extra_chunk = 0; + else + extra_chunk = 1; + window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors; + + /* + * At least use a 32M window to align with raid1's resync window + */ + window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ? + CLUSTER_RESYNC_WINDOW_SECTORS : window_size; + + conf->cluster_sync_high = conf->cluster_sync_low + window_size; +} + /* * perform a "sync" on one "block" * @@ -2912,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) max_sector = mddev->resync_max_sectors; if (sector_nr >= max_sector) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + /* If we aborted, we need to abort the * sync on the 'current' bitmap chucks (there can * be several when recovering multiple devices). @@ -3266,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* resync. Schedule a read for every block at this virt offset */ int count = 0; - bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0); + /* + * Since curr_resync_completed could probably not update in + * time, and we will set cluster_sync_low based on it. + * Let's check against "sector_nr + 2 * RESYNC_SECTORS" for + * safety reason, which ensures curr_resync_completed is + * updated in bitmap_cond_end_sync. + */ + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && + (sector_nr + 2 * RESYNC_SECTORS > + conf->cluster_sync_high)); if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, mddev->degraded) && @@ -3400,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, } while (++page_idx < RESYNC_PAGES); r10_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { + /* It is resync not recovery */ + if (conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + raid10_set_cluster_sync_high(conf); + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } else if (mddev_is_clustered(mddev)) { + /* This is recovery not resync */ + sector_t sect_va1, sect_va2; + bool broadcast_msg = false; + + for (i = 0; i < conf->geo.raid_disks; i++) { + /* + * sector_nr is a device address for recovery, so we + * need translate it to array address before compare + * with cluster_sync_high. + */ + sect_va1 = raid10_find_virt(conf, sector_nr, i); + + if (conf->cluster_sync_high < sect_va1 + nr_sectors) { + broadcast_msg = true; + /* + * curr_resync_completed is similar as + * sector_nr, so make the translation too. + */ + sect_va2 = raid10_find_virt(conf, + mddev->curr_resync_completed, i); + + if (conf->cluster_sync_low == 0 || + conf->cluster_sync_low > sect_va2) + conf->cluster_sync_low = sect_va2; + } + } + if (broadcast_msg) { + raid10_set_cluster_sync_high(conf); + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + } + while (biolist) { bio = biolist; biolist = biolist->bi_next; @@ -3659,6 +3758,18 @@ static int raid10_run(struct mddev *mddev) if (!conf) goto out; + if (mddev_is_clustered(conf->mddev)) { + int fc, fo; + + fc = (mddev->layout >> 8) & 255; + fo = mddev->layout & (1<<16); + if (fc > 1 || fo > 0) { + pr_err("only near layout is supported by clustered" + " raid10\n"); + goto out; + } + } + mddev->thread = conf->thread; conf->thread = NULL; diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 735ce1a3d260..2bef4e8789c8 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -88,6 +88,12 @@ struct r10conf { * the new thread here until we fully activate the array. */ struct md_thread *thread; + + /* + * Keep track of cluster resync window to send to other nodes. + */ + sector_t cluster_sync_low; + sector_t cluster_sync_high; }; /* -- cgit v1.2.1 From f81f7302e86f5c0a21b59c94164f2510812b7764 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Oct 2017 15:33:33 +0800 Subject: raid1: remove obsolete code in raid1_write_request There are some lines could be removed due to recent change for raid1 such as commit 3956df15d634 ("md: move suspend_hi/lo handling into core md code"). Also, seems some comments are put to wrong place, move them before wait_barrier. Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/raid1.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 038f5eb299ce..cc9d337a1ed3 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1286,27 +1286,15 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, int first_clone; int max_sectors; - /* - * Register the new request and wait if the reconstruction - * thread has put up a bar for new requests. - * Continue immediately if no resync is active currently. - */ - - if (mddev_is_clustered(mddev) && md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) { - /* - * As the suspend_* range is controlled by userspace, we want - * an interruptible wait. - */ DEFINE_WAIT(w); for (;;) { prepare_to_wait(&conf->wait_barrier, &w, TASK_IDLE); - if (!mddev_is_clustered(mddev) || - !md_cluster_ops->area_resyncing(mddev, WRITE, + if (!md_cluster_ops->area_resyncing(mddev, WRITE, bio->bi_iter.bi_sector, bio_end_sector(bio))) break; @@ -1314,6 +1302,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, } finish_wait(&conf->wait_barrier, &w); } + + /* + * Register the new request and wait if the reconstruction + * thread has put up a bar for new requests. + * Continue immediately if no resync is active currently. + */ wait_barrier(conf, bio->bi_iter.bi_sector); r1_bio = alloc_r1bio(mddev, bio); -- cgit v1.2.1 From fc33060ba0c78310f6398357ffca8f55a4c41cee Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 27 Oct 2017 16:59:06 +0100 Subject: md: remove redundant variable q The pointer q is assigned but never read; it is redundant and can be removed. Cleans up clang warning: drivers/md/md-multipath.c:260:4: warning: Value stored to 'q' is never read Signed-off-by: Colin Ian King Signed-off-by: Shaohua Li --- drivers/md/md-multipath.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 5c70176fa24d..e40065bdbfc8 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -243,7 +243,6 @@ static void print_multipath_conf (struct mpconf *conf) static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct mpconf *conf = mddev->private; - struct request_queue *q; int err = -EEXIST; int path; struct multipath_info *p; @@ -257,7 +256,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev) for (path = first; path <= last; path++) if ((p=conf->multipaths+path)->rdev == NULL) { - q = rdev->bdev->bd_disk->queue; disk_stack_limits(mddev->gendisk, rdev->bdev, rdev->data_offset << 9); -- cgit v1.2.1 From f0e230ad877855567607fe2f40802b6317ad38f3 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 24 Oct 2017 15:11:53 +0800 Subject: md-cluster: update document for raid10 Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/Kconfig | 5 +++-- drivers/md/md-cluster.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 4a249ee86364..83b9362be09c 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -178,7 +178,7 @@ config MD_FAULTY config MD_CLUSTER - tristate "Cluster Support for MD (EXPERIMENTAL)" + tristate "Cluster Support for MD" depends on BLK_DEV_MD depends on DLM default n @@ -188,7 +188,8 @@ config MD_CLUSTER nodes in the cluster can access the MD devices simultaneously. This brings the redundancy (and uptime) of RAID levels across the - nodes of the cluster. + nodes of the cluster. Currently, it can work with raid1 and raid10 + (limited support). If unsure, say N. diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index d0fd1bd8575c..79bfbc840385 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1478,7 +1478,7 @@ static struct md_cluster_operations cluster_ops = { static int __init cluster_init(void) { - pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n"); + pr_warn("md-cluster: support raid1 and raid10 (limited support)\n"); pr_info("Registering Cluster MD functions\n"); register_md_cluster_operations(&cluster_ops, THIS_MODULE); return 0; -- cgit v1.2.1 From b90f6ff080c52e2f05364210733df120e3c4e597 Mon Sep 17 00:00:00 2001 From: Artur Paszkiewicz Date: Thu, 26 Oct 2017 15:56:54 +0200 Subject: md: don't check MD_SB_CHANGE_CLEAN in md_allow_write Only MD_SB_CHANGE_PENDING should be used to wait for transition from clean to dirty. Checking also MD_SB_CHANGE_CLEAN is unnecessary and can race with e.g. md_do_sync(). This sporadically causes a hang when changing consistency policy during resync: INFO: task mdadm:6183 blocked for more than 30 seconds. Not tainted 4.14.0-rc3+ #391 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. mdadm D12752 6183 6022 0x00000000 Call Trace: __schedule+0x93f/0x990 schedule+0x6b/0x90 md_allow_write+0x100/0x130 [md_mod] ? do_wait_intr_irq+0x90/0x90 resize_stripes+0x3a/0x5b0 [raid456] ? kernfs_fop_write+0xbe/0x180 raid5_change_consistency_policy+0xa6/0x200 [raid456] consistency_policy_store+0x2e/0x70 [md_mod] md_attr_store+0x90/0xc0 [md_mod] sysfs_kf_write+0x42/0x50 kernfs_fop_write+0x119/0x180 __vfs_write+0x28/0x110 ? rcu_sync_lockdep_assert+0x12/0x60 ? __sb_start_write+0x15a/0x1c0 ? vfs_write+0xa3/0x1a0 vfs_write+0xb4/0x1a0 SyS_write+0x49/0xa0 entry_SYSCALL_64_fastpath+0x18/0xad Fixes: 2214c260c72b ("md: don't return -EAGAIN in md_allow_write for external metadata arrays") Cc: Signed-off-by: Artur Paszkiewicz Signed-off-by: Shaohua Li --- drivers/md/md.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 5a0ec1d1a6e8..2bf4cc41b4f8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8125,7 +8125,6 @@ void md_allow_write(struct mddev *mddev) sysfs_notify_dirent_safe(mddev->sysfs_state); /* wait for the dirty state to be recorded in the metadata */ wait_event(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) && !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); } else spin_unlock(&mddev->lock); -- cgit v1.2.1 From db0505d320660b6ad92418847e7eca6b61b246ac Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 17 Oct 2017 16:18:36 +1100 Subject: md: be cautious about using ->curr_resync_completed for ->recovery_offset The ->recovery_offset shows how much of a non-InSync device is actually in sync - how much has been recoveryed. When performing a recovery, ->curr_resync and ->curr_resync_completed follow the device address being recovered and so can be used to update ->recovery_offset. When performing a reshape, ->curr_resync* might follow the device addresses (raid5) or might follow array addresses (raid10), so cannot in general be used to set ->recovery_offset. When reshaping backwards, ->curre_resync* measures from the *end* of the array-or-device, so is particularly unhelpful. So change the common code in md.c to only use ->curr_resync_complete for the simple recovery case, and add code to raid5.c to update ->recovery_offset during a forwards reshape. Signed-off-by: NeilBrown Signed-off-by: Shaohua Li --- drivers/md/md.c | 33 ++++++++++++++++++++++----------- drivers/md/raid5.c | 24 ++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 11 deletions(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 2bf4cc41b4f8..15e4668f594c 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2454,10 +2454,18 @@ repeat: } } - /* First make sure individual recovery_offsets are correct */ + /* + * First make sure individual recovery_offsets are correct + * curr_resync_completed can only be used during recovery. + * During reshape/resync it might use array-addresses rather + * that device addresses. + */ rdev_for_each(rdev, mddev) { if (rdev->raid_disk >= 0 && mddev->delta_disks >= 0 && + test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) && + !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && !test_bit(Journal, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && mddev->curr_resync_completed > rdev->recovery_offset) @@ -8491,16 +8499,19 @@ void md_do_sync(struct md_thread *thread) } else { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) mddev->curr_resync = MaxSector; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (rdev->raid_disk >= 0 && - mddev->delta_disks >= 0 && - !test_bit(Journal, &rdev->flags) && - !test_bit(Faulty, &rdev->flags) && - !test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < mddev->curr_resync) - rdev->recovery_offset = mddev->curr_resync; - rcu_read_unlock(); + if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && + test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + if (rdev->raid_disk >= 0 && + mddev->delta_disks >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(Faulty, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < mddev->curr_resync) + rdev->recovery_offset = mddev->curr_resync; + rcu_read_unlock(); + } } } skip: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2a4b34941d86..1649e82faae2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5738,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk */ struct r5conf *conf = mddev->private; struct stripe_head *sh; + struct md_rdev *rdev; sector_t first_sector, last_sector; int raid_disks = conf->previous_raid_disks; int data_disks = raid_disks - conf->max_degraded; @@ -5860,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk return 0; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; + conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -5958,6 +5968,14 @@ finish: goto ret; mddev->reshape_position = conf->reshape_progress; mddev->curr_resync_completed = sector_nr; + if (!mddev->reshape_backwards) + /* Can update recovery_offset */ + rdev_for_each(rdev, mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < sector_nr) + rdev->recovery_offset = sector_nr; conf->reshape_checkpoint = jiffies; set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); md_wakeup_thread(mddev->thread); @@ -7945,6 +7963,7 @@ static void end_reshape(struct r5conf *conf) { if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { + struct md_rdev *rdev; spin_lock_irq(&conf->device_lock); conf->previous_raid_disks = conf->raid_disks; @@ -7952,6 +7971,11 @@ static void end_reshape(struct r5conf *conf) smp_wmb(); conf->reshape_progress = MaxSector; conf->mddev->reshape_position = MaxSector; + rdev_for_each(rdev, conf->mddev) + if (rdev->raid_disk >= 0 && + !test_bit(Journal, &rdev->flags) && + !test_bit(In_sync, &rdev->flags)) + rdev->recovery_offset = MaxSector; spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); -- cgit v1.2.1 From 97f0eb9f0fec0563c1c796d95123e871b8bb65c0 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Mon, 6 Nov 2017 10:11:25 +0800 Subject: md/bitmap: clear BITMAP_WRITE_ERROR bit before writing it to sb For a RAID1 device using a file-based bitmap, if a bitmap write error occurs but the later writes succeed, it's possible both BITMAP_STALE and BITMAP_WRITE_ERROR bits will be written to the bitmap super block, the BITMAP_STALE bit will be handled properly and be cleared, but the BITMAP_WRITE_ERROR bit in sb->flags will make bitmap_create() to fail. So clear it to protect against the write failure-and-then-recovery case. Signed-off-by: Hou Tao Signed-off-by: Shaohua Li --- drivers/md/md-bitmap.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index d1b3b60669ea..a60e46529d9f 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -459,7 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap) /* rocking back to read-only */ bitmap->events_cleared = bitmap->mddev->events; sb->events_cleared = cpu_to_le64(bitmap->events_cleared); - sb->state = cpu_to_le32(bitmap->flags); + /* + * clear BITMAP_WRITE_ERROR bit to protect against the case that + * a bitmap write error occurred but the later writes succeeded. + */ + sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR)); /* Just in case these have been changed via sysfs: */ sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); -- cgit v1.2.1 From 0202ce8a90efdc81600e7bf9712d8c324081a924 Mon Sep 17 00:00:00 2001 From: Zdenek Kabelac Date: Wed, 8 Nov 2017 13:44:55 +0100 Subject: md: release allocated bitset sync_set Patch fixes kmemleak on md_stop() path used likely only by dm-raid wrapper. Code of md is using mddev_put() where both bitsets are released however this freeing is not shared. Also set NULL to bio_set and sync_set pointers just like mddev_put is doing. Signed-off-by: Zdenek Kabelac Signed-off-by: Shaohua Li --- drivers/md/md.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers/md') diff --git a/drivers/md/md.c b/drivers/md/md.c index 15e4668f594c..e014d39159d7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5852,8 +5852,14 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); - if (mddev->bio_set) + if (mddev->bio_set) { bioset_free(mddev->bio_set); + mddev->bio_set = NULL; + } + if (mddev->sync_set) { + bioset_free(mddev->sync_set); + mddev->sync_set = NULL; + } } EXPORT_SYMBOL_GPL(md_stop); -- cgit v1.2.1 From 0868b99c214a3d55486c700de7c3f770b7243e7c Mon Sep 17 00:00:00 2001 From: Zdenek Kabelac Date: Wed, 8 Nov 2017 13:44:56 +0100 Subject: md: free unused memory after bitmap resize When bitmap is resized, the old kalloced chunks just are not released once the resized bitmap starts to use new space. This fixes in particular kmemleak reports like this one: unreferenced object 0xffff8f4311e9c000 (size 4096): comm "lvm", pid 19333, jiffies 4295263268 (age 528.265s) hex dump (first 32 bytes): 02 80 02 80 02 80 02 80 02 80 02 80 02 80 02 80 ................ 02 80 02 80 02 80 02 80 02 80 02 80 02 80 02 80 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0x14e/0x2e0 [] bitmap_checkpage+0x7c/0x110 [] bitmap_get_counter+0x45/0xd0 [] bitmap_set_memory_bits+0x43/0xe0 [] bitmap_init_from_disk+0x23c/0x530 [] bitmap_load+0xbe/0x160 [] raid_preresume+0x203/0x2f0 [dm_raid] [] dm_table_resume_targets+0x4f/0xe0 [] dm_resume+0x122/0x140 [] dev_suspend+0x18f/0x290 [] ctl_ioctl+0x287/0x560 [] dm_ctl_ioctl+0x13/0x20 [] do_vfs_ioctl+0xa6/0x750 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x1f/0xc2 Signed-off-by: Zdenek Kabelac Signed-off-by: Shaohua Li --- drivers/md/md-bitmap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers/md') diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index a60e46529d9f..bb45c0ccc1bf 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2162,6 +2162,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, for (k = 0; k < page; k++) { kfree(new_bp[k].map); } + kfree(new_bp); /* restore some fields from old_counts */ bitmap->counts.bp = old_counts.bp; @@ -2212,6 +2213,14 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, block += old_blocks; } + if (bitmap->counts.bp != old_counts.bp) { + unsigned long k; + for (k = 0; k < old_counts.pages; k++) + if (!old_counts.bp[k].hijacked) + kfree(old_counts.bp[k].map); + kfree(old_counts.bp); + } + if (!init) { int i; while (block < (chunks << chunkshift)) { -- cgit v1.2.1