summaryrefslogtreecommitdiff
path: root/drivers/md/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--drivers/md/md.c593
1 files changed, 435 insertions, 158 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 717daad71fb1..d429c30cd514 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -53,6 +53,7 @@
#include <linux/slab.h>
#include "md.h"
#include "bitmap.h"
+#include "md-cluster.h"
#ifndef MODULE
static void autostart_arrays(int part);
@@ -66,6 +67,11 @@ static void autostart_arrays(int part);
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);
+struct md_cluster_operations *md_cluster_ops;
+EXPORT_SYMBOL(md_cluster_ops);
+struct module *md_cluster_mod;
+EXPORT_SYMBOL(md_cluster_mod);
+
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
@@ -249,6 +255,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
const int rw = bio_data_dir(bio);
struct mddev *mddev = q->queuedata;
unsigned int sectors;
+ int cpu;
if (mddev == NULL || mddev->pers == NULL
|| !mddev->ready) {
@@ -284,7 +291,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
sectors = bio_sectors(bio);
mddev->pers->make_request(mddev, bio);
- generic_start_io_acct(rw, sectors, &mddev->gendisk->part0);
+ cpu = part_stat_lock();
+ part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+ part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+ part_stat_unlock();
if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
wake_up(&mddev->sb_wait);
@@ -636,7 +646,7 @@ void mddev_unlock(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(mddev_unlock);
-static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
+struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
struct md_rdev *rdev;
@@ -646,6 +656,7 @@ static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
return NULL;
}
+EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
@@ -2013,7 +2024,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
{
char b[BDEVNAME_SIZE];
struct kobject *ko;
- char *s;
int err;
/* prevent duplicates */
@@ -2043,11 +2053,11 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
int choice = 0;
if (mddev->pers)
choice = mddev->raid_disks;
- while (find_rdev_nr_rcu(mddev, choice))
+ while (md_find_rdev_nr_rcu(mddev, choice))
choice++;
rdev->desc_nr = choice;
} else {
- if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
+ if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
rcu_read_unlock();
return -EBUSY;
}
@@ -2059,8 +2069,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
return -EBUSY;
}
bdevname(rdev->bdev,b);
- while ( (s=strchr(b, '/')) != NULL)
- *s = '!';
+ strreplace(b, '/', '!');
rdev->mddev = mddev;
printk(KERN_INFO "md: bind<%s>\n", b);
@@ -2162,11 +2171,12 @@ static void export_rdev(struct md_rdev *rdev)
kobject_put(&rdev->kobj);
}
-static void kick_rdev_from_array(struct md_rdev *rdev)
+void md_kick_rdev_from_array(struct md_rdev *rdev)
{
unbind_rdev_from_array(rdev);
export_rdev(rdev);
}
+EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
static void export_array(struct mddev *mddev)
{
@@ -2175,7 +2185,7 @@ static void export_array(struct mddev *mddev)
while (!list_empty(&mddev->disks)) {
rdev = list_first_entry(&mddev->disks, struct md_rdev,
same_set);
- kick_rdev_from_array(rdev);
+ md_kick_rdev_from_array(rdev);
}
mddev->raid_disks = 0;
mddev->major_version = 0;
@@ -2204,7 +2214,7 @@ static void sync_sbs(struct mddev *mddev, int nospares)
}
}
-static void md_update_sb(struct mddev *mddev, int force_change)
+void md_update_sb(struct mddev *mddev, int force_change)
{
struct md_rdev *rdev;
int sync_req;
@@ -2365,6 +2375,37 @@ repeat:
wake_up(&rdev->blocked_wait);
}
}
+EXPORT_SYMBOL(md_update_sb);
+
+static int add_bound_rdev(struct md_rdev *rdev)
+{
+ struct mddev *mddev = rdev->mddev;
+ int err = 0;
+
+ if (!mddev->pers->hot_remove_disk) {
+ /* If there is hot_add_disk but no hot_remove_disk
+ * then added disks for geometry changes,
+ * and should be added immediately.
+ */
+ super_types[mddev->major_version].
+ validate_super(mddev, rdev);
+ err = mddev->pers->hot_add_disk(mddev, rdev);
+ if (err) {
+ unbind_rdev_from_array(rdev);
+ export_rdev(rdev);
+ return err;
+ }
+ }
+ sysfs_notify_dirent_safe(rdev->sysfs_state);
+
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (mddev->degraded)
+ set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_new_event(mddev);
+ md_wakeup_thread(mddev->thread);
+ return 0;
+}
/* words written to sysfs files may, or may not, be \n terminated.
* We want to accept with case. For this we use cmd_match.
@@ -2467,10 +2508,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = -EBUSY;
else {
struct mddev *mddev = rdev->mddev;
- kick_rdev_from_array(rdev);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->remove_disk(mddev, rdev);
+ md_kick_rdev_from_array(rdev);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
if (mddev->pers)
md_update_sb(mddev, 1);
md_new_event(mddev);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
err = 0;
}
} else if (cmd_match(buf, "writemostly")) {
@@ -2549,6 +2596,21 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
clear_bit(Replacement, &rdev->flags);
err = 0;
}
+ } else if (cmd_match(buf, "re-add")) {
+ if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
+ /* clear_bit is performed _after_ all the devices
+ * have their local Faulty bit cleared. If any writes
+ * happen in the meantime in the local node, they
+ * will land in the local bitmap, which will be synced
+ * by this node eventually
+ */
+ if (!mddev_is_clustered(rdev->mddev) ||
+ (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
+ clear_bit(Faulty, &rdev->flags);
+ err = add_bound_rdev(rdev);
+ }
+ } else
+ err = -EBUSY;
}
if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2566,13 +2628,14 @@ errors_show(struct md_rdev *rdev, char *page)
static ssize_t
errors_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (*buf && (*e == 0 || *e == '\n')) {
- atomic_set(&rdev->corrected_errors, n);
- return len;
- }
- return -EINVAL;
+ unsigned int n;
+ int rv;
+
+ rv = kstrtouint(buf, 10, &n);
+ if (rv < 0)
+ return rv;
+ atomic_set(&rdev->corrected_errors, n);
+ return len;
}
static struct rdev_sysfs_entry rdev_errors =
__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
@@ -2589,13 +2652,16 @@ slot_show(struct md_rdev *rdev, char *page)
static ssize_t
slot_store(struct md_rdev *rdev, const char *buf, size_t len)
{
- char *e;
+ int slot;
int err;
- int slot = simple_strtoul(buf, &e, 10);
+
if (strncmp(buf, "none", 4)==0)
slot = -1;
- else if (e==buf || (*e && *e!= '\n'))
- return -EINVAL;
+ else {
+ err = kstrtouint(buf, 10, (unsigned int *)&slot);
+ if (err < 0)
+ return err;
+ }
if (rdev->mddev->pers && slot == -1) {
/* Setting 'slot' on an active array requires also
* updating the 'rd%d' link, and communicating
@@ -3123,7 +3189,7 @@ static void analyze_sbs(struct mddev *mddev)
"md: fatal superblock inconsistency in %s"
" -- removing from array\n",
bdevname(rdev->bdev,b));
- kick_rdev_from_array(rdev);
+ md_kick_rdev_from_array(rdev);
}
super_types[mddev->major_version].
@@ -3138,18 +3204,27 @@ static void analyze_sbs(struct mddev *mddev)
"md: %s: %s: only %d devices permitted\n",
mdname(mddev), bdevname(rdev->bdev, b),
mddev->max_disks);
- kick_rdev_from_array(rdev);
+ md_kick_rdev_from_array(rdev);
continue;
}
- if (rdev != freshest)
+ if (rdev != freshest) {
if (super_types[mddev->major_version].
validate_super(mddev, rdev)) {
printk(KERN_WARNING "md: kicking non-fresh %s"
" from array!\n",
bdevname(rdev->bdev,b));
- kick_rdev_from_array(rdev);
+ md_kick_rdev_from_array(rdev);
continue;
}
+ /* No device should have a Candidate flag
+ * when reading devices
+ */
+ if (test_bit(Candidate, &rdev->flags)) {
+ pr_info("md: kicking Cluster Candidate %s from array!\n",
+ bdevname(rdev->bdev, b));
+ md_kick_rdev_from_array(rdev);
+ }
+ }
if (mddev->level == LEVEL_MULTIPATH) {
rdev->desc_nr = i++;
rdev->raid_disk = rdev->desc_nr;
@@ -3471,12 +3546,12 @@ layout_show(struct mddev *mddev, char *page)
static ssize_t
layout_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
+ unsigned int n;
int err;
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
return err;
@@ -3520,12 +3595,12 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks);
static ssize_t
raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
+ unsigned int n;
int err;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtouint(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
@@ -3572,12 +3647,12 @@ chunk_size_show(struct mddev *mddev, char *page)
static ssize_t
chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
{
+ unsigned long n;
int err;
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
- if (!*buf || (*e && *e != '\n'))
- return -EINVAL;
+ err = kstrtoul(buf, 10, &n);
+ if (err < 0)
+ return err;
err = mddev_lock(mddev);
if (err)
@@ -3615,19 +3690,24 @@ resync_start_show(struct mddev *mddev, char *page)
static ssize_t
resync_start_store(struct mddev *mddev, const char *buf, size_t len)
{
+ unsigned long long n;
int err;
- char *e;
- unsigned long long n = simple_strtoull(buf, &e, 10);
+
+ if (cmd_match(buf, "none"))
+ n = MaxSector;
+ else {
+ err = kstrtoull(buf, 10, &n);
+ if (err < 0)
+ return err;
+ if (n != (sector_t)n)
+ return -EINVAL;
+ }
err = mddev_lock(mddev);
if (err)
return err;
if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
err = -EBUSY;
- else if (cmd_match(buf, "none"))
- n = MaxSector;
- else if (!*buf || (*e && *e != '\n'))
- err = -EINVAL;
if (!err) {
mddev->recovery_cp = n;
@@ -3761,7 +3841,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
err = -EBUSY;
}
spin_unlock(&mddev->lock);
- return err;
+ return err ?: len;
}
err = mddev_lock(mddev);
if (err)
@@ -3863,14 +3943,14 @@ max_corrected_read_errors_show(struct mddev *mddev, char *page) {
static ssize_t
max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long n = simple_strtoul(buf, &e, 10);
+ unsigned int n;
+ int rv;
- if (*buf && (*e == 0 || *e == '\n')) {
- atomic_set(&mddev->max_corr_read_errors, n);
- return len;
- }
- return -EINVAL;
+ rv = kstrtouint(buf, 10, &n);
+ if (rv < 0)
+ return rv;
+ atomic_set(&mddev->max_corr_read_errors, n);
+ return len;
}
static struct md_sysfs_entry max_corr_read_errors =
@@ -3932,8 +4012,10 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
else
rdev = md_import_device(dev, -1, -1);
- if (IS_ERR(rdev))
+ if (IS_ERR(rdev)) {
+ mddev_unlock(mddev);
return PTR_ERR(rdev);
+ }
err = bind_rdev_to_array(rdev, mddev);
out:
if (err)
@@ -4004,8 +4086,12 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
return err;
if (mddev->pers) {
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
} else {
if (mddev->dev_sectors == 0 ||
mddev->dev_sectors > sectors)
@@ -4134,34 +4220,36 @@ action_store(struct mddev *mddev, const char *page, size_t len)
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
- if (cmd_match(page, "frozen"))
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- else
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
- flush_workqueue(md_misc_wq);
- if (mddev->sync_thread) {
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- if (mddev_lock(mddev) == 0) {
+ if (cmd_match(page, "frozen"))
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ else
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+ mddev_lock(mddev) == 0) {
+ flush_workqueue(md_misc_wq);
+ if (mddev->sync_thread) {
+ set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
- mddev_unlock(mddev);
}
+ mddev_unlock(mddev);
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync"))
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
else if (cmd_match(page, "recover")) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} else if (cmd_match(page, "reshape")) {
int err;
if (mddev->pers->start_reshape == NULL)
return -EINVAL;
err = mddev_lock(mddev);
if (!err) {
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
err = mddev->pers->start_reshape(mddev);
mddev_unlock(mddev);
}
@@ -4173,6 +4261,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
else if (!cmd_match(page, "repair"))
return -EINVAL;
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
}
@@ -4220,15 +4309,18 @@ sync_min_show(struct mddev *mddev, char *page)
static ssize_t
sync_min_store(struct mddev *mddev, const char *buf, size_t len)
{
- int min;
- char *e;
+ unsigned int min;
+ int rv;
+
if (strncmp(buf, "system", 6)==0) {
- mddev->sync_speed_min = 0;
- return len;
+ min = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &min);
+ if (rv < 0)
+ return rv;
+ if (min == 0)
+ return -EINVAL;
}
- min = simple_strtoul(buf, &e, 10);
- if (buf == e || (*e && *e != '\n') || min <= 0)
- return -EINVAL;
mddev->sync_speed_min = min;
return len;
}
@@ -4246,15 +4338,18 @@ sync_max_show(struct mddev *mddev, char *page)
static ssize_t
sync_max_store(struct mddev *mddev, const char *buf, size_t len)
{
- int max;
- char *e;
+ unsigned int max;
+ int rv;
+
if (strncmp(buf, "system", 6)==0) {
- mddev->sync_speed_max = 0;
- return len;
+ max = 0;
+ } else {
+ rv = kstrtouint(buf, 10, &max);
+ if (rv < 0)
+ return rv;
+ if (max == 0)
+ return -EINVAL;
}
- max = simple_strtoul(buf, &e, 10);
- if (buf == e || (*e && *e != '\n') || max <= 0)
- return -EINVAL;
mddev->sync_speed_max = max;
return len;
}
@@ -4350,7 +4445,6 @@ min_sync_store(struct mddev *mddev, const char *buf, size_t len)
{
unsigned long long min;
int err;
- int chunk;
if (kstrtoull(buf, 10, &min))
return -EINVAL;
@@ -4364,16 +4458,8 @@ min_sync_store(struct mddev *mddev, const char *buf, size_t len)
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
goto out_unlock;
- /* Must be a multiple of chunk_size */
- chunk = mddev->chunk_sectors;
- if (chunk) {
- sector_t temp = min;
-
- err = -EINVAL;
- if (sector_div(temp, chunk))
- goto out_unlock;
- }
- mddev->resync_min = min;
+ /* Round down to multiple of 4K for safety */
+ mddev->resync_min = round_down(min, 8);
err = 0;
out_unlock:
@@ -4446,12 +4532,13 @@ suspend_lo_show(struct mddev *mddev, char *page)
static ssize_t
suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old;
+ unsigned long long old, new;
int err;
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
@@ -4488,12 +4575,13 @@ suspend_hi_show(struct mddev *mddev, char *page)
static ssize_t
suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
{
- char *e;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- unsigned long long old;
+ unsigned long long old, new;
int err;
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
@@ -4535,11 +4623,13 @@ static ssize_t
reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
{
struct md_rdev *rdev;
- char *e;
+ unsigned long long new;
int err;
- unsigned long long new = simple_strtoull(buf, &e, 10);
- if (buf == e || (*e && *e != '\n'))
+ err = kstrtoull(buf, 10, &new);
+ if (err < 0)
+ return err;
+ if (new != (sector_t)new)
return -EINVAL;
err = mddev_lock(mddev);
if (err)
@@ -4750,12 +4840,12 @@ static void md_free(struct kobject *ko)
if (mddev->sysfs_state)
sysfs_put(mddev->sysfs_state);
+ if (mddev->queue)
+ blk_cleanup_queue(mddev->queue);
if (mddev->gendisk) {
del_gendisk(mddev->gendisk);
put_disk(mddev->gendisk);
}
- if (mddev->queue)
- blk_cleanup_queue(mddev->queue);
kfree(mddev);
}
@@ -5073,15 +5163,22 @@ int md_run(struct mddev *mddev)
}
if (err == 0 && pers->sync_request &&
(mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
- err = bitmap_create(mddev);
- if (err)
+ struct bitmap *bitmap;
+
+ bitmap = bitmap_create(mddev, -1);
+ if (IS_ERR(bitmap)) {
+ err = PTR_ERR(bitmap);
printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
mdname(mddev), err);
+ } else
+ mddev->bitmap = bitmap;
+
}
if (err) {
mddev_detach(mddev);
if (mddev->private)
pers->free(mddev, mddev->private);
+ mddev->private = NULL;
module_put(pers->owner);
bitmap_destroy(mddev);
return err;
@@ -5217,6 +5314,7 @@ static void md_clean(struct mddev *mddev)
mddev->changed = 0;
mddev->degraded = 0;
mddev->safemode = 0;
+ mddev->private = NULL;
mddev->merge_check_needed = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
@@ -5228,6 +5326,8 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
@@ -5246,6 +5346,8 @@ static void __md_stop_writes(struct mddev *mddev)
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
}
void md_stop_writes(struct mddev *mddev)
@@ -5285,6 +5387,7 @@ static void __md_stop(struct mddev *mddev)
mddev->pers = NULL;
spin_unlock(&mddev->lock);
pers->free(mddev, mddev->private);
+ mddev->private = NULL;
if (pers->sync_request && mddev->to_remove == NULL)
mddev->to_remove = &md_redundancy_group;
module_put(pers->owner);
@@ -5632,6 +5735,8 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
info.state = (1<<MD_SB_CLEAN);
if (mddev->bitmap && mddev->bitmap_info.offset)
info.state |= (1<<MD_SB_BITMAP_PRESENT);
+ if (mddev_is_clustered(mddev))
+ info.state |= (1<<MD_SB_CLUSTERED);
info.active_disks = insync;
info.working_disks = working;
info.failed_disks = failed;
@@ -5661,7 +5766,7 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
/* bitmap disabled, zero the first byte and copy out */
if (!mddev->bitmap_info.file)
file->pathname[0] = '\0';
- else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+ else if ((ptr = file_path(mddev->bitmap_info.file,
file->pathname, sizeof(file->pathname))),
IS_ERR(ptr))
err = PTR_ERR(ptr);
@@ -5687,7 +5792,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
return -EFAULT;
rcu_read_lock();
- rdev = find_rdev_nr_rcu(mddev, info.number);
+ rdev = md_find_rdev_nr_rcu(mddev, info.number);
if (rdev) {
info.major = MAJOR(rdev->bdev->bd_dev);
info.minor = MINOR(rdev->bdev->bd_dev);
@@ -5720,6 +5825,13 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
struct md_rdev *rdev;
dev_t dev = MKDEV(info->major,info->minor);
+ if (mddev_is_clustered(mddev) &&
+ !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
+ pr_err("%s: Cannot add to clustered mddev.\n",
+ mdname(mddev));
+ return -EINVAL;
+ }
+
if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
return -EOVERFLOW;
@@ -5806,31 +5918,38 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
else
clear_bit(WriteMostly, &rdev->flags);
+ /*
+ * check whether the device shows up in other nodes
+ */
+ if (mddev_is_clustered(mddev)) {
+ if (info->state & (1 << MD_DISK_CANDIDATE)) {
+ /* Through --cluster-confirm */
+ set_bit(Candidate, &rdev->flags);
+ err = md_cluster_ops->new_disk_ack(mddev, true);
+ if (err) {
+ export_rdev(rdev);
+ return err;
+ }
+ } else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
+ /* --add initiated by this node */
+ err = md_cluster_ops->add_new_disk_start(mddev, rdev);
+ if (err) {
+ md_cluster_ops->add_new_disk_finish(mddev);
+ export_rdev(rdev);
+ return err;
+ }
+ }
+ }
+
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
- if (!err && !mddev->pers->hot_remove_disk) {
- /* If there is hot_add_disk but no hot_remove_disk
- * then added disks for geometry changes,
- * and should be added immediately.
- */
- super_types[mddev->major_version].
- validate_super(mddev, rdev);
- err = mddev->pers->hot_add_disk(mddev, rdev);
- if (err)
- unbind_rdev_from_array(rdev);
- }
if (err)
export_rdev(rdev);
else
- sysfs_notify_dirent_safe(rdev->sysfs_state);
-
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
- if (mddev->degraded)
- set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- if (!err)
- md_new_event(mddev);
- md_wakeup_thread(mddev->thread);
+ err = add_bound_rdev(rdev);
+ if (mddev_is_clustered(mddev) &&
+ (info->state & (1 << MD_DISK_CLUSTER_ADD)))
+ md_cluster_ops->add_new_disk_finish(mddev);
return err;
}
@@ -5891,18 +6010,29 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
if (!rdev)
return -ENXIO;
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
+
clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(mddev, rdev);
if (rdev->raid_disk >= 0)
goto busy;
- kick_rdev_from_array(rdev);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->remove_disk(mddev, rdev);
+
+ md_kick_rdev_from_array(rdev);
md_update_sb(mddev, 1);
md_new_event(mddev);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+
return 0;
busy:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_cancel(mddev);
printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
bdevname(rdev->bdev,b), mdname(mddev));
return -EBUSY;
@@ -5952,12 +6082,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
err = -EINVAL;
goto abort_export;
}
+
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
clear_bit(In_sync, &rdev->flags);
rdev->desc_nr = -1;
rdev->saved_raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
- goto abort_export;
+ goto abort_clustered;
/*
* The rest should better be atomic, we can have disk failures
@@ -5968,6 +6101,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
md_update_sb(mddev, 1);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
/*
* Kick recovery, maybe this spare has to be added to the
* array immediately.
@@ -5977,6 +6112,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
md_new_event(mddev);
return 0;
+abort_clustered:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_cancel(mddev);
abort_export:
export_rdev(rdev);
return err;
@@ -6034,9 +6172,14 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
if (mddev->pers) {
mddev->pers->quiesce(mddev, 1);
if (fd >= 0) {
- err = bitmap_create(mddev);
- if (!err)
+ struct bitmap *bitmap;
+
+ bitmap = bitmap_create(mddev, -1);
+ if (!IS_ERR(bitmap)) {
+ mddev->bitmap = bitmap;
err = bitmap_load(mddev);
+ } else
+ err = PTR_ERR(bitmap);
}
if (fd < 0 || err) {
bitmap_destroy(mddev);
@@ -6254,7 +6397,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
mddev->ctime != info->ctime ||
mddev->level != info->level ||
/* mddev->layout != info->layout || */
- !mddev->persistent != info->not_persistent||
+ mddev->persistent != !info->not_persistent ||
mddev->chunk_sectors != info->chunk_size >> 9 ||
/* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
((state^info->state) & 0xfffffe00)
@@ -6289,6 +6432,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
return rv;
}
}
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2);
@@ -6296,33 +6441,49 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
rv = update_raid_disks(mddev, info->raid_disks);
if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
- if (mddev->pers->quiesce == NULL || mddev->thread == NULL)
- return -EINVAL;
- if (mddev->recovery || mddev->sync_thread)
- return -EBUSY;
+ if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
+ rv = -EINVAL;
+ goto err;
+ }
+ if (mddev->recovery || mddev->sync_thread) {
+ rv = -EBUSY;
+ goto err;
+ }
if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
+ struct bitmap *bitmap;
/* add the bitmap */
- if (mddev->bitmap)
- return -EEXIST;
- if (mddev->bitmap_info.default_offset == 0)
- return -EINVAL;
+ if (mddev->bitmap) {
+ rv = -EEXIST;
+ goto err;
+ }
+ if (mddev->bitmap_info.default_offset == 0) {
+ rv = -EINVAL;
+ goto err;
+ }
mddev->bitmap_info.offset =
mddev->bitmap_info.default_offset;
mddev->bitmap_info.space =
mddev->bitmap_info.default_space;
mddev->pers->quiesce(mddev, 1);
- rv = bitmap_create(mddev);
- if (!rv)
+ bitmap = bitmap_create(mddev, -1);
+ if (!IS_ERR(bitmap)) {
+ mddev->bitmap = bitmap;
rv = bitmap_load(mddev);
+ } else
+ rv = PTR_ERR(bitmap);
if (rv)
bitmap_destroy(mddev);
mddev->pers->quiesce(mddev, 0);
} else {
/* remove the bitmap */
- if (!mddev->bitmap)
- return -ENOENT;
- if (mddev->bitmap->storage.file)
- return -EINVAL;
+ if (!mddev->bitmap) {
+ rv = -ENOENT;
+ goto err;
+ }
+ if (mddev->bitmap->storage.file) {
+ rv = -EINVAL;
+ goto err;
+ }
mddev->pers->quiesce(mddev, 1);
bitmap_destroy(mddev);
mddev->pers->quiesce(mddev, 0);
@@ -6330,6 +6491,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
}
}
md_update_sb(mddev, 1);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+ return rv;
+err:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_cancel(mddev);
return rv;
}
@@ -6389,6 +6556,7 @@ static inline bool md_ioctl_valid(unsigned int cmd)
case SET_DISK_FAULTY:
case STOP_ARRAY:
case STOP_ARRAY_RO:
+ case CLUSTERED_DISK_NACK:
return true;
default:
return false;
@@ -6661,6 +6829,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
goto unlock;
}
+ case CLUSTERED_DISK_NACK:
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->new_disk_ack(mddev, false);
+ else
+ err = -EINVAL;
+ goto unlock;
+
case HOT_ADD_DISK:
err = hot_add_disk(mddev, new_decode_dev(arg));
goto unlock;
@@ -7234,6 +7409,55 @@ int unregister_md_personality(struct md_personality *p)
}
EXPORT_SYMBOL(unregister_md_personality);
+int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
+{
+ if (md_cluster_ops != NULL)
+ return -EALREADY;
+ spin_lock(&pers_lock);
+ md_cluster_ops = ops;
+ md_cluster_mod = module;
+ spin_unlock(&pers_lock);
+ return 0;
+}
+EXPORT_SYMBOL(register_md_cluster_operations);
+
+int unregister_md_cluster_operations(void)
+{
+ spin_lock(&pers_lock);
+ md_cluster_ops = NULL;
+ spin_unlock(&pers_lock);
+ return 0;
+}
+EXPORT_SYMBOL(unregister_md_cluster_operations);
+
+int md_setup_cluster(struct mddev *mddev, int nodes)
+{
+ int err;
+
+ err = request_module("md-cluster");
+ if (err) {
+ pr_err("md-cluster module not found.\n");
+ return err;
+ }
+
+ spin_lock(&pers_lock);
+ if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+ spin_unlock(&pers_lock);
+ return -ENOENT;
+ }
+ spin_unlock(&pers_lock);
+
+ return md_cluster_ops->join(mddev, nodes);
+}
+
+void md_cluster_stop(struct mddev *mddev)
+{
+ if (!md_cluster_ops)
+ return;
+ md_cluster_ops->leave(mddev);
+ module_put(md_cluster_mod);
+}
+
static int is_mddev_idle(struct mddev *mddev, int init)
{
struct md_rdev *rdev;
@@ -7371,7 +7595,11 @@ int md_allow_write(struct mddev *mddev)
mddev->safemode == 0)
mddev->safemode = 1;
spin_unlock(&mddev->lock);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
md_update_sb(mddev, 0);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} else
spin_unlock(&mddev->lock);
@@ -7572,6 +7800,9 @@ void md_do_sync(struct md_thread *thread)
md_new_event(mddev);
update_time = jiffies;
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_start(mddev, j, max_sectors);
+
blk_start_plug(&plug);
while (j < max_sectors) {
sector_t sectors;
@@ -7614,8 +7845,7 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
break;
- sectors = mddev->pers->sync_request(mddev, j, &skipped,
- currspeed < speed_min(mddev));
+ sectors = mddev->pers->sync_request(mddev, j, &skipped);
if (sectors == 0) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
break;
@@ -7632,6 +7862,8 @@ void md_do_sync(struct md_thread *thread)
j += sectors;
if (j > 2)
mddev->curr_resync = j;
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_info_update(mddev, j, max_sectors);
mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
/* this is the earliest that rebuild will be
@@ -7673,11 +7905,18 @@ void md_do_sync(struct md_thread *thread)
/((jiffies-mddev->resync_mark)/HZ +1) +1;
if (currspeed > speed_min(mddev)) {
- if ((currspeed > speed_max(mddev)) ||
- !is_mddev_idle(mddev, 0)) {
+ if (currspeed > speed_max(mddev)) {
msleep(500);
goto repeat;
}
+ if (!is_mddev_idle(mddev, 0)) {
+ /*
+ * Give other IO more of a chance.
+ * The faster the devices, the less we wait.
+ */
+ wait_event(mddev->recovery_wait,
+ !atomic_read(&mddev->recovery_active));
+ }
}
}
printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
@@ -7690,7 +7929,10 @@ void md_do_sync(struct md_thread *thread)
wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
/* tell personality that we are finished */
- mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
+ mddev->pers->sync_request(mddev, max_sectors, &skipped);
+
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->resync_finish(mddev);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
mddev->curr_resync > 2) {
@@ -7886,6 +8128,15 @@ void md_check_recovery(struct mddev *mddev)
int spares = 0;
if (mddev->ro) {
+ struct md_rdev *rdev;
+ if (!mddev->external && mddev->in_sync)
+ /* 'Blocked' flag not needed as failed devices
+ * will be recorded if array switched to read/write.
+ * Leaving it set will prevent the device
+ * from being removed.
+ */
+ rdev_for_each(rdev, mddev)
+ clear_bit(Blocked, &rdev->flags);
/* On a read-only array we can:
* - remove failed devices
* - add already-in_sync devices if the array itself
@@ -7921,8 +8172,13 @@ void md_check_recovery(struct mddev *mddev)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
- if (mddev->flags & MD_UPDATE_SB_FLAGS)
+ if (mddev->flags & MD_UPDATE_SB_FLAGS) {
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
md_update_sb(mddev, 0);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
+ }
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -8020,6 +8276,8 @@ void md_reap_sync_thread(struct mddev *mddev)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_start(mddev);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
mddev->pers->finish_reshape)
mddev->pers->finish_reshape(mddev);
@@ -8032,7 +8290,10 @@ void md_reap_sync_thread(struct mddev *mddev)
rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1);
+ if (mddev_is_clustered(mddev))
+ md_cluster_ops->metadata_update_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
@@ -8652,6 +8913,28 @@ err_wq:
return ret;
}
+void md_reload_sb(struct mddev *mddev)
+{
+ struct md_rdev *rdev, *tmp;
+
+ rdev_for_each_safe(rdev, tmp, mddev) {
+ rdev->sb_loaded = 0;
+ ClearPageUptodate(rdev->sb_page);
+ }
+ mddev->raid_disks = 0;
+ analyze_sbs(mddev);
+ rdev_for_each_safe(rdev, tmp, mddev) {
+ struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+ /* since we don't write to faulty devices, we figure out if the
+ * disk is faulty by comparing events
+ */
+ if (mddev->events > sb->events)
+ set_bit(Faulty, &rdev->flags);
+ }
+
+}
+EXPORT_SYMBOL(md_reload_sb);
+
#ifndef MODULE
/*
@@ -8761,13 +9044,7 @@ static int get_ro(char *buffer, struct kernel_param *kp)
}
static int set_ro(const char *val, struct kernel_param *kp)
{
- char *e;
- int num = simple_strtoul(val, &e, 10);
- if (*val && (*e == '\0' || *e == '\n')) {
- start_readonly = num;
- return 0;
- }
- return -EINVAL;
+ return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}
module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);