From 183bdf5106af069a774146c4f910a80ad1f57485 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Fri, 7 Mar 2014 10:30:43 -0600
Subject: Add number of nodes to bitmap structure for clustering

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/bitmap.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 30210b9c4ef9..f4e53eadb083 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -131,7 +131,8 @@ typedef struct bitmap_super_s {
 	__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
 				  * reserved for the bitmap. */
 
-	__u8  pad[256 - 68]; /* set to zero */
+	__le32 nodes;        /* 68 the maximum number of nodes in cluster. */
+	__u8  pad[256 - 72]; /* set to zero */
 } bitmap_super_t;
 
 /* notes:
-- 
cgit v1.2.1


From 8e854e9cfd1cc3837b4bd96643d5174a72d9f741 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Fri, 7 Mar 2014 11:21:15 -0600
Subject: Create a separate module for clustering support

Tagged as EXPERIMENTAL for now.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/Kconfig      | 16 ++++++++++++++++
 drivers/md/Makefile     |  1 +
 drivers/md/md-cluster.c | 28 ++++++++++++++++++++++++++++
 3 files changed, 45 insertions(+)
 create mode 100644 drivers/md/md-cluster.c

(limited to 'drivers/md')

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 63e05e32b462..eed1fec2d97b 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -175,6 +175,22 @@ config MD_FAULTY
 
 	  In unsure, say N.
 
+
+config MD_CLUSTER
+	tristate "Cluster Support for MD (EXPERIMENTAL)"
+	depends on BLK_DEV_MD
+	depends on DLM
+	default n
+	---help---
+	Clustering support for MD devices. This enables locking and
+	synchronization across multiple systems on the cluster, so all
+	nodes in the cluster can access the MD devices simultaneously.
+
+	This brings the redundancy (and uptime) of RAID levels across the
+	nodes of the cluster.
+
+	If unsure, say N.
+
 source "drivers/md/bcache/Kconfig"
 
 config BLK_DEV_DM_BUILTIN
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a2da532b1c2b..7ed86876f3b7 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -30,6 +30,7 @@ obj-$(CONFIG_MD_RAID10)		+= raid10.o
 obj-$(CONFIG_MD_RAID456)	+= raid456.o
 obj-$(CONFIG_MD_MULTIPATH)	+= multipath.o
 obj-$(CONFIG_MD_FAULTY)		+= faulty.o
+obj-$(CONFIG_MD_CLUSTER)	+= md-cluster.o
 obj-$(CONFIG_BCACHE)		+= bcache/
 obj-$(CONFIG_BLK_DEV_MD)	+= md-mod.o
 obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
new file mode 100644
index 000000000000..f377e71949c5
--- /dev/null
+++ b/drivers/md/md-cluster.c
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2015, SUSE
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ */
+
+
+#include <linux/module.h>
+
+static int __init cluster_init(void)
+{
+	pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
+	pr_info("Registering Cluster MD functions\n");
+	return 0;
+}
+
+static void cluster_exit(void)
+{
+}
+
+module_init(cluster_init);
+module_exit(cluster_exit);
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Clustering support for MD");
-- 
cgit v1.2.1


From 47741b7ca7b389d1b45d7cf15edc279c9be32fa8 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Fri, 7 Mar 2014 13:49:26 -0600
Subject: DLM lock and unlock functions

A dlm_lock_resource is a structure which contains all information
required for locking using DLM. The init function allocates the
lock and acquires the lock in NL mode. The unlock function
converts the lock resource to NL mode. This is done to preserve
LVB and for faster processing of locks. The lock resource is
DLM unlocked only in the lockres_free function, which is the end
of life of the lock resource.

Signed-off-by: Lidong Zhong <lzhong@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index f377e71949c5..bc8ea9d76875 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -10,6 +10,108 @@
 
 
 #include <linux/module.h>
+#include <linux/dlm.h>
+#include <linux/sched.h>
+#include "md.h"
+
+#define LVB_SIZE	64
+
+struct dlm_lock_resource {
+	dlm_lockspace_t *ls;
+	struct dlm_lksb lksb;
+	char *name; /* lock name. */
+	uint32_t flags; /* flags to pass to dlm_lock() */
+	void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
+	struct completion completion; /* completion for synchronized locking */
+};
+
+static void sync_ast(void *arg)
+{
+	struct dlm_lock_resource *res;
+
+	res = (struct dlm_lock_resource *) arg;
+	complete(&res->completion);
+}
+
+static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
+{
+	int ret = 0;
+
+	init_completion(&res->completion);
+	ret = dlm_lock(res->ls, mode, &res->lksb,
+			res->flags, res->name, strlen(res->name),
+			0, sync_ast, res, res->bast);
+	if (ret)
+		return ret;
+	wait_for_completion(&res->completion);
+	return res->lksb.sb_status;
+}
+
+static int dlm_unlock_sync(struct dlm_lock_resource *res)
+{
+	return dlm_lock_sync(res, DLM_LOCK_NL);
+}
+
+static struct dlm_lock_resource *lockres_init(dlm_lockspace_t *lockspace,
+		char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
+{
+	struct dlm_lock_resource *res = NULL;
+	int ret, namelen;
+
+	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
+	if (!res)
+		return NULL;
+	res->ls = lockspace;
+	namelen = strlen(name);
+	res->name = kzalloc(namelen + 1, GFP_KERNEL);
+	if (!res->name) {
+		pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name);
+		goto out_err;
+	}
+	strlcpy(res->name, name, namelen + 1);
+	if (with_lvb) {
+		res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL);
+		if (!res->lksb.sb_lvbptr) {
+			pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name);
+			goto out_err;
+		}
+		res->flags = DLM_LKF_VALBLK;
+	}
+
+	if (bastfn)
+		res->bast = bastfn;
+
+	res->flags |= DLM_LKF_EXPEDITE;
+
+	ret = dlm_lock_sync(res, DLM_LOCK_NL);
+	if (ret) {
+		pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name);
+		goto out_err;
+	}
+	res->flags &= ~DLM_LKF_EXPEDITE;
+	res->flags |= DLM_LKF_CONVERT;
+
+	return res;
+out_err:
+	kfree(res->lksb.sb_lvbptr);
+	kfree(res->name);
+	kfree(res);
+	return NULL;
+}
+
+static void lockres_free(struct dlm_lock_resource *res)
+{
+	if (!res)
+		return;
+
+	init_completion(&res->completion);
+	dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+	wait_for_completion(&res->completion);
+
+	kfree(res->name);
+	kfree(res->lksb.sb_lvbptr);
+	kfree(res);
+}
 
 static int __init cluster_init(void)
 {
-- 
cgit v1.2.1


From edb39c9deda87da5aad9c090e2e8eaf8470c852c Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 29 Mar 2014 10:01:53 -0500
Subject: Introduce md_cluster_operations to handle cluster functions

This allows dynamic registering of cluster hooks.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 18 +++++++++++++++++
 drivers/md/md-cluster.h | 15 ++++++++++++++
 drivers/md/md.c         | 52 +++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/md.h         |  7 +++++++
 4 files changed, 92 insertions(+)
 create mode 100644 drivers/md/md-cluster.h

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index bc8ea9d76875..e2235600a72b 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -13,6 +13,7 @@
 #include <linux/dlm.h>
 #include <linux/sched.h>
 #include "md.h"
+#include "md-cluster.h"
 
 #define LVB_SIZE	64
 
@@ -113,15 +114,32 @@ static void lockres_free(struct dlm_lock_resource *res)
 	kfree(res);
 }
 
+static int join(struct mddev *mddev, int nodes)
+{
+	return 0;
+}
+
+static int leave(struct mddev *mddev)
+{
+	return 0;
+}
+
+static struct md_cluster_operations cluster_ops = {
+	.join   = join,
+	.leave  = leave,
+};
+
 static int __init cluster_init(void)
 {
 	pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
 	pr_info("Registering Cluster MD functions\n");
+	register_md_cluster_operations(&cluster_ops, THIS_MODULE);
 	return 0;
 }
 
 static void cluster_exit(void)
 {
+	unregister_md_cluster_operations();
 }
 
 module_init(cluster_init);
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
new file mode 100644
index 000000000000..aa9f07bd6b96
--- /dev/null
+++ b/drivers/md/md-cluster.h
@@ -0,0 +1,15 @@
+
+
+#ifndef _MD_CLUSTER_H
+#define _MD_CLUSTER_H
+
+#include "md.h"
+
+struct mddev;
+
+struct md_cluster_operations {
+	int (*join)(struct mddev *mddev);
+	int (*leave)(struct mddev *mddev);
+};
+
+#endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index c8d2bac4e28b..57ecb51ec5fd 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -53,6 +53,7 @@
 #include <linux/slab.h>
 #include "md.h"
 #include "bitmap.h"
+#include "md-cluster.h"
 
 #ifndef MODULE
 static void autostart_arrays(int part);
@@ -66,6 +67,10 @@ static void autostart_arrays(int part);
 static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
+struct md_cluster_operations *md_cluster_ops;
+struct module *md_cluster_mod;
+EXPORT_SYMBOL(md_cluster_mod);
+
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 static struct workqueue_struct *md_wq;
 static struct workqueue_struct *md_misc_wq;
@@ -7231,6 +7236,53 @@ int unregister_md_personality(struct md_personality *p)
 }
 EXPORT_SYMBOL(unregister_md_personality);
 
+int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
+{
+	if (md_cluster_ops != NULL)
+		return -EALREADY;
+	spin_lock(&pers_lock);
+	md_cluster_ops = ops;
+	md_cluster_mod = module;
+	spin_unlock(&pers_lock);
+	return 0;
+}
+EXPORT_SYMBOL(register_md_cluster_operations);
+
+int unregister_md_cluster_operations(void)
+{
+	spin_lock(&pers_lock);
+	md_cluster_ops = NULL;
+	spin_unlock(&pers_lock);
+	return 0;
+}
+EXPORT_SYMBOL(unregister_md_cluster_operations);
+
+int md_setup_cluster(struct mddev *mddev, int nodes)
+{
+	int err;
+
+	err = request_module("md-cluster");
+	if (err) {
+		pr_err("md-cluster module not found.\n");
+		return err;
+	}
+
+	spin_lock(&pers_lock);
+	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+		spin_unlock(&pers_lock);
+		return -ENOENT;
+	}
+	spin_unlock(&pers_lock);
+
+	return md_cluster_ops->join(mddev);
+}
+
+void md_cluster_stop(struct mddev *mddev)
+{
+	md_cluster_ops->leave(mddev);
+	module_put(md_cluster_mod);
+}
+
 static int is_mddev_idle(struct mddev *mddev, int init)
 {
 	struct md_rdev *rdev;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 318ca8fd430f..018593197c4d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -23,6 +23,7 @@
 #include <linux/timer.h>
 #include <linux/wait.h>
 #include <linux/workqueue.h>
+#include "md-cluster.h"
 
 #define MaxSector (~(sector_t)0)
 
@@ -608,6 +609,11 @@ static inline void safe_put_page(struct page *p)
 
 extern int register_md_personality(struct md_personality *p);
 extern int unregister_md_personality(struct md_personality *p);
+extern int register_md_cluster_operations(struct md_cluster_operations *ops,
+		struct module *module);
+extern int unregister_md_cluster_operations(void);
+extern int md_setup_cluster(struct mddev *mddev, int nodes);
+extern void md_cluster_stop(struct mddev *mddev);
 extern struct md_thread *md_register_thread(
 	void (*run)(struct md_thread *thread),
 	struct mddev *mddev,
@@ -669,4 +675,5 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
 	}
 }
 
+extern struct md_cluster_operations *md_cluster_ops;
 #endif /* _MD_MD_H */
-- 
cgit v1.2.1


From c4ce867fdad200dfd8aa8cbe1eabc26c14c51635 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 29 Mar 2014 10:20:02 -0500
Subject: Introduce md_cluster_info

md_cluster_info stores the cluster information in the MD device.

The join() is called when mddev detects it is a clustered device.
The main responsibilities are:
	1. Setup a DLM lockspace
	2. Setup all initial locks such as super block locks and bitmap lock (will come later)

The leave() clears up the lockspace and all the locks held.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/bitmap.c     |  9 ++++++-
 drivers/md/md-cluster.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++---
 drivers/md/md.c         |  2 ++
 drivers/md/md.h         |  8 ++++++
 4 files changed, 80 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 3a5767968ba0..e2aacca46911 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -433,6 +433,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	/* This might have been changed by a reshape */
 	sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
 	sb->chunksize = cpu_to_le32(bitmap->mddev->bitmap_info.chunksize);
+	sb->nodes = cpu_to_le32(bitmap->mddev->bitmap_info.nodes);
 	sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
 					   bitmap_info.space);
 	kunmap_atomic(sb);
@@ -544,6 +545,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	bitmap_super_t *sb;
 	unsigned long chunksize, daemon_sleep, write_behind;
 	unsigned long long events;
+	int nodes = 0;
 	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
 	struct page *sb_page;
@@ -583,6 +585,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
 	write_behind = le32_to_cpu(sb->write_behind);
 	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
+	nodes = le32_to_cpu(sb->nodes);
 
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -643,6 +646,7 @@ out_no_sb:
 	bitmap->mddev->bitmap_info.chunksize = chunksize;
 	bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
 	bitmap->mddev->bitmap_info.max_write_behind = write_behind;
+	bitmap->mddev->bitmap_info.nodes = nodes;
 	if (bitmap->mddev->bitmap_info.space == 0 ||
 	    bitmap->mddev->bitmap_info.space > sectors_reserved)
 		bitmap->mddev->bitmap_info.space = sectors_reserved;
@@ -2186,6 +2190,8 @@ __ATTR(chunksize, S_IRUGO|S_IWUSR, chunksize_show, chunksize_store);
 
 static ssize_t metadata_show(struct mddev *mddev, char *page)
 {
+	if (mddev_is_clustered(mddev))
+		return sprintf(page, "clustered\n");
 	return sprintf(page, "%s\n", (mddev->bitmap_info.external
 				      ? "external" : "internal"));
 }
@@ -2198,7 +2204,8 @@ static ssize_t metadata_store(struct mddev *mddev, const char *buf, size_t len)
 		return -EBUSY;
 	if (strncmp(buf, "external", 8) == 0)
 		mddev->bitmap_info.external = 1;
-	else if (strncmp(buf, "internal", 8) == 0)
+	else if ((strncmp(buf, "internal", 8) == 0) ||
+			(strncmp(buf, "clustered", 9) == 0))
 		mddev->bitmap_info.external = 0;
 	else
 		return -EINVAL;
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index e2235600a72b..d141d4812c8c 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -22,8 +22,16 @@ struct dlm_lock_resource {
 	struct dlm_lksb lksb;
 	char *name; /* lock name. */
 	uint32_t flags; /* flags to pass to dlm_lock() */
-	void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
 	struct completion completion; /* completion for synchronized locking */
+	void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
+	struct mddev *mddev; /* pointing back to mddev. */
+};
+
+struct md_cluster_info {
+	/* dlm lock space and resources for clustered raid. */
+	dlm_lockspace_t *lockspace;
+	struct dlm_lock_resource *sb_lock;
+	struct mutex sb_mutex;
 };
 
 static void sync_ast(void *arg)
@@ -53,16 +61,18 @@ static int dlm_unlock_sync(struct dlm_lock_resource *res)
 	return dlm_lock_sync(res, DLM_LOCK_NL);
 }
 
-static struct dlm_lock_resource *lockres_init(dlm_lockspace_t *lockspace,
+static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
 		char *name, void (*bastfn)(void *arg, int mode), int with_lvb)
 {
 	struct dlm_lock_resource *res = NULL;
 	int ret, namelen;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
 
 	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
 	if (!res)
 		return NULL;
-	res->ls = lockspace;
+	res->ls = cinfo->lockspace;
+	res->mddev = mddev;
 	namelen = strlen(name);
 	res->name = kzalloc(namelen + 1, GFP_KERNEL);
 	if (!res->name) {
@@ -114,13 +124,62 @@ static void lockres_free(struct dlm_lock_resource *res)
 	kfree(res);
 }
 
+static char *pretty_uuid(char *dest, char *src)
+{
+	int i, len = 0;
+
+	for (i = 0; i < 16; i++) {
+		if (i == 4 || i == 6 || i == 8 || i == 10)
+			len += sprintf(dest + len, "-");
+		len += sprintf(dest + len, "%02x", (__u8)src[i]);
+	}
+	return dest;
+}
+
 static int join(struct mddev *mddev, int nodes)
 {
+	struct md_cluster_info *cinfo;
+	int ret;
+	char str[64];
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENOENT;
+
+	cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
+	if (!cinfo)
+		return -ENOMEM;
+
+	memset(str, 0, 64);
+	pretty_uuid(str, mddev->uuid);
+	ret = dlm_new_lockspace(str, NULL, DLM_LSFL_FS, LVB_SIZE,
+				NULL, NULL, NULL, &cinfo->lockspace);
+	if (ret)
+		goto err;
+	cinfo->sb_lock = lockres_init(mddev, "cmd-super",
+					NULL, 0);
+	if (!cinfo->sb_lock) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	mutex_init(&cinfo->sb_mutex);
+	mddev->cluster_info = cinfo;
 	return 0;
+err:
+	if (cinfo->lockspace)
+		dlm_release_lockspace(cinfo->lockspace, 2);
+	kfree(cinfo);
+	module_put(THIS_MODULE);
+	return ret;
 }
 
 static int leave(struct mddev *mddev)
 {
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	if (!cinfo)
+		return 0;
+	lockres_free(cinfo->sb_lock);
+	dlm_release_lockspace(cinfo->lockspace, 2);
 	return 0;
 }
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 57ecb51ec5fd..3387f940140b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7279,6 +7279,8 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
 
 void md_cluster_stop(struct mddev *mddev)
 {
+	if (!md_cluster_ops)
+		return;
 	md_cluster_ops->leave(mddev);
 	module_put(md_cluster_mod);
 }
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 018593197c4d..80fc89976915 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -203,6 +203,8 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 				int is_new);
 extern void md_ack_all_badblocks(struct badblocks *bb);
 
+struct md_cluster_info;
+
 struct mddev {
 	void				*private;
 	struct md_personality		*pers;
@@ -431,6 +433,7 @@ struct mddev {
 		unsigned long		daemon_sleep; /* how many jiffies between updates? */
 		unsigned long		max_write_behind; /* write-behind mode */
 		int			external;
+		int			nodes; /* Maximum number of nodes in the cluster */
 	} bitmap_info;
 
 	atomic_t			max_corr_read_errors; /* max read retries */
@@ -449,6 +452,7 @@ struct mddev {
 	struct work_struct flush_work;
 	struct work_struct event_work;	/* used by dm to report failure event */
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
+	struct md_cluster_info		*cluster_info;
 };
 
 static inline int __must_check mddev_lock(struct mddev *mddev)
@@ -676,4 +680,8 @@ static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
 }
 
 extern struct md_cluster_operations *md_cluster_ops;
+static inline int mddev_is_clustered(struct mddev *mddev)
+{
+	return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
+}
 #endif /* _MD_MD_H */
-- 
cgit v1.2.1


From ca8895d9bb41e743271c42a4438a296de891b73b Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Wed, 26 Nov 2014 12:22:03 -0600
Subject: Return MD_SB_CLUSTERED if mddev is clustered

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3387f940140b..5ed57688e5c5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5634,6 +5634,9 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
 		info.state = (1<<MD_SB_CLEAN);
 	if (mddev->bitmap && mddev->bitmap_info.offset)
 		info.state |= (1<<MD_SB_BITMAP_PRESENT);
+	if (mddev_is_clustered(mddev))
+		info.state |= (1<<MD_SB_CLUSTERED);
+
 	info.active_disks  = insync;
 	info.working_disks = working;
 	info.failed_disks  = failed;
-- 
cgit v1.2.1


From cf921cc19cf7c1e99f730a2faa02d80817d684a2 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sun, 30 Mar 2014 00:42:49 -0500
Subject: Add node recovery callbacks

DLM offers callbacks when a node fails and the lock remastery
is performed:

1. recover_prep: called when DLM discovers a node is down
2. recover_slot: called when DLM identifies the node and recovery
		can start
3. recover_done: called when all nodes have completed recover_slot

recover_slot() and recover_done() are also called when the node initially
joins, in order to inform the node of its slot number. These slot numbers
start from one, so we subtract one to make them start from zero, which is
what the cluster-md code uses.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/bitmap.c     |  1 +
 drivers/md/bitmap.h     |  4 ++--
 drivers/md/md-cluster.c | 62 +++++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/md-cluster.h |  3 ++-
 drivers/md/md.c         |  2 +-
 drivers/md/md.h         |  1 +
 6 files changed, 64 insertions(+), 9 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e2aacca46911..b43a75a246e7 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -637,6 +637,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
 		set_bit(BITMAP_HOSTENDIAN, &bitmap->flags);
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
+	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
 	err = 0;
 out:
 	kunmap_atomic(sb);
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index f4e53eadb083..ec9032f105b8 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -130,9 +130,9 @@ typedef struct bitmap_super_s {
 	__le32 write_behind; /* 60  number of outstanding write-behind writes */
 	__le32 sectors_reserved; /* 64 number of 512-byte sectors that are
 				  * reserved for the bitmap. */
-
 	__le32 nodes;        /* 68 the maximum number of nodes in cluster. */
-	__u8  pad[256 - 72]; /* set to zero */
+	__u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
+	__u8  pad[256 - 136]; /* set to zero */
 } bitmap_super_t;
 
 /* notes:
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index d141d4812c8c..1f3c8f39ecb2 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -30,6 +30,8 @@ struct dlm_lock_resource {
 struct md_cluster_info {
 	/* dlm lock space and resources for clustered raid. */
 	dlm_lockspace_t *lockspace;
+	int slot_number;
+	struct completion completion;
 	struct dlm_lock_resource *sb_lock;
 	struct mutex sb_mutex;
 };
@@ -136,10 +138,42 @@ static char *pretty_uuid(char *dest, char *src)
 	return dest;
 }
 
+static void recover_prep(void *arg)
+{
+}
+
+static void recover_slot(void *arg, struct dlm_slot *slot)
+{
+	struct mddev *mddev = arg;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
+			mddev->bitmap_info.cluster_name,
+			slot->nodeid, slot->slot,
+			cinfo->slot_number);
+}
+
+static void recover_done(void *arg, struct dlm_slot *slots,
+		int num_slots, int our_slot,
+		uint32_t generation)
+{
+	struct mddev *mddev = arg;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	cinfo->slot_number = our_slot;
+	complete(&cinfo->completion);
+}
+
+static const struct dlm_lockspace_ops md_ls_ops = {
+	.recover_prep = recover_prep,
+	.recover_slot = recover_slot,
+	.recover_done = recover_done,
+};
+
 static int join(struct mddev *mddev, int nodes)
 {
 	struct md_cluster_info *cinfo;
-	int ret;
+	int ret, ops_rv;
 	char str[64];
 
 	if (!try_module_get(THIS_MODULE))
@@ -149,24 +183,30 @@ static int join(struct mddev *mddev, int nodes)
 	if (!cinfo)
 		return -ENOMEM;
 
+	init_completion(&cinfo->completion);
+
+	mutex_init(&cinfo->sb_mutex);
+	mddev->cluster_info = cinfo;
+
 	memset(str, 0, 64);
 	pretty_uuid(str, mddev->uuid);
-	ret = dlm_new_lockspace(str, NULL, DLM_LSFL_FS, LVB_SIZE,
-				NULL, NULL, NULL, &cinfo->lockspace);
+	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
+				DLM_LSFL_FS, LVB_SIZE,
+				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);
 	if (ret)
 		goto err;
+	wait_for_completion(&cinfo->completion);
 	cinfo->sb_lock = lockres_init(mddev, "cmd-super",
 					NULL, 0);
 	if (!cinfo->sb_lock) {
 		ret = -ENOMEM;
 		goto err;
 	}
-	mutex_init(&cinfo->sb_mutex);
-	mddev->cluster_info = cinfo;
 	return 0;
 err:
 	if (cinfo->lockspace)
 		dlm_release_lockspace(cinfo->lockspace, 2);
+	mddev->cluster_info = NULL;
 	kfree(cinfo);
 	module_put(THIS_MODULE);
 	return ret;
@@ -183,9 +223,21 @@ static int leave(struct mddev *mddev)
 	return 0;
 }
 
+/* slot_number(): Returns the MD slot number to use
+ * DLM starts the slot numbers from 1, whereas cluster-md
+ * wants the numbers to start from zero, so we subtract one
+ */
+static int slot_number(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	return cinfo->slot_number - 1;
+}
+
 static struct md_cluster_operations cluster_ops = {
 	.join   = join,
 	.leave  = leave,
+	.slot_number = slot_number,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index aa9f07bd6b96..52a21e0d6dbc 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -8,8 +8,9 @@
 struct mddev;
 
 struct md_cluster_operations {
-	int (*join)(struct mddev *mddev);
+	int (*join)(struct mddev *mddev, int nodes);
 	int (*leave)(struct mddev *mddev);
+	int (*slot_number)(struct mddev *mddev);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5ed57688e5c5..8f310d98f082 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7277,7 +7277,7 @@ int md_setup_cluster(struct mddev *mddev, int nodes)
 	}
 	spin_unlock(&pers_lock);
 
-	return md_cluster_ops->join(mddev);
+	return md_cluster_ops->join(mddev, nodes);
 }
 
 void md_cluster_stop(struct mddev *mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 80fc89976915..81e568090d8f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -434,6 +434,7 @@ struct mddev {
 		unsigned long		max_write_behind; /* write-behind mode */
 		int			external;
 		int			nodes; /* Maximum number of nodes in the cluster */
+		char                    cluster_name[64]; /* Name of the cluster */
 	} bitmap_info;
 
 	atomic_t			max_corr_read_errors; /* max read retries */
-- 
cgit v1.2.1


From b97e92574c0bf335db1cd2ec491d8ff5cd5d0b49 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Fri, 6 Jun 2014 11:50:56 -0500
Subject: Use separate bitmaps for each nodes in the cluster

On-disk format:

0                    4k                     8k                    12k
-------------------------------------------------------------------
| idle                | md super            | bm super [0] + bits |
| bm bits[0, contd]   | bm super[1] + bits  | bm bits[1, contd]   |
| bm super[2] + bits  | bm bits [2, contd]  | bm super[3] + bits  |
| bm bits [3, contd]  |                     |                     |

The bitmap superblock has a field, nodes, which defines the maximum number
of nodes the device can use. While reading the bitmap superblock, if
the code finds that the number of nodes is > 0, it:
1. Requests the md-cluster module.
2. Calls md_cluster_ops->join(), which sets up clustering such as
   joining DLM lockspace.

The first time through, the first bitmap superblock is read. After the
call to md_setup_cluster(), the bitmap offset is adjusted for this node's
slot and the superblock is re-read. This also ensures the bitmap is read
under the bitmap lock (once the bitmap lock is introduced in later patches).

Questions:
1. cluster name is repeated in all bitmap supers. Is that okay?

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/bitmap.c     | 67 +++++++++++++++++++++++++++++++++++++++++--------
 drivers/md/bitmap.h     |  1 +
 drivers/md/md-cluster.c |  6 +++++
 3 files changed, 64 insertions(+), 10 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b43a75a246e7..b1d94eee3346 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -205,6 +205,10 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
+	int node_offset = 0;
+
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * store->file_pages;
 
 	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
 		int size = PAGE_SIZE;
@@ -549,6 +553,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
 	struct page *sb_page;
+	int cluster_setup_done = 0;
 
 	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
 		chunksize = 128 * 1024 * 1024;
@@ -564,6 +569,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 		return -ENOMEM;
 	bitmap->storage.sb_page = sb_page;
 
+re_read:
 	if (bitmap->storage.file) {
 		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
@@ -579,6 +585,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	if (err)
 		return err;
 
+	err = -EINVAL;
 	sb = kmap_atomic(sb_page);
 
 	chunksize = le32_to_cpu(sb->chunksize);
@@ -586,6 +593,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	write_behind = le32_to_cpu(sb->write_behind);
 	sectors_reserved = le32_to_cpu(sb->sectors_reserved);
 	nodes = le32_to_cpu(sb->nodes);
+	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
 
 	/* verify that the bitmap-specific fields are valid */
 	if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
@@ -622,7 +630,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 			goto out;
 		}
 		events = le64_to_cpu(sb->events);
-		if (events < bitmap->mddev->events) {
+		if (!nodes && (events < bitmap->mddev->events)) {
 			printk(KERN_INFO
 			       "%s: bitmap file is out of date (%llu < %llu) "
 			       "-- forcing full recovery\n",
@@ -639,8 +647,34 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
 	strlcpy(bitmap->mddev->bitmap_info.cluster_name, sb->cluster_name, 64);
 	err = 0;
+
 out:
 	kunmap_atomic(sb);
+	if (nodes && !cluster_setup_done) {
+		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
+
+		/* sector_div() divides in place and returns the remainder */
+		sector_div(bm_blocks, (chunksize >> 9));
+		bm_blocks = bm_blocks << 3;
+		/* bitmap supers sit at 4k boundaries, hence the hardcoded 4096 */
+		bm_blocks = DIV_ROUND_UP(bm_blocks, 4096);
+		err = md_setup_cluster(bitmap->mddev, nodes);
+		if (err) {
+			pr_err("%s: Could not setup cluster service (%d)\n",
+					bmname(bitmap), err);
+			goto out_no_sb;
+		}
+		bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
+		bitmap->mddev->bitmap_info.offset +=
+			bitmap->cluster_slot * (bm_blocks << 3);
+		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+			bitmap->cluster_slot,
+			(unsigned long long)bitmap->mddev->bitmap_info.offset);
+		cluster_setup_done = 1;
+		goto re_read;
+	}
+
+
 out_no_sb:
 	if (test_bit(BITMAP_STALE, &bitmap->flags))
 		bitmap->events_cleared = bitmap->mddev->events;
@@ -651,8 +685,11 @@ out_no_sb:
 	if (bitmap->mddev->bitmap_info.space == 0 ||
 	    bitmap->mddev->bitmap_info.space > sectors_reserved)
 		bitmap->mddev->bitmap_info.space = sectors_reserved;
-	if (err)
+	if (err) {
 		bitmap_print_sb(bitmap);
+		if (cluster_setup_done)
+			md_cluster_stop(bitmap->mddev);
+	}
 	return err;
 }
 
@@ -697,9 +734,10 @@ static inline struct page *filemap_get_page(struct bitmap_storage *store,
 }
 
 static int bitmap_storage_alloc(struct bitmap_storage *store,
-				unsigned long chunks, int with_super)
+				unsigned long chunks, int with_super,
+				int slot_number)
 {
-	int pnum;
+	int pnum, offset = 0;
 	unsigned long num_pages;
 	unsigned long bytes;
 
@@ -708,6 +746,7 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		bytes += sizeof(bitmap_super_t);
 
 	num_pages = DIV_ROUND_UP(bytes, PAGE_SIZE);
+	offset = slot_number * (num_pages - 1);
 
 	store->filemap = kmalloc(sizeof(struct page *)
 				 * num_pages, GFP_KERNEL);
@@ -718,20 +757,22 @@ static int bitmap_storage_alloc(struct bitmap_storage *store,
 		store->sb_page = alloc_page(GFP_KERNEL|__GFP_ZERO);
 		if (store->sb_page == NULL)
 			return -ENOMEM;
-		store->sb_page->index = 0;
 	}
+
 	pnum = 0;
 	if (store->sb_page) {
 		store->filemap[0] = store->sb_page;
 		pnum = 1;
+		store->sb_page->index = offset;
 	}
+
 	for ( ; pnum < num_pages; pnum++) {
 		store->filemap[pnum] = alloc_page(GFP_KERNEL|__GFP_ZERO);
 		if (!store->filemap[pnum]) {
 			store->file_pages = pnum;
 			return -ENOMEM;
 		}
-		store->filemap[pnum]->index = pnum;
+		store->filemap[pnum]->index = pnum + offset;
 	}
 	store->file_pages = pnum;
 
@@ -940,7 +981,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
  */
 static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 {
-	unsigned long i, chunks, index, oldindex, bit;
+	unsigned long i, chunks, index, oldindex, bit, node_offset = 0;
 	struct page *page = NULL;
 	unsigned long bit_cnt = 0;
 	struct file *file;
@@ -986,6 +1027,9 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 	if (!bitmap->mddev->bitmap_info.external)
 		offset = sizeof(bitmap_super_t);
 
+	if (mddev_is_clustered(bitmap->mddev))
+		node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
+
 	for (i = 0; i < chunks; i++) {
 		int b;
 		index = file_page_index(&bitmap->storage, i);
@@ -1006,7 +1050,7 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
 					bitmap->mddev,
 					bitmap->mddev->bitmap_info.offset,
 					page,
-					index, count);
+					index + node_offset, count);
 
 			if (ret)
 				goto err;
@@ -1212,7 +1256,6 @@ void bitmap_daemon_work(struct mddev *mddev)
 	     j < bitmap->storage.file_pages
 		     && !test_bit(BITMAP_STALE, &bitmap->flags);
 	     j++) {
-
 		if (test_page_attr(bitmap, j,
 				   BITMAP_PAGE_DIRTY))
 			/* bitmap_unplug will handle the rest */
@@ -1596,6 +1639,9 @@ static void bitmap_free(struct bitmap *bitmap)
 	if (!bitmap) /* there was no bitmap */
 		return;
 
+	if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info)
+		md_cluster_stop(bitmap->mddev);
+
 	/* Shouldn't be needed - but just in case.... */
 	wait_event(bitmap->write_wait,
 		   atomic_read(&bitmap->pending_writes) == 0);
@@ -1854,7 +1900,8 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 	memset(&store, 0, sizeof(store));
 	if (bitmap->mddev->bitmap_info.offset || bitmap->mddev->bitmap_info.file)
 		ret = bitmap_storage_alloc(&store, chunks,
-					   !bitmap->mddev->bitmap_info.external);
+					   !bitmap->mddev->bitmap_info.external,
+					   bitmap->cluster_slot);
 	if (ret)
 		goto err;
 
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index ec9032f105b8..4e9acb08bbe0 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -227,6 +227,7 @@ struct bitmap {
 	wait_queue_head_t behind_wait;
 
 	struct kernfs_node *sysfs_can_clear;
+	int cluster_slot;		/* Slot offset for clustered env */
 };
 
 /* the bitmap API */
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 1f3c8f39ecb2..66700e244a40 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -196,6 +196,12 @@ static int join(struct mddev *mddev, int nodes)
 	if (ret)
 		goto err;
 	wait_for_completion(&cinfo->completion);
+	if (nodes <= cinfo->slot_number) {
+		pr_err("md-cluster: Slot allotted(%d) greater than available slots(%d)", cinfo->slot_number - 1,
+			nodes);
+		ret = -ERANGE;
+		goto err;
+	}
 	cinfo->sb_lock = lockres_init(mddev, "cmd-super",
 					NULL, 0);
 	if (!cinfo->sb_lock) {
-- 
cgit v1.2.1


From 54519c5f4b398bcfe599f652b4ef4004d5fa63ff Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Fri, 6 Jun 2014 12:12:32 -0500
Subject: Lock bitmap while joining the cluster

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 66700e244a40..75c6602f4c75 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -34,6 +34,7 @@ struct md_cluster_info {
 	struct completion completion;
 	struct dlm_lock_resource *sb_lock;
 	struct mutex sb_mutex;
+	struct dlm_lock_resource *bitmap_lockres;
 };
 
 static void sync_ast(void *arg)
@@ -208,6 +209,18 @@ static int join(struct mddev *mddev, int nodes)
 		ret = -ENOMEM;
 		goto err;
 	}
+
+	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
+	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
+	cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1);
+	if (!cinfo->bitmap_lockres)
+		goto err;
+	if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) {
+		pr_err("Failed to get bitmap lock\n");
+		ret = -EINVAL;
+		goto err;
+	}
+
 	return 0;
 err:
 	if (cinfo->lockspace)
@@ -225,6 +238,7 @@ static int leave(struct mddev *mddev)
 	if (!cinfo)
 		return 0;
 	lockres_free(cinfo->sb_lock);
+	lockres_free(cinfo->bitmap_lockres);
 	dlm_release_lockspace(cinfo->lockspace, 2);
 	return 0;
 }
-- 
cgit v1.2.1


From 96ae923ab659e37dd5fc1e05ecbf654e2f94bcbe Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Fri, 6 Jun 2014 12:35:34 -0500
Subject: Gather on-going resync information of other nodes

When a node joins, it does not know of other nodes performing resync.
So, each node keeps the resync information in its LVB. When a new
node joins, it reads the LVB of each "online" bitmap.

[TODO] The new node attempts to get the PW lock on the other node's
bitmap. If it is successful, it reads that bitmap and performs the
resync (if required) on that node's behalf.

If the node does not get the PW, it requests CR and reads the LVB
for the resync information.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/md-cluster.h |   1 +
 drivers/md/md.c         |   8 ++++
 3 files changed, 120 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 75c6602f4c75..b59c3a0ebd08 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -27,6 +27,18 @@ struct dlm_lock_resource {
 	struct mddev *mddev; /* pointing back to mddev. */
 };
 
+struct suspend_info {
+	int slot;
+	sector_t lo;
+	sector_t hi;
+	struct list_head list;
+};
+
+struct resync_info {
+	__le64 lo;
+	__le64 hi;
+};
+
 struct md_cluster_info {
 	/* dlm lock space and resources for clustered raid. */
 	dlm_lockspace_t *lockspace;
@@ -35,6 +47,8 @@ struct md_cluster_info {
 	struct dlm_lock_resource *sb_lock;
 	struct mutex sb_mutex;
 	struct dlm_lock_resource *bitmap_lockres;
+	struct list_head suspend_list;
+	spinlock_t suspend_lock;
 };
 
 static void sync_ast(void *arg)
@@ -139,6 +153,37 @@ static char *pretty_uuid(char *dest, char *src)
 	return dest;
 }
 
+static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
+		sector_t lo, sector_t hi)
+{
+	struct resync_info *ri;
+
+	ri = (struct resync_info *)lockres->lksb.sb_lvbptr;
+	ri->lo = cpu_to_le64(lo);
+	ri->hi = cpu_to_le64(hi);
+}
+
+static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres)
+{
+	struct resync_info ri;
+	struct suspend_info *s = NULL;
+	sector_t hi = 0;
+
+	dlm_lock_sync(lockres, DLM_LOCK_CR);
+	memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
+	hi = le64_to_cpu(ri.hi);
+	if (ri.hi > 0) {
+		s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
+		if (!s)
+			goto out;
+		s->hi = hi;
+		s->lo = le64_to_cpu(ri.lo);
+	}
+	dlm_unlock_sync(lockres);
+out:
+	return s;
+}
+
 static void recover_prep(void *arg)
 {
 }
@@ -171,6 +216,53 @@ static const struct dlm_lockspace_ops md_ls_ops = {
 	.recover_done = recover_done,
 };
 
+static int gather_all_resync_info(struct mddev *mddev, int total_slots)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	int i, ret = 0;
+	struct dlm_lock_resource *bm_lockres;
+	struct suspend_info *s;
+	char str[64];
+
+
+	for (i = 0; i < total_slots; i++) {
+		memset(str, '\0', 64);
+		snprintf(str, 64, "bitmap%04d", i);
+		bm_lockres = lockres_init(mddev, str, NULL, 1);
+		if (!bm_lockres)
+			return -ENOMEM;
+		if (i == (cinfo->slot_number - 1))
+			continue;
+
+		bm_lockres->flags |= DLM_LKF_NOQUEUE;
+		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+		if (ret == -EAGAIN) {
+			memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE);
+			s = read_resync_info(mddev, bm_lockres);
+			if (s) {
+				pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n",
+						__func__, __LINE__,
+						(unsigned long long) s->lo,
+						(unsigned long long) s->hi, i);
+				spin_lock_irq(&cinfo->suspend_lock);
+				s->slot = i;
+				list_add(&s->list, &cinfo->suspend_list);
+				spin_unlock_irq(&cinfo->suspend_lock);
+			}
+			ret = 0;
+			lockres_free(bm_lockres);
+			continue;
+		}
+		if (ret)
+			goto out;
+		/* TODO: Read the disk bitmap sb and check if it needs recovery */
+		dlm_unlock_sync(bm_lockres);
+		lockres_free(bm_lockres);
+	}
+out:
+	return ret;
+}
+
 static int join(struct mddev *mddev, int nodes)
 {
 	struct md_cluster_info *cinfo;
@@ -221,8 +313,17 @@ static int join(struct mddev *mddev, int nodes)
 		goto err;
 	}
 
+	INIT_LIST_HEAD(&cinfo->suspend_list);
+	spin_lock_init(&cinfo->suspend_lock);
+
+	ret = gather_all_resync_info(mddev, nodes);
+	if (ret)
+		goto err;
+
 	return 0;
 err:
+	lockres_free(cinfo->bitmap_lockres);
+	lockres_free(cinfo->sb_lock);
 	if (cinfo->lockspace)
 		dlm_release_lockspace(cinfo->lockspace, 2);
 	mddev->cluster_info = NULL;
@@ -254,10 +355,20 @@ static int slot_number(struct mddev *mddev)
 	return cinfo->slot_number - 1;
 }
 
+static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
+	/* Re-acquire the lock to refresh LVB */
+	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
+}
+
 static struct md_cluster_operations cluster_ops = {
 	.join   = join,
 	.leave  = leave,
 	.slot_number = slot_number,
+	.resync_info_update = resync_info_update,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 52a21e0d6dbc..51a24df15b64 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -11,6 +11,7 @@ struct md_cluster_operations {
 	int (*join)(struct mddev *mddev, int nodes);
 	int (*leave)(struct mddev *mddev);
 	int (*slot_number)(struct mddev *mddev);
+	void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8f310d98f082..71f655015385 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7626,6 +7626,9 @@ void md_do_sync(struct md_thread *thread)
 	md_new_event(mddev);
 	update_time = jiffies;
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->resync_info_update(mddev, j, max_sectors);
+
 	blk_start_plug(&plug);
 	while (j < max_sectors) {
 		sector_t sectors;
@@ -7686,6 +7689,8 @@ void md_do_sync(struct md_thread *thread)
 		j += sectors;
 		if (j > 2)
 			mddev->curr_resync = j;
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->resync_info_update(mddev, j, max_sectors);
 		mddev->curr_mark_cnt = io_sectors;
 		if (last_check == 0)
 			/* this is the earliest that rebuild will be
@@ -7746,6 +7751,9 @@ void md_do_sync(struct md_thread *thread)
 	/* tell personality that we are finished */
 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->resync_info_update(mddev, 0, 0);
+
 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
 	    mddev->curr_resync > 2) {
 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
-- 
cgit v1.2.1


From f9209a323547f054c7439a3bf67c45e64a054bdd Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Fri, 6 Jun 2014 12:43:49 -0500
Subject: bitmap_create returns bitmap pointer

This is done to have multiple bitmaps open at the same time.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/bitmap.c | 60 ++++++++++++++++++++++++++++++++---------------------
 drivers/md/bitmap.h |  2 +-
 drivers/md/md.c     | 25 ++++++++++++++++------
 3 files changed, 56 insertions(+), 31 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index b1d94eee3346..f02551f50bb5 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -553,7 +553,6 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	unsigned long sectors_reserved = 0;
 	int err = -EINVAL;
 	struct page *sb_page;
-	int cluster_setup_done = 0;
 
 	if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
 		chunksize = 128 * 1024 * 1024;
@@ -570,6 +569,18 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 	bitmap->storage.sb_page = sb_page;
 
 re_read:
+	/* If cluster_slot is set, the cluster is setup */
+	if (bitmap->cluster_slot >= 0) {
+		long long bm_blocks;
+
+		bm_blocks = bitmap->mddev->resync_max_sectors / (bitmap->mddev->bitmap_info.chunksize >> 9);
+		bm_blocks = bm_blocks << 3;
+		bm_blocks = DIV_ROUND_UP(bm_blocks, 4096);
+		bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
+		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
+			bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
+	}
+
 	if (bitmap->storage.file) {
 		loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
 		int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
@@ -650,14 +661,9 @@ re_read:
 
 out:
 	kunmap_atomic(sb);
-	if (nodes && !cluster_setup_done) {
-		sector_t bm_blocks;
-
-		bm_blocks = sector_div(bitmap->mddev->resync_max_sectors, (chunksize >> 9));
-		bm_blocks = bm_blocks << 3;
-		/* We have bitmap supers at 4k boundaries, hence this
-		 * is hardcoded */
-		bm_blocks = DIV_ROUND_UP(bm_blocks, 4096);
+	/* Assiging chunksize is required for "re_read" */
+	bitmap->mddev->bitmap_info.chunksize = chunksize;
+	if (nodes && (bitmap->cluster_slot < 0)) {
 		err = md_setup_cluster(bitmap->mddev, nodes);
 		if (err) {
 			pr_err("%s: Could not setup cluster service (%d)\n",
@@ -665,12 +671,9 @@ out:
 			goto out_no_sb;
 		}
 		bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
-		bitmap->mddev->bitmap_info.offset +=
-			bitmap->cluster_slot * (bm_blocks << 3);
 		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
 			bitmap->cluster_slot,
 			(unsigned long long)bitmap->mddev->bitmap_info.offset);
-		cluster_setup_done = 1;
 		goto re_read;
 	}
 
@@ -687,7 +690,7 @@ out_no_sb:
 		bitmap->mddev->bitmap_info.space = sectors_reserved;
 	if (err) {
 		bitmap_print_sb(bitmap);
-		if (cluster_setup_done)
+		if (bitmap->cluster_slot < 0)
 			md_cluster_stop(bitmap->mddev);
 	}
 	return err;
@@ -1639,7 +1642,8 @@ static void bitmap_free(struct bitmap *bitmap)
 	if (!bitmap) /* there was no bitmap */
 		return;
 
-	if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info)
+	if (mddev_is_clustered(bitmap->mddev) && bitmap->mddev->cluster_info &&
+		bitmap->cluster_slot == md_cluster_ops->slot_number(bitmap->mddev))
 		md_cluster_stop(bitmap->mddev);
 
 	/* Shouldn't be needed - but just in case.... */
@@ -1687,7 +1691,7 @@ void bitmap_destroy(struct mddev *mddev)
  * initialize the bitmap structure
  * if this returns an error, bitmap_destroy must be called to do clean up
  */
-int bitmap_create(struct mddev *mddev)
+struct bitmap *bitmap_create(struct mddev *mddev, int slot)
 {
 	struct bitmap *bitmap;
 	sector_t blocks = mddev->resync_max_sectors;
@@ -1701,7 +1705,7 @@ int bitmap_create(struct mddev *mddev)
 
 	bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
 	if (!bitmap)
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 
 	spin_lock_init(&bitmap->counts.lock);
 	atomic_set(&bitmap->pending_writes, 0);
@@ -1710,6 +1714,7 @@ int bitmap_create(struct mddev *mddev)
 	init_waitqueue_head(&bitmap->behind_wait);
 
 	bitmap->mddev = mddev;
+	bitmap->cluster_slot = slot;
 
 	if (mddev->kobj.sd)
 		bm = sysfs_get_dirent(mddev->kobj.sd, "bitmap");
@@ -1757,12 +1762,14 @@ int bitmap_create(struct mddev *mddev)
 	printk(KERN_INFO "created bitmap (%lu pages) for device %s\n",
 	       bitmap->counts.pages, bmname(bitmap));
 
-	mddev->bitmap = bitmap;
-	return test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+	err = test_bit(BITMAP_WRITE_ERROR, &bitmap->flags) ? -EIO : 0;
+	if (err)
+		goto error;
 
+	return bitmap;
  error:
 	bitmap_free(bitmap);
-	return err;
+	return ERR_PTR(err);
 }
 
 int bitmap_load(struct mddev *mddev)
@@ -2073,13 +2080,18 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
 				return -EINVAL;
 			mddev->bitmap_info.offset = offset;
 			if (mddev->pers) {
+				struct bitmap *bitmap;
 				mddev->pers->quiesce(mddev, 1);
-				rv = bitmap_create(mddev);
-				if (!rv)
+				bitmap = bitmap_create(mddev, -1);
+				if (IS_ERR(bitmap))
+					rv = PTR_ERR(bitmap);
+				else {
+					mddev->bitmap = bitmap;
 					rv = bitmap_load(mddev);
-				if (rv) {
-					bitmap_destroy(mddev);
-					mddev->bitmap_info.offset = 0;
+					if (rv) {
+						bitmap_destroy(mddev);
+						mddev->bitmap_info.offset = 0;
+					}
 				}
 				mddev->pers->quiesce(mddev, 0);
 				if (rv)
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 4e9acb08bbe0..67c7f77c67dd 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -233,7 +233,7 @@ struct bitmap {
 /* the bitmap API */
 
 /* these are used only by md/bitmap */
-int  bitmap_create(struct mddev *mddev);
+struct bitmap *bitmap_create(struct mddev *mddev, int slot);
 int bitmap_load(struct mddev *mddev);
 void bitmap_flush(struct mddev *mddev);
 void bitmap_destroy(struct mddev *mddev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 71f655015385..630a9142a819 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5076,10 +5076,16 @@ int md_run(struct mddev *mddev)
 	}
 	if (err == 0 && pers->sync_request &&
 	    (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
-		err = bitmap_create(mddev);
-		if (err)
+		struct bitmap *bitmap;
+
+		bitmap = bitmap_create(mddev, -1);
+		if (IS_ERR(bitmap)) {
+			err = PTR_ERR(bitmap);
 			printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
 			       mdname(mddev), err);
+		} else
+			mddev->bitmap = bitmap;
+
 	}
 	if (err) {
 		mddev_detach(mddev);
@@ -6039,9 +6045,13 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 	if (mddev->pers) {
 		mddev->pers->quiesce(mddev, 1);
 		if (fd >= 0) {
-			err = bitmap_create(mddev);
-			if (!err)
+			struct bitmap *bitmap;
+
+			bitmap = bitmap_create(mddev, -1);
+			if (!IS_ERR(bitmap)) {
+				mddev->bitmap = bitmap;
 				err = bitmap_load(mddev);
+			}
 		}
 		if (fd < 0 || err) {
 			bitmap_destroy(mddev);
@@ -6306,6 +6316,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		if (mddev->recovery || mddev->sync_thread)
 			return -EBUSY;
 		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
+			struct bitmap *bitmap;
 			/* add the bitmap */
 			if (mddev->bitmap)
 				return -EEXIST;
@@ -6316,9 +6327,11 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 			mddev->bitmap_info.space =
 				mddev->bitmap_info.default_space;
 			mddev->pers->quiesce(mddev, 1);
-			rv = bitmap_create(mddev);
-			if (!rv)
+			bitmap = bitmap_create(mddev, -1);
+			if (!IS_ERR(bitmap)) {
+				mddev->bitmap = bitmap;
 				rv = bitmap_load(mddev);
+			}
 			if (rv)
 				bitmap_destroy(mddev);
 			mddev->pers->quiesce(mddev, 0);
-- 
cgit v1.2.1


From 11dd35daaab86d12270d23a10e8d242846a8830a Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 7 Jun 2014 00:36:26 -0500
Subject: Copy set bits from another slot

bitmap_copy_from_slot() reads the bitmap from the given slot.
It then copies the set bits to the node-local bitmap.

This is a helper function for the resync operation on node failure.

bitmap_set_memory_bits() currently assumes it is only run at startup and that
the bitmap is currently empty.  So if it finds that a region is already
marked as dirty, it won't mark it dirty again. Change bitmap_set_memory_bits()
to always set the NEEDED_MASK bit if 'needed' is set.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/bitmap.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/bitmap.h |  2 ++
 2 files changed, 79 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index f02551f50bb5..dd8c78043eab 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -934,6 +934,28 @@ static void bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
 	}
 }
 
+static int bitmap_file_test_bit(struct bitmap *bitmap, sector_t block)
+{
+	unsigned long bit;
+	struct page *page;
+	void *paddr;
+	unsigned long chunk = block >> bitmap->counts.chunkshift;
+	int set = 0;
+
+	page = filemap_get_page(&bitmap->storage, chunk);
+	if (!page)
+		return -EINVAL;
+	bit = file_page_offset(&bitmap->storage, chunk);
+	paddr = kmap_atomic(page);
+	if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
+		set = test_bit(bit, paddr);
+	else
+		set = test_bit_le(bit, paddr);
+	kunmap_atomic(paddr);
+	return set;
+}
+
+
 /* this gets called when the md device is ready to unplug its underlying
  * (slave) device queues -- before we let any writes go down, we need to
  * sync the dirty pages of the bitmap file to disk */
@@ -1581,11 +1603,13 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
 		return;
 	}
 	if (!*bmc) {
-		*bmc = 2 | (needed ? NEEDED_MASK : 0);
+		*bmc = 2;
 		bitmap_count_page(&bitmap->counts, offset, 1);
 		bitmap_set_pending(&bitmap->counts, offset);
 		bitmap->allclean = 0;
 	}
+	if (needed)
+		*bmc |= NEEDED_MASK;
 	spin_unlock_irq(&bitmap->counts.lock);
 }
 
@@ -1823,6 +1847,58 @@ out:
 }
 EXPORT_SYMBOL_GPL(bitmap_load);
 
+/* Loads the bitmap associated with slot and copies the resync information
+ * to our bitmap
+ */
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+		sector_t *low, sector_t *high)
+{
+	int rv = 0, i, j;
+	sector_t block, lo = 0, hi = 0;
+	struct bitmap_counts *counts;
+	struct bitmap *bitmap = bitmap_create(mddev, slot);
+
+	if (IS_ERR(bitmap))
+		return PTR_ERR(bitmap);
+
+	rv = bitmap_read_sb(bitmap);
+	if (rv)
+		goto err;
+
+	rv = bitmap_init_from_disk(bitmap, 0);
+	if (rv)
+		goto err;
+
+	counts = &bitmap->counts;
+	for (j = 0; j < counts->chunks; j++) {
+		block = (sector_t)j << counts->chunkshift;
+		if (bitmap_file_test_bit(bitmap, block)) {
+			if (!lo)
+				lo = block;
+			hi = block;
+			bitmap_file_clear_bit(bitmap, block);
+			bitmap_set_memory_bits(mddev->bitmap, block, 1);
+			bitmap_file_set_bit(mddev->bitmap, block);
+		}
+	}
+
+	bitmap_update_sb(bitmap);
+	/* Setting this for the ev_page should be enough.
+	 * And we do not require both write_all and PAGE_DIRT either
+	 */
+	for (i = 0; i < bitmap->storage.file_pages; i++)
+		set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+	bitmap_write_all(bitmap);
+	bitmap_unplug(bitmap);
+	*low = lo;
+	*high = hi;
+err:
+	bitmap_free(bitmap);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(bitmap_copy_from_slot);
+
+
 void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 {
 	unsigned long chunk_kb;
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 67c7f77c67dd..4aabc74ef7b9 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -262,6 +262,8 @@ void bitmap_daemon_work(struct mddev *mddev);
 
 int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		  int chunksize, int init);
+int bitmap_copy_from_slot(struct mddev *mddev, int slot,
+				sector_t *lo, sector_t *hi);
 #endif
 
 #endif
-- 
cgit v1.2.1


From e94987db2ed983aea4e45d22db9e17c6bbf2a623 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 7 Jun 2014 00:45:22 -0500
Subject: Initiate recovery on node failure

On node failure, the DLM informs us of the failed node's DLM slot number.
The bit corresponding to that slot number is set in
cluster_info->recovery_map and the recovery thread is woken up.

The recovery thread:
1. Derives the slot number from the recovery_map
2. Locks the bitmap corresponding to the slot
3. Copies the set bits to the node-local bitmap

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index b59c3a0ebd08..1f82d0d731ae 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -13,6 +13,7 @@
 #include <linux/dlm.h>
 #include <linux/sched.h>
 #include "md.h"
+#include "bitmap.h"
 #include "md-cluster.h"
 
 #define LVB_SIZE	64
@@ -49,6 +50,8 @@ struct md_cluster_info {
 	struct dlm_lock_resource *bitmap_lockres;
 	struct list_head suspend_list;
 	spinlock_t suspend_lock;
+	struct md_thread *recovery_thread;
+	unsigned long recovery_map;
 };
 
 static void sync_ast(void *arg)
@@ -184,6 +187,50 @@ out:
 	return s;
 }
 
+void recover_bitmaps(struct md_thread *thread)
+{
+	struct mddev *mddev = thread->mddev;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct dlm_lock_resource *bm_lockres;
+	char str[64];
+	int slot, ret;
+	struct suspend_info *s, *tmp;
+	sector_t lo, hi;
+
+	while (cinfo->recovery_map) {
+		slot = fls64((u64)cinfo->recovery_map) - 1;
+
+		/* Clear suspend_area associated with the bitmap */
+		spin_lock_irq(&cinfo->suspend_lock);
+		list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
+			if (slot == s->slot) {
+				list_del(&s->list);
+				kfree(s);
+			}
+		spin_unlock_irq(&cinfo->suspend_lock);
+
+		snprintf(str, 64, "bitmap%04d", slot);
+		bm_lockres = lockres_init(mddev, str, NULL, 1);
+		if (!bm_lockres) {
+			pr_err("md-cluster: Cannot initialize bitmaps\n");
+			goto clear_bit;
+		}
+
+		ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW);
+		if (ret) {
+			pr_err("md-cluster: Could not DLM lock %s: %d\n",
+					str, ret);
+			goto clear_bit;
+		}
+		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi);
+		if (ret)
+			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
+		dlm_unlock_sync(bm_lockres);
+clear_bit:
+		clear_bit(slot, &cinfo->recovery_map);
+	}
+}
+
 static void recover_prep(void *arg)
 {
 }
@@ -197,6 +244,16 @@ static void recover_slot(void *arg, struct dlm_slot *slot)
 			mddev->bitmap_info.cluster_name,
 			slot->nodeid, slot->slot,
 			cinfo->slot_number);
+	set_bit(slot->slot - 1, &cinfo->recovery_map);
+	if (!cinfo->recovery_thread) {
+		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
+				mddev, "recover");
+		if (!cinfo->recovery_thread) {
+			pr_warn("md-cluster: Could not create recovery thread\n");
+			return;
+		}
+	}
+	md_wakeup_thread(cinfo->recovery_thread);
 }
 
 static void recover_done(void *arg, struct dlm_slot *slots,
@@ -338,6 +395,7 @@ static int leave(struct mddev *mddev)
 
 	if (!cinfo)
 		return 0;
+	md_unregister_thread(&cinfo->recovery_thread);
 	lockres_free(cinfo->sb_lock);
 	lockres_free(cinfo->bitmap_lockres);
 	dlm_release_lockspace(cinfo->lockspace, 2);
-- 
cgit v1.2.1


From 4b26a08af92c0d9c0bce07612b56ff326112321a Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 7 Jun 2014 00:52:29 -0500
Subject: Perform resync for cluster node failure

If bitmap_copy_from_slot() returns hi > 0, we need to perform a resync.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 1f82d0d731ae..d2987130be34 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -223,8 +223,18 @@ void recover_bitmaps(struct md_thread *thread)
 			goto clear_bit;
 		}
 		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi);
-		if (ret)
+		if (ret) {
 			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
+			goto dlm_unlock;
+		}
+		if (hi > 0) {
+			/* TODO:Wait for current resync to get over */
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			if (lo < mddev->recovery_cp)
+				mddev->recovery_cp = lo;
+			md_check_recovery(mddev);
+		}
+dlm_unlock:
 		dlm_unlock_sync(bm_lockres);
 clear_bit:
 		clear_bit(slot, &cinfo->recovery_map);
-- 
cgit v1.2.1


From 4664680c389828928efc61ce3d2cf2c65ad35c97 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 7 Jun 2014 01:08:29 -0500
Subject: Communication Framework: Receiving

1. receive status

   sender                         receiver                   receiver
   ACK:CR                          ACK:CR                     ACK:CR

2. sender get EX of TOKEN
   sender get EX of MESSAGE
   sender                          receiver                   receiver
   TOKEN:EX                         ACK:CR                     ACK:CR
   MESSAGE:EX
   ACK:CR

3. sender write LVB.
   sender down-convert MESSAGE from EX to CR
   sender try to get EX of ACK
   [ wait until all receivers have *processed* the MESSAGE ]

                                     [ triggered by bast of ACK ]
                                     receiver get CR of MESSAGE
                                     receiver read LVB
                                     receiver processes the message
				     [ wait finish ]
                                     receiver release ACK

   sender                         receiver                   receiver
   TOKEN:EX                       MESSAGE:CR                 MESSAGE:CR
   MESSAGE:CR
   ACK:EX

4. sender down-convert ACK from EX to CR
   sender release MESSAGE
   sender release TOKEN
				  receiver upconvert to EX of MESSAGE
                                  receiver get CR of ACK
				  receiver release MESSAGE

   sender                        receiver                   receiver
   ACK:CR                         ACK:CR                     ACK:CR

Signed-off-by: Lidong Zhong <lzhong@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index d2987130be34..96734cdb9b45 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -52,6 +52,23 @@ struct md_cluster_info {
 	spinlock_t suspend_lock;
 	struct md_thread *recovery_thread;
 	unsigned long recovery_map;
+	/* communication loc resources */
+	struct dlm_lock_resource *ack_lockres;
+	struct dlm_lock_resource *message_lockres;
+	struct dlm_lock_resource *token_lockres;
+	struct md_thread *recv_thread;
+};
+
+enum msg_type {
+	METADATA_UPDATED = 0,
+	RESYNCING,
+};
+
+struct cluster_msg {
+	int type;
+	int slot;
+	sector_t low;
+	sector_t high;
 };
 
 static void sync_ast(void *arg)
@@ -283,6 +300,64 @@ static const struct dlm_lockspace_ops md_ls_ops = {
 	.recover_done = recover_done,
 };
 
+/*
+ * The BAST function for the ack lock resource
+ * This function wakes up the receive thread in
+ * order to receive and process the message.
+ */
+static void ack_bast(void *arg, int mode)
+{
+	struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
+	struct md_cluster_info *cinfo = res->mddev->cluster_info;
+
+	if (mode == DLM_LOCK_EX)
+		md_wakeup_thread(cinfo->recv_thread);
+}
+
+static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
+{
+	switch (msg->type) {
+	case METADATA_UPDATED:
+		pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
+			__func__, __LINE__, msg->slot);
+		break;
+	case RESYNCING:
+		pr_info("%s: %d Received message: RESYNCING from %d\n",
+			__func__, __LINE__, msg->slot);
+		break;
+	};
+}
+
+/*
+ * thread for receiving message
+ */
+static void recv_daemon(struct md_thread *thread)
+{
+	struct md_cluster_info *cinfo = thread->mddev->cluster_info;
+	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
+	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
+	struct cluster_msg msg;
+
+	/*get CR on Message*/
+	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
+		pr_err("md/raid1:failed to get CR on MESSAGE\n");
+		return;
+	}
+
+	/* read lvb and wake up thread to process this message_lockres */
+	memcpy(&msg, message_lockres->lksb.sb_lvbptr, sizeof(struct cluster_msg));
+	process_recvd_msg(thread->mddev, &msg);
+
+	/*release CR on ack_lockres*/
+	dlm_unlock_sync(ack_lockres);
+	/*up-convert to EX on message_lockres*/
+	dlm_lock_sync(message_lockres, DLM_LOCK_EX);
+	/*get CR on ack_lockres again*/
+	dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+	/*release CR on message_lockres*/
+	dlm_unlock_sync(message_lockres);
+}
+
 static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
@@ -368,6 +443,26 @@ static int join(struct mddev *mddev, int nodes)
 		ret = -ENOMEM;
 		goto err;
 	}
+	/* Initiate the communication resources */
+	ret = -ENOMEM;
+	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
+	if (!cinfo->recv_thread) {
+		pr_err("md-cluster: cannot allocate memory for recv_thread!\n");
+		goto err;
+	}
+	cinfo->message_lockres = lockres_init(mddev, "message", NULL, 1);
+	if (!cinfo->message_lockres)
+		goto err;
+	cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0);
+	if (!cinfo->token_lockres)
+		goto err;
+	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
+	if (!cinfo->ack_lockres)
+		goto err;
+	/* get sync CR lock on ACK. */
+	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
+		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
+				ret);
 
 	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
 	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
@@ -389,6 +484,9 @@ static int join(struct mddev *mddev, int nodes)
 
 	return 0;
 err:
+	lockres_free(cinfo->message_lockres);
+	lockres_free(cinfo->token_lockres);
+	lockres_free(cinfo->ack_lockres);
 	lockres_free(cinfo->bitmap_lockres);
 	lockres_free(cinfo->sb_lock);
 	if (cinfo->lockspace)
@@ -406,6 +504,10 @@ static int leave(struct mddev *mddev)
 	if (!cinfo)
 		return 0;
 	md_unregister_thread(&cinfo->recovery_thread);
+	md_unregister_thread(&cinfo->recv_thread);
+	lockres_free(cinfo->message_lockres);
+	lockres_free(cinfo->token_lockres);
+	lockres_free(cinfo->ack_lockres);
 	lockres_free(cinfo->sb_lock);
 	lockres_free(cinfo->bitmap_lockres);
 	dlm_release_lockspace(cinfo->lockspace, 2);
-- 
cgit v1.2.1


From 601b515c5dcc00fa71148cd9d2405ea1f70bc9cd Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 7 Jun 2014 01:28:53 -0500
Subject: Communication Framework: Sending functions

The sending part is split into two functions to ensure the
atomicity of operations such as the MD superblock update.

Signed-off-by: Lidong Zhong <lzhong@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 96734cdb9b45..9a4abe1b4aa4 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -358,6 +358,93 @@ static void recv_daemon(struct md_thread *thread)
 	dlm_unlock_sync(message_lockres);
 }
 
+/* lock_comm()
+ * Takes the lock on the TOKEN lock resource so no other
+ * node can communicate while the operation is underway.
+ */
+static int lock_comm(struct md_cluster_info *cinfo)
+{
+	int error;
+
+	error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
+	if (error)
+		pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
+				__func__, __LINE__, error);
+	return error;
+}
+
+static void unlock_comm(struct md_cluster_info *cinfo)
+{
+	dlm_unlock_sync(cinfo->token_lockres);
+}
+
+/* __sendmsg()
+ * This function performs the actual sending of the message. This function is
+ * usually called after performing the encompassing operation
+ * The function:
+ * 1. Grabs the message lockresource in EX mode
+ * 2. Copies the message to the message LVB
+ * 3. Downconverts message lockresource to CR
+ * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
+ *    and the other nodes read the message. The thread will wait here until all other
+ *    nodes have released ack lock resource.
+ * 5. Downconvert ack lockresource to CR
+ */
+static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
+{
+	int error;
+	int slot = cinfo->slot_number - 1;
+
+	cmsg->slot = cpu_to_le32(slot);
+	/*get EX on Message*/
+	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_EX);
+	if (error) {
+		pr_err("md-cluster: failed to get EX on MESSAGE (%d)\n", error);
+		goto failed_message;
+	}
+
+	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
+			sizeof(struct cluster_msg));
+	/*down-convert EX to CR on Message*/
+	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
+	if (error) {
+		pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
+				error);
+		goto failed_message;
+	}
+
+	/*up-convert CR to EX on Ack*/
+	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_EX);
+	if (error) {
+		pr_err("md-cluster: failed to convert CR to EX on ACK(%d)\n",
+				error);
+		goto failed_ack;
+	}
+
+	/*down-convert EX to CR on Ack*/
+	error = dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR);
+	if (error) {
+		pr_err("md-cluster: failed to convert EX to CR on ACK(%d)\n",
+				error);
+		goto failed_ack;
+	}
+
+failed_ack:
+	dlm_unlock_sync(cinfo->message_lockres);
+failed_message:
+	return error;
+}
+
+static int sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
+{
+	int ret;
+
+	lock_comm(cinfo);
+	ret = __sendmsg(cinfo, cmsg);
+	unlock_comm(cinfo);
+	return ret;
+}
+
 static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-- 
cgit v1.2.1


From 293467aa1f161cd50920ccf7fc1efa3946a4d50c Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 7 Jun 2014 01:44:51 -0500
Subject: metadata_update sends message to other nodes

   - request to send a message
   - make changes to superblock
   - send messages telling everyone that the superblock has changed
   - other nodes all read the superblock
   - other nodes all ack the messages
   - updating node release the "I'm sending a message" resource.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 28 ++++++++++++++++
 drivers/md/md-cluster.h |  3 ++
 drivers/md/md.c         | 89 +++++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 106 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 9a4abe1b4aa4..5db491010835 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -621,11 +621,39 @@ static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
 	dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
 }
 
+static int metadata_update_start(struct mddev *mddev)
+{
+	return lock_comm(mddev->cluster_info);
+}
+
+static int metadata_update_finish(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg;
+	int ret;
+
+	memset(&cmsg, 0, sizeof(cmsg));
+	cmsg.type = cpu_to_le32(METADATA_UPDATED);
+	ret = __sendmsg(cinfo, &cmsg);
+	unlock_comm(cinfo);
+	return ret;
+}
+
+static int metadata_update_cancel(struct mddev *mddev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	return dlm_unlock_sync(cinfo->token_lockres);
+}
+
 static struct md_cluster_operations cluster_ops = {
 	.join   = join,
 	.leave  = leave,
 	.slot_number = slot_number,
 	.resync_info_update = resync_info_update,
+	.metadata_update_start = metadata_update_start,
+	.metadata_update_finish = metadata_update_finish,
+	.metadata_update_cancel = metadata_update_cancel,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 51a24df15b64..658982afcf9b 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -12,6 +12,9 @@ struct md_cluster_operations {
 	int (*leave)(struct mddev *mddev);
 	int (*slot_number)(struct mddev *mddev);
 	void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+	int (*metadata_update_start)(struct mddev *mddev);
+	int (*metadata_update_finish)(struct mddev *mddev);
+	int (*metadata_update_cancel)(struct mddev *mddev);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 630a9142a819..0052e433d8a6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2472,10 +2472,14 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 			err = -EBUSY;
 		else {
 			struct mddev *mddev = rdev->mddev;
+			if (mddev_is_clustered(mddev))
+				md_cluster_ops->metadata_update_start(mddev);
 			kick_rdev_from_array(rdev);
 			if (mddev->pers)
 				md_update_sb(mddev, 1);
 			md_new_event(mddev);
+			if (mddev_is_clustered(mddev))
+				md_cluster_ops->metadata_update_finish(mddev);
 			err = 0;
 		}
 	} else if (cmd_match(buf, "writemostly")) {
@@ -4008,8 +4012,12 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
 	if (err)
 		return err;
 	if (mddev->pers) {
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->metadata_update_start(mddev);
 		err = update_size(mddev, sectors);
 		md_update_sb(mddev, 1);
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->metadata_update_finish(mddev);
 	} else {
 		if (mddev->dev_sectors == 0 ||
 		    mddev->dev_sectors > sectors)
@@ -5236,6 +5244,8 @@ static void md_clean(struct mddev *mddev)
 
 static void __md_stop_writes(struct mddev *mddev)
 {
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_start(mddev);
 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	flush_workqueue(md_misc_wq);
 	if (mddev->sync_thread) {
@@ -5254,6 +5264,8 @@ static void __md_stop_writes(struct mddev *mddev)
 		mddev->in_sync = 1;
 		md_update_sb(mddev, 1);
 	}
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_finish(mddev);
 }
 
 void md_stop_writes(struct mddev *mddev)
@@ -5902,6 +5914,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 	if (!rdev)
 		return -ENXIO;
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_start(mddev);
+
 	clear_bit(Blocked, &rdev->flags);
 	remove_and_add_spares(mddev, rdev);
 
@@ -5912,8 +5927,13 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 	md_update_sb(mddev, 1);
 	md_new_event(mddev);
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_finish(mddev);
+
 	return 0;
 busy:
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_cancel(mddev);
 	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
 		bdevname(rdev->bdev,b), mdname(mddev));
 	return -EBUSY;
@@ -5963,12 +5983,15 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 		err = -EINVAL;
 		goto abort_export;
 	}
+
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_start(mddev);
 	clear_bit(In_sync, &rdev->flags);
 	rdev->desc_nr = -1;
 	rdev->saved_raid_disk = -1;
 	err = bind_rdev_to_array(rdev, mddev);
 	if (err)
-		goto abort_export;
+		goto abort_clustered;
 
 	/*
 	 * The rest should better be atomic, we can have disk failures
@@ -5979,6 +6002,8 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 
 	md_update_sb(mddev, 1);
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_finish(mddev);
 	/*
 	 * Kick recovery, maybe this spare has to be added to the
 	 * array immediately.
@@ -5988,6 +6013,9 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
 	md_new_event(mddev);
 	return 0;
 
+abort_clustered:
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_cancel(mddev);
 abort_export:
 	export_rdev(rdev);
 	return err;
@@ -6304,6 +6332,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 			return rv;
 		}
 	}
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_start(mddev);
 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
 		rv = update_size(mddev, (sector_t)info->size * 2);
 
@@ -6311,17 +6341,25 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		rv = update_raid_disks(mddev, info->raid_disks);
 
 	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
-		if (mddev->pers->quiesce == NULL || mddev->thread == NULL)
-			return -EINVAL;
-		if (mddev->recovery || mddev->sync_thread)
-			return -EBUSY;
+		if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
+			rv = -EINVAL;
+			goto err;
+		}
+		if (mddev->recovery || mddev->sync_thread) {
+			rv = -EBUSY;
+			goto err;
+		}
 		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
 			struct bitmap *bitmap;
 			/* add the bitmap */
-			if (mddev->bitmap)
-				return -EEXIST;
-			if (mddev->bitmap_info.default_offset == 0)
-				return -EINVAL;
+			if (mddev->bitmap) {
+				rv = -EEXIST;
+				goto err;
+			}
+			if (mddev->bitmap_info.default_offset == 0) {
+				rv = -EINVAL;
+				goto err;
+			}
 			mddev->bitmap_info.offset =
 				mddev->bitmap_info.default_offset;
 			mddev->bitmap_info.space =
@@ -6337,10 +6375,14 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 			mddev->pers->quiesce(mddev, 0);
 		} else {
 			/* remove the bitmap */
-			if (!mddev->bitmap)
-				return -ENOENT;
-			if (mddev->bitmap->storage.file)
-				return -EINVAL;
+			if (!mddev->bitmap) {
+				rv = -ENOENT;
+				goto err;
+			}
+			if (mddev->bitmap->storage.file) {
+				rv = -EINVAL;
+				goto err;
+			}
 			mddev->pers->quiesce(mddev, 1);
 			bitmap_destroy(mddev);
 			mddev->pers->quiesce(mddev, 0);
@@ -6348,6 +6390,12 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 		}
 	}
 	md_update_sb(mddev, 1);
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_finish(mddev);
+	return rv;
+err:
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_cancel(mddev);
 	return rv;
 }
 
@@ -7438,7 +7486,11 @@ int md_allow_write(struct mddev *mddev)
 		    mddev->safemode == 0)
 			mddev->safemode = 1;
 		spin_unlock(&mddev->lock);
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->metadata_update_start(mddev);
 		md_update_sb(mddev, 0);
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->metadata_update_finish(mddev);
 		sysfs_notify_dirent_safe(mddev->sysfs_state);
 	} else
 		spin_unlock(&mddev->lock);
@@ -7996,8 +8048,13 @@ void md_check_recovery(struct mddev *mddev)
 				sysfs_notify_dirent_safe(mddev->sysfs_state);
 		}
 
-		if (mddev->flags & MD_UPDATE_SB_FLAGS)
+		if (mddev->flags & MD_UPDATE_SB_FLAGS) {
+			if (mddev_is_clustered(mddev))
+				md_cluster_ops->metadata_update_start(mddev);
 			md_update_sb(mddev, 0);
+			if (mddev_is_clustered(mddev))
+				md_cluster_ops->metadata_update_finish(mddev);
+		}
 
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
@@ -8095,6 +8152,8 @@ void md_reap_sync_thread(struct mddev *mddev)
 			set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		}
 	}
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_start(mddev);
 	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
 	    mddev->pers->finish_reshape)
 		mddev->pers->finish_reshape(mddev);
@@ -8107,6 +8166,8 @@ void md_reap_sync_thread(struct mddev *mddev)
 			rdev->saved_raid_disk = -1;
 
 	md_update_sb(mddev, 1);
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->metadata_update_finish(mddev);
 	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
-- 
cgit v1.2.1


From 1d7e3e96117a864fe2ab3d02a14e49855319fdde Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 7 Jun 2014 01:53:00 -0500
Subject: Reload superblock if METADATA_UPDATED is received

Re-reads the devices by invalidating the cache.
Since we don't write to faulty devices, a faulty device is detected using
the events count recorded on each device. If a device's events count is
older than the mddev's, the device is marked as faulty.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c |  1 +
 drivers/md/md.c         | 22 ++++++++++++++++++++++
 drivers/md/md.h         |  1 +
 3 files changed, 24 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 5db491010835..7e419f05b568 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -320,6 +320,7 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 	case METADATA_UPDATED:
 		pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
 			__func__, __LINE__, msg->slot);
+		md_reload_sb(mddev);
 		break;
 	case RESYNCING:
 		pr_info("%s: %d Received message: RESYNCING from %d\n",
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0052e433d8a6..3eb45dc0537f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8788,6 +8788,28 @@ err_wq:
 	return ret;
 }
 
+void md_reload_sb(struct mddev *mddev)
+{
+	struct md_rdev *rdev, *tmp;
+
+	rdev_for_each_safe(rdev, tmp, mddev) {
+		rdev->sb_loaded = 0;
+		ClearPageUptodate(rdev->sb_page);
+	}
+	mddev->raid_disks = 0;
+	analyze_sbs(mddev);
+	rdev_for_each_safe(rdev, tmp, mddev) {
+		struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+		/* since we don't write to faulty devices, we figure out if the
+		 *  disk is faulty by comparing events
+		 */
+		if (mddev->events > sb->events)
+			set_bit(Faulty, &rdev->flags);
+	}
+
+}
+EXPORT_SYMBOL(md_reload_sb);
+
 #ifndef MODULE
 
 /*
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 81e568090d8f..bfebcfdf54e6 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -665,6 +665,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 				   struct mddev *mddev);
 
 extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
+extern void md_reload_sb(struct mddev *mddev);
 static inline int mddev_check_plugged(struct mddev *mddev)
 {
 	return !!blk_check_plugged(md_unplug, mddev,
-- 
cgit v1.2.1


From 965400eb615dfb32d62cb3319a895bd94eb9f3b4 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 7 Jun 2014 02:16:58 -0500
Subject: Send RESYNCING while performing resync start/stop

When a resync is initiated, RESYNCING message is sent to all active
nodes with the range (lo,hi). When the resync is over, a RESYNCING
message is sent with (0,0). A high sector value of zero indicates
that the resync is over.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 32 ++++++++++++++++++++++++++++++++
 drivers/md/md-cluster.h |  2 ++
 drivers/md/md.c         |  4 ++--
 3 files changed, 36 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 7e419f05b568..6428cc3ce38d 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -647,11 +647,43 @@ static int metadata_update_cancel(struct mddev *mddev)
 	return dlm_unlock_sync(cinfo->token_lockres);
 }
 
+static int resync_send(struct mddev *mddev, enum msg_type type,
+		sector_t lo, sector_t hi)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg;
+	int slot = cinfo->slot_number - 1;
+
+	pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
+			(unsigned long long)lo,
+			(unsigned long long)hi);
+	resync_info_update(mddev, lo, hi);
+	cmsg.type = cpu_to_le32(type);
+	cmsg.slot = cpu_to_le32(slot);
+	cmsg.low = cpu_to_le64(lo);
+	cmsg.high = cpu_to_le64(hi);
+	return sendmsg(cinfo, &cmsg);
+}
+
+static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
+{
+	pr_info("%s:%d\n", __func__, __LINE__);
+	return resync_send(mddev, RESYNCING, lo, hi);
+}
+
+static void resync_finish(struct mddev *mddev)
+{
+	pr_info("%s:%d\n", __func__, __LINE__);
+	resync_send(mddev, RESYNCING, 0, 0);
+}
+
 static struct md_cluster_operations cluster_ops = {
 	.join   = join,
 	.leave  = leave,
 	.slot_number = slot_number,
 	.resync_info_update = resync_info_update,
+	.resync_start = resync_start,
+	.resync_finish = resync_finish,
 	.metadata_update_start = metadata_update_start,
 	.metadata_update_finish = metadata_update_finish,
 	.metadata_update_cancel = metadata_update_cancel,
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 658982afcf9b..054f9eafa065 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -12,6 +12,8 @@ struct md_cluster_operations {
 	int (*leave)(struct mddev *mddev);
 	int (*slot_number)(struct mddev *mddev);
 	void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
+	int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
+	void (*resync_finish)(struct mddev *mddev);
 	int (*metadata_update_start)(struct mddev *mddev);
 	int (*metadata_update_finish)(struct mddev *mddev);
 	int (*metadata_update_cancel)(struct mddev *mddev);
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3eb45dc0537f..a1af24d369fc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7692,7 +7692,7 @@ void md_do_sync(struct md_thread *thread)
 	update_time = jiffies;
 
 	if (mddev_is_clustered(mddev))
-		md_cluster_ops->resync_info_update(mddev, j, max_sectors);
+		md_cluster_ops->resync_start(mddev, j, max_sectors);
 
 	blk_start_plug(&plug);
 	while (j < max_sectors) {
@@ -7817,7 +7817,7 @@ void md_do_sync(struct md_thread *thread)
 	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
 
 	if (mddev_is_clustered(mddev))
-		md_cluster_ops->resync_info_update(mddev, 0, 0);
+		md_cluster_ops->resync_finish(mddev);
 
 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
 	    mddev->curr_resync > 2) {
-- 
cgit v1.2.1


From e59721ccdc65dd4fbd8f311a063ecc8f6232dbcc Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 7 Jun 2014 02:30:30 -0500
Subject: Resync start/Finish actions

When a RESYNC_START message arrives, the node removes the entry
with the current slot number and adds the range to the
suspend_list.

Similarly, when a RESYNC_FINISHED message is received, the node clears
the entry corresponding to the bitmap number.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 6428cc3ce38d..6b0dffebc90f 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -314,6 +314,50 @@ static void ack_bast(void *arg, int mode)
 		md_wakeup_thread(cinfo->recv_thread);
 }
 
+static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
+{
+	struct suspend_info *s, *tmp;
+
+	list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
+		if (slot == s->slot) {
+			pr_info("%s:%d Deleting suspend_info: %d\n",
+					__func__, __LINE__, slot);
+			list_del(&s->list);
+			kfree(s);
+			break;
+		}
+}
+
+static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
+{
+	spin_lock_irq(&cinfo->suspend_lock);
+	__remove_suspend_info(cinfo, slot);
+	spin_unlock_irq(&cinfo->suspend_lock);
+}
+
+
+static void process_suspend_info(struct md_cluster_info *cinfo,
+		int slot, sector_t lo, sector_t hi)
+{
+	struct suspend_info *s;
+
+	if (!hi) {
+		remove_suspend_info(cinfo, slot);
+		return;
+	}
+	s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
+	if (!s)
+		return;
+	s->slot = slot;
+	s->lo = lo;
+	s->hi = hi;
+	spin_lock_irq(&cinfo->suspend_lock);
+	/* Remove existing entry (if exists) before adding */
+	__remove_suspend_info(cinfo, slot);
+	list_add(&s->list, &cinfo->suspend_list);
+	spin_unlock_irq(&cinfo->suspend_lock);
+}
+
 static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 {
 	switch (msg->type) {
@@ -325,6 +369,8 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 	case RESYNCING:
 		pr_info("%s: %d Received message: RESYNCING from %d\n",
 			__func__, __LINE__, msg->slot);
+		process_suspend_info(mddev->cluster_info, msg->slot,
+				msg->low, msg->high);
 		break;
 	};
 }
-- 
cgit v1.2.1


From 589a1c491621ab81a1955d17d634636522c1b4c1 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Sat, 7 Jun 2014 02:39:37 -0500
Subject: Suspend writes in RAID1 if within range

If there is a resync going on, all nodes must suspend writes to the
range. This is recorded in the suspend_info/suspend_list.

If an I/O falls within the range of any of the suspend_info entries,
area_resyncing() will return 1.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 20 ++++++++++++++++++++
 drivers/md/md-cluster.h |  1 +
 drivers/md/md.c         |  1 +
 drivers/md/raid1.c      | 11 ++++++++---
 4 files changed, 30 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 6b0dffebc90f..d85a6ca4443e 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -723,6 +723,25 @@ static void resync_finish(struct mddev *mddev)
 	resync_send(mddev, RESYNCING, 0, 0);
 }
 
+static int area_resyncing(struct mddev *mddev, sector_t lo, sector_t hi)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	int ret = 0;
+	struct suspend_info *s;
+
+	spin_lock_irq(&cinfo->suspend_lock);
+	if (list_empty(&cinfo->suspend_list))
+		goto out;
+	list_for_each_entry(s, &cinfo->suspend_list, list)
+		if (hi > s->lo && lo < s->hi) {
+			ret = 1;
+			break;
+		}
+out:
+	spin_unlock_irq(&cinfo->suspend_lock);
+	return ret;
+}
+
 static struct md_cluster_operations cluster_ops = {
 	.join   = join,
 	.leave  = leave,
@@ -733,6 +752,7 @@ static struct md_cluster_operations cluster_ops = {
 	.metadata_update_start = metadata_update_start,
 	.metadata_update_finish = metadata_update_finish,
 	.metadata_update_cancel = metadata_update_cancel,
+	.area_resyncing = area_resyncing,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 054f9eafa065..03785402afaa 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -17,6 +17,7 @@ struct md_cluster_operations {
 	int (*metadata_update_start)(struct mddev *mddev);
 	int (*metadata_update_finish)(struct mddev *mddev);
 	int (*metadata_update_cancel)(struct mddev *mddev);
+	int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a1af24d369fc..fe0484648de4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -68,6 +68,7 @@ static LIST_HEAD(pers_list);
 static DEFINE_SPINLOCK(pers_lock);
 
 struct md_cluster_operations *md_cluster_ops;
+EXPORT_SYMBOL(md_cluster_ops);
 struct module *md_cluster_mod;
 EXPORT_SYMBOL(md_cluster_mod);
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4153da5d4011..3aa58ab5b1fd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1101,8 +1101,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 	md_write_start(mddev, bio); /* wait on superblock update early */
 
 	if (bio_data_dir(bio) == WRITE &&
-	    bio_end_sector(bio) > mddev->suspend_lo &&
-	    bio->bi_iter.bi_sector < mddev->suspend_hi) {
+	    ((bio_end_sector(bio) > mddev->suspend_lo &&
+	    bio->bi_iter.bi_sector < mddev->suspend_hi) ||
+	    (mddev_is_clustered(mddev) &&
+	     md_cluster_ops->area_resyncing(mddev, bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
 		/* As the suspend_* range is controlled by
 		 * userspace, we want an interruptible
 		 * wait.
@@ -1113,7 +1115,10 @@ static void make_request(struct mddev *mddev, struct bio * bio)
 			prepare_to_wait(&conf->wait_barrier,
 					&w, TASK_INTERRUPTIBLE);
 			if (bio_end_sector(bio) <= mddev->suspend_lo ||
-			    bio->bi_iter.bi_sector >= mddev->suspend_hi)
+			    bio->bi_iter.bi_sector >= mddev->suspend_hi ||
+			    (mddev_is_clustered(mddev) &&
+			     !md_cluster_ops->area_resyncing(mddev,
+				     bio->bi_iter.bi_sector, bio_end_sector(bio))))
 				break;
 			schedule();
 		}
-- 
cgit v1.2.1


From 7d49ffcfa3cc08aa2301bf3fdb1e423a3fd33ee7 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Tue, 12 Aug 2014 10:13:19 -0500
Subject: Read from the first device when an area is resyncing

Set choose_first to true for clustered reads in read_balance() when the
area is resyncing.

Signed-off-by: Lidong Zhong <lzhong@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/raid1.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 3aa58ab5b1fd..f70d74189d16 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -539,7 +539,13 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
 	has_nonrot_disk = 0;
 	choose_next_idle = 0;
 
-	choose_first = (conf->mddev->recovery_cp < this_sector + sectors);
+	if ((conf->mddev->recovery_cp < this_sector + sectors) ||
+	    (mddev_is_clustered(conf->mddev) &&
+	    md_cluster_ops->area_resyncing(conf->mddev, this_sector,
+		    this_sector + sectors)))
+		choose_first = 1;
+	else
+		choose_first = 0;
 
 	for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
 		sector_t dist;
-- 
cgit v1.2.1


From 1aee41f637694d4bbf91c24195f2b63e3f6badd2 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.com>
Date: Wed, 29 Oct 2014 18:51:31 -0500
Subject: Add new disk to clustered array

Algorithm:
1. Node 1 issues mdadm --manage /dev/mdX --add /dev/sdYY which issues
   ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CLUSTER_ADD)
2. Node 1 sends NEWDISK with uuid and slot number
3. Other nodes issue kobject_uevent_env with uuid and slot number
(Steps 4,5 could be a udev rule)
4. In userspace, the node searches for the disk, perhaps
   using blkid -t SUB_UUID=""
5. Other nodes issue either of the following depending on whether the disk
   was found:
   ioctl(ADD_NEW_DISK with disc.state set to MD_DISK_CANDIDATE and
	 disc.number set to slot number)
   ioctl(CLUSTERED_DISK_NACK)
6. Other nodes drop lock on no-new-devs (CR) if device is found
7. Node 1 attempts EX lock on no-new-devs
8. If node 1 gets the lock, it sends METADATA_UPDATED after unmarking the disk
   as SpareLocal
9. If it fails to get the no-new-devs lock, it fails the operation and sends METADATA_UPDATED
10. Other nodes understand if the device is added or not by reading the superblock again after receiving the METADATA_UPDATED message.

Signed-off-by: Lidong Zhong <lzhong@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 drivers/md/md-cluster.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/md-cluster.h |   4 ++
 drivers/md/md.c         |  52 ++++++++++++++++++++++--
 drivers/md/md.h         |   5 +++
 drivers/md/raid1.c      |   1 +
 5 files changed, 162 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index d85a6ca4443e..03e521a9ca7d 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -12,11 +12,13 @@
 #include <linux/module.h>
 #include <linux/dlm.h>
 #include <linux/sched.h>
+#include <linux/raid/md_p.h>
 #include "md.h"
 #include "bitmap.h"
 #include "md-cluster.h"
 
 #define LVB_SIZE	64
+#define NEW_DEV_TIMEOUT 5000
 
 struct dlm_lock_resource {
 	dlm_lockspace_t *ls;
@@ -56,19 +58,25 @@ struct md_cluster_info {
 	struct dlm_lock_resource *ack_lockres;
 	struct dlm_lock_resource *message_lockres;
 	struct dlm_lock_resource *token_lockres;
+	struct dlm_lock_resource *no_new_dev_lockres;
 	struct md_thread *recv_thread;
+	struct completion newdisk_completion;
 };
 
 enum msg_type {
 	METADATA_UPDATED = 0,
 	RESYNCING,
+	NEWDISK,
 };
 
 struct cluster_msg {
 	int type;
 	int slot;
+	/* TODO: Unionize this for smaller footprint */
 	sector_t low;
 	sector_t high;
+	char uuid[16];
+	int raid_slot;
 };
 
 static void sync_ast(void *arg)
@@ -358,13 +366,41 @@ static void process_suspend_info(struct md_cluster_info *cinfo,
 	spin_unlock_irq(&cinfo->suspend_lock);
 }
 
+static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
+{
+	char disk_uuid[64];
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	char event_name[] = "EVENT=ADD_DEVICE";
+	char raid_slot[16];
+	char *envp[] = {event_name, disk_uuid, raid_slot, NULL};
+	int len;
+
+	len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
+	pretty_uuid(disk_uuid + len, cmsg->uuid);
+	snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
+	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
+	init_completion(&cinfo->newdisk_completion);
+	kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
+	wait_for_completion_timeout(&cinfo->newdisk_completion,
+			NEW_DEV_TIMEOUT);
+}
+
+
+static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	md_reload_sb(mddev);
+	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+}
+
 static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 {
 	switch (msg->type) {
 	case METADATA_UPDATED:
 		pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
 			__func__, __LINE__, msg->slot);
-		md_reload_sb(mddev);
+		process_metadata_update(mddev, msg);
 		break;
 	case RESYNCING:
 		pr_info("%s: %d Received message: RESYNCING from %d\n",
@@ -372,6 +408,10 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 		process_suspend_info(mddev->cluster_info, msg->slot,
 				msg->low, msg->high);
 		break;
+	case NEWDISK:
+		pr_info("%s: %d Received message: NEWDISK from %d\n",
+			__func__, __LINE__, msg->slot);
+		process_add_new_disk(mddev, msg);
 	};
 }
 
@@ -593,10 +633,18 @@ static int join(struct mddev *mddev, int nodes)
 	cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0);
 	if (!cinfo->ack_lockres)
 		goto err;
+	cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0);
+	if (!cinfo->no_new_dev_lockres)
+		goto err;
+
 	/* get sync CR lock on ACK. */
 	if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR))
 		pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n",
 				ret);
+	/* get sync CR lock on no-new-dev. */
+	if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR))
+		pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);
+
 
 	pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number);
 	snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1);
@@ -621,6 +669,7 @@ err:
 	lockres_free(cinfo->message_lockres);
 	lockres_free(cinfo->token_lockres);
 	lockres_free(cinfo->ack_lockres);
+	lockres_free(cinfo->no_new_dev_lockres);
 	lockres_free(cinfo->bitmap_lockres);
 	lockres_free(cinfo->sb_lock);
 	if (cinfo->lockspace)
@@ -642,6 +691,7 @@ static int leave(struct mddev *mddev)
 	lockres_free(cinfo->message_lockres);
 	lockres_free(cinfo->token_lockres);
 	lockres_free(cinfo->ack_lockres);
+	lockres_free(cinfo->no_new_dev_lockres);
 	lockres_free(cinfo->sb_lock);
 	lockres_free(cinfo->bitmap_lockres);
 	dlm_release_lockspace(cinfo->lockspace, 2);
@@ -742,6 +792,55 @@ out:
 	return ret;
 }
 
+static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg;
+	int ret = 0;
+	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
+	char *uuid = sb->device_uuid;
+
+	memset(&cmsg, 0, sizeof(cmsg));
+	cmsg.type = cpu_to_le32(NEWDISK);
+	memcpy(cmsg.uuid, uuid, 16);
+	cmsg.raid_slot = rdev->desc_nr;
+	lock_comm(cinfo);
+	ret = __sendmsg(cinfo, &cmsg);
+	if (ret)
+		return ret;
+	cinfo->no_new_dev_lockres->flags |= DLM_LKF_NOQUEUE;
+	ret = dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_EX);
+	cinfo->no_new_dev_lockres->flags &= ~DLM_LKF_NOQUEUE;
+	/* Some node does not "see" the device */
+	if (ret == -EAGAIN)
+		ret = -ENOENT;
+	else
+		dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
+	return ret;
+}
+
+static int add_new_disk_finish(struct mddev *mddev)
+{
+	struct cluster_msg cmsg;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	int ret;
+	/* Write sb and inform others */
+	md_update_sb(mddev, 1);
+	cmsg.type = METADATA_UPDATED;
+	ret = __sendmsg(cinfo, &cmsg);
+	unlock_comm(cinfo);
+	return ret;
+}
+
+static void new_disk_ack(struct mddev *mddev, bool ack)
+{
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	if (ack)
+		dlm_unlock_sync(cinfo->no_new_dev_lockres);
+	complete(&cinfo->newdisk_completion);
+}
+
 static struct md_cluster_operations cluster_ops = {
 	.join   = join,
 	.leave  = leave,
@@ -753,6 +852,9 @@ static struct md_cluster_operations cluster_ops = {
 	.metadata_update_finish = metadata_update_finish,
 	.metadata_update_cancel = metadata_update_cancel,
 	.area_resyncing = area_resyncing,
+	.add_new_disk_start = add_new_disk_start,
+	.add_new_disk_finish = add_new_disk_finish,
+	.new_disk_ack = new_disk_ack,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 03785402afaa..60d7e58964f5 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -6,6 +6,7 @@
 #include "md.h"
 
 struct mddev;
+struct md_rdev;
 
 struct md_cluster_operations {
 	int (*join)(struct mddev *mddev, int nodes);
@@ -18,6 +19,9 @@ struct md_cluster_operations {
 	int (*metadata_update_finish)(struct mddev *mddev);
 	int (*metadata_update_cancel)(struct mddev *mddev);
 	int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
+	int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
+	int (*add_new_disk_finish)(struct mddev *mddev);
+	void (*new_disk_ack)(struct mddev *mddev, bool ack);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fe0484648de4..5703c2e89f3a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2210,7 +2210,7 @@ static void sync_sbs(struct mddev *mddev, int nospares)
 	}
 }
 
-static void md_update_sb(struct mddev *mddev, int force_change)
+void md_update_sb(struct mddev *mddev, int force_change)
 {
 	struct md_rdev *rdev;
 	int sync_req;
@@ -2371,6 +2371,7 @@ repeat:
 		wake_up(&rdev->blocked_wait);
 	}
 }
+EXPORT_SYMBOL(md_update_sb);
 
 /* words written to sysfs files may, or may not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
@@ -3151,7 +3152,7 @@ static void analyze_sbs(struct mddev *mddev)
 			kick_rdev_from_array(rdev);
 			continue;
 		}
-		if (rdev != freshest)
+		if (rdev != freshest) {
 			if (super_types[mddev->major_version].
 			    validate_super(mddev, rdev)) {
 				printk(KERN_WARNING "md: kicking non-fresh %s"
@@ -3160,6 +3161,15 @@ static void analyze_sbs(struct mddev *mddev)
 				kick_rdev_from_array(rdev);
 				continue;
 			}
+			/* No device should have a Candidate flag
+			 * when reading devices
+			 */
+			if (test_bit(Candidate, &rdev->flags)) {
+				pr_info("md: kicking Cluster Candidate %s from array!\n",
+					bdevname(rdev->bdev, b));
+				kick_rdev_from_array(rdev);
+			}
+		}
 		if (mddev->level == LEVEL_MULTIPATH) {
 			rdev->desc_nr = i++;
 			rdev->raid_disk = rdev->desc_nr;
@@ -5655,7 +5665,6 @@ static int get_array_info(struct mddev *mddev, void __user *arg)
 		info.state |= (1<<MD_SB_BITMAP_PRESENT);
 	if (mddev_is_clustered(mddev))
 		info.state |= (1<<MD_SB_CLUSTERED);
-
 	info.active_disks  = insync;
 	info.working_disks = working;
 	info.failed_disks  = failed;
@@ -5744,6 +5753,13 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 	struct md_rdev *rdev;
 	dev_t dev = MKDEV(info->major,info->minor);
 
+	if (mddev_is_clustered(mddev) &&
+		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
+		pr_err("%s: Cannot add to clustered mddev. Try --cluster-add\n",
+			       mdname(mddev));
+		return -EINVAL;
+	}
+
 	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
 		return -EOVERFLOW;
 
@@ -5830,6 +5846,25 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		else
 			clear_bit(WriteMostly, &rdev->flags);
 
+		/*
+		 * check whether the device shows up in other nodes
+		 */
+		if (mddev_is_clustered(mddev)) {
+			if (info->state & (1 << MD_DISK_CANDIDATE)) {
+				/* Through --cluster-confirm */
+				set_bit(Candidate, &rdev->flags);
+				md_cluster_ops->new_disk_ack(mddev, true);
+			} else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
+				/* --add initiated by this node */
+				err = md_cluster_ops->add_new_disk_start(mddev, rdev);
+				if (err) {
+					md_cluster_ops->add_new_disk_finish(mddev);
+					export_rdev(rdev);
+					return err;
+				}
+			}
+		}
+
 		rdev->raid_disk = -1;
 		err = bind_rdev_to_array(rdev, mddev);
 		if (!err && !mddev->pers->hot_remove_disk) {
@@ -5855,6 +5890,9 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 		if (!err)
 			md_new_event(mddev);
 		md_wakeup_thread(mddev->thread);
+		if (mddev_is_clustered(mddev) &&
+				(info->state & (1 << MD_DISK_CLUSTER_ADD)))
+			md_cluster_ops->add_new_disk_finish(mddev);
 		return err;
 	}
 
@@ -6456,6 +6494,7 @@ static inline bool md_ioctl_valid(unsigned int cmd)
 	case SET_DISK_FAULTY:
 	case STOP_ARRAY:
 	case STOP_ARRAY_RO:
+	case CLUSTERED_DISK_NACK:
 		return true;
 	default:
 		return false;
@@ -6728,6 +6767,13 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 		goto unlock;
 	}
 
+	case CLUSTERED_DISK_NACK:
+		if (mddev_is_clustered(mddev))
+			md_cluster_ops->new_disk_ack(mddev, false);
+		else
+			err = -EINVAL;
+		goto unlock;
+
 	case HOT_ADD_DISK:
 		err = hot_add_disk(mddev, new_decode_dev(arg));
 		goto unlock;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index bfebcfdf54e6..6dc0ce09f50c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -171,6 +171,10 @@ enum flag_bits {
 				 * a want_replacement device with same
 				 * raid_disk number.
 				 */
+	Candidate,		/* For clustered environments only:
+				 * This device is seen locally but not
+				 * by the whole cluster
+				 */
 };
 
 #define BB_LEN_MASK	(0x00000000000001FFULL)
@@ -666,6 +670,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 
 extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
 extern void md_reload_sb(struct mddev *mddev);
+extern void md_update_sb(struct mddev *mddev, int force);
 static inline int mddev_check_plugged(struct mddev *mddev)
 {
 	return !!blk_check_plugged(md_unplug, mddev,
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index f70d74189d16..53ed5d48308f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1571,6 +1571,7 @@ static int raid1_spare_active(struct mddev *mddev)
 		struct md_rdev *rdev = conf->mirrors[i].rdev;
 		struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
 		if (repl
+		    && !test_bit(Candidate, &repl->flags)
 		    && repl->recovery_offset == MaxSector
 		    && !test_bit(Faulty, &repl->flags)
 		    && !test_and_set_bit(In_sync, &repl->flags)) {
-- 
cgit v1.2.1


From ba599aca520d6005138d1e5edb125fb83a130141 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Feb 2015 11:44:11 +1100
Subject: md: fix error paths from bitmap_create.

Recent change to bitmap_create mishandles errors.
In particular a failure doesn't always cause 'err' to be set.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5703c2e89f3a..ae3432e57ccb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6118,7 +6118,8 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
 			if (!IS_ERR(bitmap)) {
 				mddev->bitmap = bitmap;
 				err = bitmap_load(mddev);
-			}
+			} else
+				err = PTR_ERR(bitmap);
 		}
 		if (fd < 0 || err) {
 			bitmap_destroy(mddev);
@@ -6408,7 +6409,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 			if (!IS_ERR(bitmap)) {
 				mddev->bitmap = bitmap;
 				rv = bitmap_load(mddev);
-			}
+			} else
+				rv = PTR_ERR(bitmap);
 			if (rv)
 				bitmap_destroy(mddev);
 			mddev->pers->quiesce(mddev, 0);
-- 
cgit v1.2.1


From 935f3d4fc62c1f4d99cd13344762e766cd3bf115 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 2 Mar 2015 17:02:29 +1100
Subject: md/bitmap: fix incorrect DIV_ROUND_UP usage.

DIV_ROUND_UP doesn't work on "long long", and it should be
sector_t anyway.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index dd8c78043eab..03e0752af99f 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -571,11 +571,11 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 re_read:
 	/* If cluster_slot is set, the cluster is setup */
 	if (bitmap->cluster_slot >= 0) {
-		long long bm_blocks;
+		sector_t bm_blocks;
 
 		bm_blocks = bitmap->mddev->resync_max_sectors / (bitmap->mddev->bitmap_info.chunksize >> 9);
 		bm_blocks = bm_blocks << 3;
-		bm_blocks = DIV_ROUND_UP(bm_blocks, 4096);
+		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
 		bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
 		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
 			bitmap->cluster_slot, (unsigned long long)bitmap->mddev->bitmap_info.offset);
-- 
cgit v1.2.1


From 3b0e6aacbfe04fa144c4732f269b09ce91177566 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 3 Mar 2015 13:35:31 +1100
Subject: md/bitmap: use sector_div for sector_t divisions

neilb: modified to not corrupt ->resync_max_sectors.

sector_div usage fixed by Guoqing Jiang <gqjiang@suse.com>

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 03e0752af99f..ac79fef68143 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -571,9 +571,10 @@ static int bitmap_read_sb(struct bitmap *bitmap)
 re_read:
 	/* If cluster_slot is set, the cluster is setup */
 	if (bitmap->cluster_slot >= 0) {
-		sector_t bm_blocks;
+		sector_t bm_blocks = bitmap->mddev->resync_max_sectors;
 
-		bm_blocks = bitmap->mddev->resync_max_sectors / (bitmap->mddev->bitmap_info.chunksize >> 9);
+		sector_div(bm_blocks,
+			   bitmap->mddev->bitmap_info.chunksize >> 9);
 		bm_blocks = bm_blocks << 3;
 		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
 		bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
-- 
cgit v1.2.1


From fa8259da0e10b189e41ee60907ec2a499bb66019 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
Date: Mon, 2 Mar 2015 10:55:49 -0600
Subject: md: Fix stray --cluster-confirm crash

A --cluster-confirm without an --add (by another node) can
crash the kernel.

Fix it by guarding the confirmation path with a state flag.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md-cluster.c | 15 ++++++++++++++-
 drivers/md/md-cluster.h |  2 +-
 drivers/md/md.c         |  8 ++++++--
 3 files changed, 21 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 03e521a9ca7d..96679b22cfc0 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -42,6 +42,10 @@ struct resync_info {
 	__le64 hi;
 };
 
+/* md_cluster_info flags */
+#define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
+
+
 struct md_cluster_info {
 	/* dlm lock space and resources for clustered raid. */
 	dlm_lockspace_t *lockspace;
@@ -61,6 +65,7 @@ struct md_cluster_info {
 	struct dlm_lock_resource *no_new_dev_lockres;
 	struct md_thread *recv_thread;
 	struct completion newdisk_completion;
+	unsigned long state;
 };
 
 enum msg_type {
@@ -380,9 +385,11 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 	snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
 	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
 	init_completion(&cinfo->newdisk_completion);
+	set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
 	kobject_uevent_env(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE, envp);
 	wait_for_completion_timeout(&cinfo->newdisk_completion,
 			NEW_DEV_TIMEOUT);
+	clear_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
 }
 
 
@@ -832,13 +839,19 @@ static int add_new_disk_finish(struct mddev *mddev)
 	return ret;
 }
 
-static void new_disk_ack(struct mddev *mddev, bool ack)
+static int new_disk_ack(struct mddev *mddev, bool ack)
 {
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 
+	if (!test_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state)) {
+		pr_warn("md-cluster(%s): Spurious cluster confirmation\n", mdname(mddev));
+		return -EINVAL;
+	}
+
 	if (ack)
 		dlm_unlock_sync(cinfo->no_new_dev_lockres);
 	complete(&cinfo->newdisk_completion);
+	return 0;
 }
 
 static struct md_cluster_operations cluster_ops = {
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 60d7e58964f5..7417133c4295 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -21,7 +21,7 @@ struct md_cluster_operations {
 	int (*area_resyncing)(struct mddev *mddev, sector_t lo, sector_t hi);
 	int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
 	int (*add_new_disk_finish)(struct mddev *mddev);
-	void (*new_disk_ack)(struct mddev *mddev, bool ack);
+	int (*new_disk_ack)(struct mddev *mddev, bool ack);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ae3432e57ccb..eb6f92e57ab6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5755,7 +5755,7 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 
 	if (mddev_is_clustered(mddev) &&
 		!(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
-		pr_err("%s: Cannot add to clustered mddev. Try --cluster-add\n",
+		pr_err("%s: Cannot add to clustered mddev.\n",
 			       mdname(mddev));
 		return -EINVAL;
 	}
@@ -5853,7 +5853,11 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 			if (info->state & (1 << MD_DISK_CANDIDATE)) {
 				/* Through --cluster-confirm */
 				set_bit(Candidate, &rdev->flags);
-				md_cluster_ops->new_disk_ack(mddev, true);
+				err = md_cluster_ops->new_disk_ack(mddev, true);
+				if (err) {
+					export_rdev(rdev);
+					return err;
+				}
 			} else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
 				/* --add initiated by this node */
 				err = md_cluster_ops->add_new_disk_start(mddev, rdev);
-- 
cgit v1.2.1


From 6dc69c9c460b0cf05b5b3f323a8b944a2e52e76d Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Sat, 28 Feb 2015 07:04:37 +0800
Subject: md: recover_bitmaps() can be static

drivers/md/md-cluster.c:190:6: sparse: symbol 'recover_bitmaps' was not declared. Should it be static?

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md-cluster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 96679b22cfc0..5062bd1929be 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -217,7 +217,7 @@ out:
 	return s;
 }
 
-void recover_bitmaps(struct md_thread *thread)
+static void recover_bitmaps(struct md_thread *thread)
 {
 	struct mddev *mddev = thread->mddev;
 	struct md_cluster_info *cinfo = mddev->cluster_info;
-- 
cgit v1.2.1


From 09dd1af2e011a13adce65b74425dfe31e1985e64 Mon Sep 17 00:00:00 2001
From: kbuild test robot <fengguang.wu@intel.com>
Date: Sat, 28 Feb 2015 09:16:08 +0800
Subject: md/cluster: Communication Framework: fix semicolon.cocci warnings

drivers/md/md-cluster.c:328:2-3: Unneeded semicolon

 Removes unneeded semicolon.

Generated by: scripts/coccinelle/misc/semicolon.cocci

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md-cluster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 5062bd1929be..ae8bb547f94d 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -419,7 +419,7 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 		pr_info("%s: %d Received message: NEWDISK from %d\n",
 			__func__, __LINE__, msg->slot);
 		process_add_new_disk(mddev, msg);
-	};
+	}
 }
 
 /*
-- 
cgit v1.2.1


From 124eb761edfdee13c02e48815b05d9bed7666d4c Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
Date: Tue, 24 Mar 2015 11:29:05 -0500
Subject: md: Fix bitmap offset calculations

The calculation of the bitmap offset is incorrect with respect to the
bits-to-bytes conversion.

Also, remove an irrelevant duplicate message.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index ac79fef68143..e98db04eb4f9 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -575,7 +575,9 @@ re_read:
 
 		sector_div(bm_blocks,
 			   bitmap->mddev->bitmap_info.chunksize >> 9);
-		bm_blocks = bm_blocks << 3;
+		/* bits to bytes */
+		bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
+		/* to 4k blocks */
 		bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
 		bitmap->mddev->bitmap_info.offset += bitmap->cluster_slot * (bm_blocks << 3);
 		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
@@ -672,9 +674,6 @@ out:
 			goto out_no_sb;
 		}
 		bitmap->cluster_slot = md_cluster_ops->slot_number(bitmap->mddev);
-		pr_info("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
-			bitmap->cluster_slot,
-			(unsigned long long)bitmap->mddev->bitmap_info.offset);
 		goto re_read;
 	}
 
-- 
cgit v1.2.1


From ac1f9ef211d5dd70110fa4bec6e8866b2c3a965e Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 12 Feb 2015 15:20:35 -0500
Subject: dm log userspace: split flush_entry_pool to be per dirty-log

Use a single slab cache to allocate a mempool for each dirty-log.
This _should_ eliminate DM's need for io_schedule_timeout() in
mempool_alloc(); so io_schedule() should be sufficient now.

Also, rename struct flush_entry to dm_dirty_log_flush_entry to allow
KMEM_CACHE() to create a meaningful global name for the slab cache.

Also, eliminate some holes in struct log_c by rearranging members.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Heinz Mauelshagen <heinzm@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 84 ++++++++++++++++++++------------------
 1 file changed, 45 insertions(+), 39 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 03177ca0b009..39fa00733431 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -17,7 +17,9 @@
 
 #define DM_LOG_USERSPACE_VSN "1.3.0"
 
-struct flush_entry {
+#define FLUSH_ENTRY_POOL_SIZE 16
+
+struct dm_dirty_log_flush_entry {
 	int type;
 	region_t region;
 	struct list_head list;
@@ -34,22 +36,14 @@ struct flush_entry {
 struct log_c {
 	struct dm_target *ti;
 	struct dm_dev *log_dev;
-	uint32_t region_size;
-	region_t region_count;
-	uint64_t luid;
-	char uuid[DM_UUID_LEN];
 
 	char *usr_argv_str;
 	uint32_t usr_argc;
 
-	/*
-	 * in_sync_hint gets set when doing is_remote_recovering.  It
-	 * represents the first region that needs recovery.  IOW, the
-	 * first zero bit of sync_bits.  This can be useful for to limit
-	 * traffic for calls like is_remote_recovering and get_resync_work,
-	 * but be take care in its use for anything else.
-	 */
-	uint64_t in_sync_hint;
+	uint32_t region_size;
+	region_t region_count;
+	uint64_t luid;
+	char uuid[DM_UUID_LEN];
 
 	/*
 	 * Mark and clear requests are held until a flush is issued
@@ -61,6 +55,15 @@ struct log_c {
 	struct list_head mark_list;
 	struct list_head clear_list;
 
+	/*
+	 * in_sync_hint gets set when doing is_remote_recovering.  It
+	 * represents the first region that needs recovery.  IOW, the
+	 * first zero bit of sync_bits.  This can be useful for to limit
+	 * traffic for calls like is_remote_recovering and get_resync_work,
+	 * but be take care in its use for anything else.
+	 */
+	uint64_t in_sync_hint;
+
 	/*
 	 * Workqueue for flush of clear region requests.
 	 */
@@ -72,19 +75,11 @@ struct log_c {
 	 * Combine userspace flush and mark requests for efficiency.
 	 */
 	uint32_t integrated_flush;
-};
-
-static mempool_t *flush_entry_pool;
 
-static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
-{
-	return kmalloc(sizeof(struct flush_entry), gfp_mask);
-}
+	mempool_t *flush_entry_pool;
+};
 
-static void flush_entry_free(void *element, void *pool_data)
-{
-	kfree(element);
-}
+static struct kmem_cache *_flush_entry_cache;
 
 static int userspace_do_request(struct log_c *lc, const char *uuid,
 				int request_type, char *data, size_t data_size,
@@ -254,6 +249,14 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 		goto out;
 	}
 
+	lc->flush_entry_pool = mempool_create_slab_pool(FLUSH_ENTRY_POOL_SIZE,
+							_flush_entry_cache);
+	if (!lc->flush_entry_pool) {
+		DMERR("Failed to create flush_entry_pool");
+		r = -ENOMEM;
+		goto out;
+	}
+
 	/*
 	 * Send table string and get back any opened device.
 	 */
@@ -310,6 +313,8 @@ static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
 out:
 	kfree(devices_rdata);
 	if (r) {
+		if (lc->flush_entry_pool)
+			mempool_destroy(lc->flush_entry_pool);
 		kfree(lc);
 		kfree(ctr_str);
 	} else {
@@ -338,6 +343,8 @@ static void userspace_dtr(struct dm_dirty_log *log)
 	if (lc->log_dev)
 		dm_put_device(lc->ti, lc->log_dev);
 
+	mempool_destroy(lc->flush_entry_pool);
+
 	kfree(lc->usr_argv_str);
 	kfree(lc);
 
@@ -461,7 +468,7 @@ static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
 static int flush_one_by_one(struct log_c *lc, struct list_head *flush_list)
 {
 	int r = 0;
-	struct flush_entry *fe;
+	struct dm_dirty_log_flush_entry *fe;
 
 	list_for_each_entry(fe, flush_list, list) {
 		r = userspace_do_request(lc, lc->uuid, fe->type,
@@ -481,7 +488,7 @@ static int flush_by_group(struct log_c *lc, struct list_head *flush_list,
 	int r = 0;
 	int count;
 	uint32_t type = 0;
-	struct flush_entry *fe, *tmp_fe;
+	struct dm_dirty_log_flush_entry *fe, *tmp_fe;
 	LIST_HEAD(tmp_list);
 	uint64_t group[MAX_FLUSH_GROUP_COUNT];
 
@@ -563,7 +570,8 @@ static int userspace_flush(struct dm_dirty_log *log)
 	LIST_HEAD(clear_list);
 	int mark_list_is_empty;
 	int clear_list_is_empty;
-	struct flush_entry *fe, *tmp_fe;
+	struct dm_dirty_log_flush_entry *fe, *tmp_fe;
+	mempool_t *flush_entry_pool = lc->flush_entry_pool;
 
 	spin_lock_irqsave(&lc->flush_lock, flags);
 	list_splice_init(&lc->mark_list, &mark_list);
@@ -643,10 +651,10 @@ static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
 {
 	unsigned long flags;
 	struct log_c *lc = log->context;
-	struct flush_entry *fe;
+	struct dm_dirty_log_flush_entry *fe;
 
 	/* Wait for an allocation, but _never_ fail */
-	fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
+	fe = mempool_alloc(lc->flush_entry_pool, GFP_NOIO);
 	BUG_ON(!fe);
 
 	spin_lock_irqsave(&lc->flush_lock, flags);
@@ -672,7 +680,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
 {
 	unsigned long flags;
 	struct log_c *lc = log->context;
-	struct flush_entry *fe;
+	struct dm_dirty_log_flush_entry *fe;
 
 	/*
 	 * If we fail to allocate, we skip the clearing of
@@ -680,7 +688,7 @@ static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
 	 * to cause the region to be resync'ed when the
 	 * device is activated next time.
 	 */
-	fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
+	fe = mempool_alloc(lc->flush_entry_pool, GFP_ATOMIC);
 	if (!fe) {
 		DMERR("Failed to allocate memory to clear region.");
 		return;
@@ -886,18 +894,16 @@ static int __init userspace_dirty_log_init(void)
 {
 	int r = 0;
 
-	flush_entry_pool = mempool_create(100, flush_entry_alloc,
-					  flush_entry_free, NULL);
-
-	if (!flush_entry_pool) {
-		DMWARN("Unable to create flush_entry_pool:  No memory.");
+	_flush_entry_cache = KMEM_CACHE(dm_dirty_log_flush_entry, 0);
+	if (!_flush_entry_cache) {
+		DMWARN("Unable to create flush_entry_cache: No memory.");
 		return -ENOMEM;
 	}
 
 	r = dm_ulog_tfr_init();
 	if (r) {
 		DMWARN("Unable to initialize userspace log communications");
-		mempool_destroy(flush_entry_pool);
+		kmem_cache_destroy(_flush_entry_cache);
 		return r;
 	}
 
@@ -905,7 +911,7 @@ static int __init userspace_dirty_log_init(void)
 	if (r) {
 		DMWARN("Couldn't register userspace dirty log type");
 		dm_ulog_tfr_exit();
-		mempool_destroy(flush_entry_pool);
+		kmem_cache_destroy(_flush_entry_cache);
 		return r;
 	}
 
@@ -917,7 +923,7 @@ static void __exit userspace_dirty_log_exit(void)
 {
 	dm_dirty_log_type_unregister(&_userspace_type);
 	dm_ulog_tfr_exit();
-	mempool_destroy(flush_entry_pool);
+	kmem_cache_destroy(_flush_entry_cache);
 
 	DMINFO("version " DM_LOG_USERSPACE_VSN " unloaded");
 	return;
-- 
cgit v1.2.1


From 75da39bf256c27e25f395b191ead79f323772672 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 20 Feb 2015 12:58:03 +0000
Subject: dm cache policy mq: keep track of the number of entries in a
 multiqueue

Small optimisation, now queue_empty() doesn't need to walk all levels of
the multiqueue.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c | 25 ++++++++++++-------------
 1 file changed, 12 insertions(+), 13 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 13f547a4eeb6..ca05d69191e8 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -126,6 +126,7 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
 #define NR_QUEUE_LEVELS 16u
 
 struct queue {
+	unsigned nr_elts;
 	struct list_head qs[NR_QUEUE_LEVELS];
 };
 
@@ -133,23 +134,14 @@ static void queue_init(struct queue *q)
 {
 	unsigned i;
 
+	q->nr_elts = 0;
 	for (i = 0; i < NR_QUEUE_LEVELS; i++)
 		INIT_LIST_HEAD(q->qs + i);
 }
 
-/*
- * Checks to see if the queue is empty.
- * FIXME: reduce cpu usage.
- */
 static bool queue_empty(struct queue *q)
 {
-	unsigned i;
-
-	for (i = 0; i < NR_QUEUE_LEVELS; i++)
-		if (!list_empty(q->qs + i))
-			return false;
-
-	return true;
+	return q->nr_elts == 0;
 }
 
 /*
@@ -157,11 +149,13 @@ static bool queue_empty(struct queue *q)
  */
 static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
 {
+	q->nr_elts++;
 	list_add_tail(elt, q->qs + level);
 }
 
-static void queue_remove(struct list_head *elt)
+static void queue_remove(struct queue *q, struct list_head *elt)
 {
+	q->nr_elts--;
 	list_del(elt);
 }
 
@@ -197,6 +191,7 @@ static struct list_head *queue_pop(struct queue *q)
 	struct list_head *r = queue_peek(q);
 
 	if (r) {
+		q->nr_elts--;
 		list_del(r);
 
 		/* have we just emptied the bottom level? */
@@ -496,7 +491,11 @@ static void push(struct mq_policy *mq, struct entry *e)
  */
 static void del(struct mq_policy *mq, struct entry *e)
 {
-	queue_remove(&e->list);
+	if (in_cache(mq, e))
+		queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
+	else
+		queue_remove(&mq->pre_cache, &e->list);
+
 	hash_remove(e);
 }
 
-- 
cgit v1.2.1


From c74ffc5c63b0b2753bedd49bdc1196d570f66803 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 20 Feb 2015 13:01:22 +0000
Subject: dm cache policy mq: remove queue_shift_down()

queue_shift_down() didn't adjust the hit_counts to the new levels, so it
just had the effect of scrambling levels.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c | 16 ----------------
 1 file changed, 16 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index ca05d69191e8..3c86b5efe78f 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -159,18 +159,6 @@ static void queue_remove(struct queue *q, struct list_head *elt)
 	list_del(elt);
 }
 
-/*
- * Shifts all regions down one level.  This has no effect on the order of
- * the queue.
- */
-static void queue_shift_down(struct queue *q)
-{
-	unsigned level;
-
-	for (level = 1; level < NR_QUEUE_LEVELS; level++)
-		list_splice_init(q->qs + level, q->qs + level - 1);
-}
-
 /*
  * Gives us the oldest entry of the lowest popoulated level.  If the first
  * level is emptied then we shift down one level.
@@ -193,10 +181,6 @@ static struct list_head *queue_pop(struct queue *q)
 	if (r) {
 		q->nr_elts--;
 		list_del(r);
-
-		/* have we just emptied the bottom level? */
-		if (list_empty(q->qs))
-			queue_shift_down(q);
 	}
 
 	return r;
-- 
cgit v1.2.1


From 3e45c91e5cdd0cfd3cc1228628602c8e7e587157 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 20 Feb 2015 13:49:45 +0000
Subject: dm cache policy mq: track entries hit this 'tick' via sentinel
 objects

A sentinel object is placed on each level of the multiqueues.  When an
object is hit it is requeued behind the sentinel.  When the tick is
incremented we iterate through all objects behind the sentinel and
update the hit_count, then reposition the sentinel at the very back.

This saves memory by avoiding tracking the tick explicitly for every
struct entry object in the multiqueues.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c | 117 ++++++++++++++++++++++++++++------------
 1 file changed, 82 insertions(+), 35 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 3c86b5efe78f..97b14309df90 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -124,10 +124,12 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
  * sorted queue.
  */
 #define NR_QUEUE_LEVELS 16u
+#define NR_SENTINELS NR_QUEUE_LEVELS * 3
 
 struct queue {
 	unsigned nr_elts;
 	struct list_head qs[NR_QUEUE_LEVELS];
+	struct list_head sentinels[NR_SENTINELS];
 };
 
 static void queue_init(struct queue *q)
@@ -135,8 +137,10 @@ static void queue_init(struct queue *q)
 	unsigned i;
 
 	q->nr_elts = 0;
-	for (i = 0; i < NR_QUEUE_LEVELS; i++)
+	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
 		INIT_LIST_HEAD(q->qs + i);
+		INIT_LIST_HEAD(q->sentinels + i);
+	}
 }
 
 static bool queue_empty(struct queue *q)
@@ -159,6 +163,11 @@ static void queue_remove(struct queue *q, struct list_head *elt)
 	list_del(elt);
 }
 
+static bool is_sentinel(struct queue *q, struct list_head *h)
+{
+	return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
+}
+
 /*
  * Gives us the oldest entry of the lowest popoulated level.  If the first
  * level is emptied then we shift down one level.
@@ -166,10 +175,12 @@ static void queue_remove(struct queue *q, struct list_head *elt)
 static struct list_head *queue_peek(struct queue *q)
 {
 	unsigned level;
+	struct list_head *h;
 
 	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		if (!list_empty(q->qs + level))
-			return q->qs[level].next;
+		list_for_each(h, q->qs + level)
+			if (!is_sentinel(q, h))
+				return h;
 
 	return NULL;
 }
@@ -196,6 +207,37 @@ static struct list_head *list_pop(struct list_head *lh)
 	return r;
 }
 
+/*
+ * Sometimes we want to iterate through entries that have been pushed since
+ * a certain event.  We use sentinel entries on the queues to delimit these
+ * 'tick' events.
+ */
+static void queue_tick(struct queue *q)
+{
+	unsigned i;
+
+	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+		list_del(q->sentinels + i);
+		list_add_tail(q->sentinels + i, q->qs + i);
+	}
+}
+
+typedef void (*iter_fn)(struct list_head *, void *);
+static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
+{
+	unsigned i;
+	struct list_head *h;
+
+	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+		list_for_each_prev(h, q->qs + i) {
+			if (is_sentinel(q, h))
+				break;
+
+			fn(h, context);
+		}
+	}
+}
+
 /*----------------------------------------------------------------*/
 
 /*
@@ -212,7 +254,6 @@ struct entry {
 	bool dirty:1;
 	unsigned hit_count;
 	unsigned generation;
-	unsigned tick;
 };
 
 /*
@@ -460,7 +501,6 @@ static bool in_cache(struct mq_policy *mq, struct entry *e)
  */
 static void push(struct mq_policy *mq, struct entry *e)
 {
-	e->tick = mq->tick;
 	hash_insert(mq, e);
 
 	if (in_cache(mq, e))
@@ -507,14 +547,6 @@ static struct entry *peek(struct queue *q)
 	return h ? container_of(h, struct entry, list) : NULL;
 }
 
-/*
- * Has this entry already been updated?
- */
-static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
-{
-	return mq->tick == e->tick;
-}
-
 /*
  * The promotion threshold is adjusted every generation.  As are the counts
  * of the entries.
@@ -566,20 +598,9 @@ static void check_generation(struct mq_policy *mq)
  * Whenever we use an entry we bump up it's hit counter, and push it to the
  * back to it's current level.
  */
-static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
+static void requeue(struct mq_policy *mq, struct entry *e)
 {
-	if (updated_this_tick(mq, e))
-		return;
-
-	e->hit_count++;
-	mq->hit_count++;
 	check_generation(mq);
-
-	/* generation adjustment, to stop the counts increasing forever. */
-	/* FIXME: divide? */
-	/* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
-	e->generation = mq->generation;
-
 	del(mq, e);
 	push(mq, e);
 }
@@ -686,7 +707,7 @@ static int cache_entry_found(struct mq_policy *mq,
 			     struct entry *e,
 			     struct policy_result *result)
 {
-	requeue_and_update_tick(mq, e);
+	requeue(mq, e);
 
 	if (in_cache(mq, e)) {
 		result->op = POLICY_HIT;
@@ -724,7 +745,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 	new_e->dirty = false;
 	new_e->hit_count = e->hit_count;
 	new_e->generation = e->generation;
-	new_e->tick = e->tick;
 
 	del(mq, e);
 	free_entry(&mq->pre_cache_pool, e);
@@ -740,18 +760,16 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 				 int data_dir, struct policy_result *result)
 {
 	int r = 0;
-	bool updated = updated_this_tick(mq, e);
 
-	if ((!discarded_oblock && updated) ||
-	    !should_promote(mq, e, discarded_oblock, data_dir)) {
-		requeue_and_update_tick(mq, e);
+	if (!should_promote(mq, e, discarded_oblock, data_dir)) {
+		requeue(mq, e);
 		result->op = POLICY_MISS;
 
 	} else if (!can_migrate)
 		r = -EWOULDBLOCK;
 
 	else {
-		requeue_and_update_tick(mq, e);
+		requeue(mq, e);
 		r = pre_cache_to_cache(mq, e, result);
 	}
 
@@ -888,12 +906,36 @@ static void mq_destroy(struct dm_cache_policy *p)
 	kfree(mq);
 }
 
+static void update_pre_cache_hits(struct list_head *h, void *context)
+{
+	struct entry *e = container_of(h, struct entry, list);
+	e->hit_count++;
+}
+
+static void update_cache_hits(struct list_head *h, void *context)
+{
+	struct mq_policy *mq = context;
+	struct entry *e = container_of(h, struct entry, list);
+	e->hit_count++;
+	mq->hit_count++;
+}
+
 static void copy_tick(struct mq_policy *mq)
 {
-	unsigned long flags;
+	unsigned long flags, tick;
 
 	spin_lock_irqsave(&mq->tick_lock, flags);
-	mq->tick = mq->tick_protected;
+	tick = mq->tick_protected;
+	if (tick != mq->tick) {
+		queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
+		queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
+		queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
+		mq->tick = tick;
+	}
+
+	queue_tick(&mq->pre_cache);
+	queue_tick(&mq->cache_dirty);
+	queue_tick(&mq->cache_clean);
 	spin_unlock_irqrestore(&mq->tick_lock, flags);
 }
 
@@ -995,10 +1037,15 @@ static int mq_save_hints(struct mq_policy *mq, struct queue *q,
 {
 	int r;
 	unsigned level;
+	struct list_head *h;
 	struct entry *e;
 
 	for (level = 0; level < NR_QUEUE_LEVELS; level++)
-		list_for_each_entry(e, q->qs + level, list) {
+		list_for_each(h, q->qs + level) {
+			if (is_sentinel(q, h))
+				continue;
+
+			e = container_of(h, struct entry, list);
 			r = fn(context, infer_cblock(&mq->cache_pool, e),
 			       e->oblock, e->hit_count);
 			if (r)
-- 
cgit v1.2.1


From fdecee3224d90e51c611198baeb0c38e568ca0e8 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 20 Feb 2015 13:54:14 +0000
Subject: dm cache policy mq: remove unused generation member of struct entry

Remove to stop wasting memory.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 97b14309df90..6bfb39411fa9 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -253,7 +253,6 @@ struct entry {
 	 */
 	bool dirty:1;
 	unsigned hit_count;
-	unsigned generation;
 };
 
 /*
@@ -744,7 +743,6 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 	new_e->oblock = e->oblock;
 	new_e->dirty = false;
 	new_e->hit_count = e->hit_count;
-	new_e->generation = e->generation;
 
 	del(mq, e);
 	free_entry(&mq->pre_cache_pool, e);
@@ -796,7 +794,6 @@ static void insert_in_pre_cache(struct mq_policy *mq,
 	e->dirty = false;
 	e->oblock = oblock;
 	e->hit_count = 1;
-	e->generation = mq->generation;
 	push(mq, e);
 }
 
@@ -829,7 +826,6 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 	e->oblock = oblock;
 	e->dirty = false;
 	e->hit_count = 1;
-	e->generation = mq->generation;
 	push(mq, e);
 
 	result->cblock = infer_cblock(&mq->cache_pool, e);
@@ -1026,7 +1022,6 @@ static int mq_load_mapping(struct dm_cache_policy *p,
 	e->oblock = oblock;
 	e->dirty = false;	/* this gets corrected in a minute */
 	e->hit_count = hint_valid ? hint : 1;
-	e->generation = mq->generation;
 	push(mq, e);
 
 	return 0;
-- 
cgit v1.2.1


From e65ff8703f56273c6dc8b77373f4d2bef6e35107 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 20 Feb 2015 14:22:17 +0000
Subject: dm cache policy mq: try not to writeback data that changed in the
 last second

Writeback takes out a lock on the cache block, so will increase the
latency for any concurrent io.

This patch works by placing 2 sentinel objects on each level of the
multiqueues.  Every WRITEBACK_PERIOD the oldest sentinel gets moved to
the newest end of the queue level.

When looking for writeback work:
  if less than 25% of the cache is clean:
    we select the oldest object with the lowest hit count
  otherwise:
    we select the oldest object that is not past a writeback sentinel.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c | 94 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 6bfb39411fa9..3ddd1162334d 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -8,6 +8,7 @@
 #include "dm.h"
 
 #include <linux/hash.h>
+#include <linux/jiffies.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
@@ -126,8 +127,12 @@ static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
 #define NR_QUEUE_LEVELS 16u
 #define NR_SENTINELS NR_QUEUE_LEVELS * 3
 
+#define WRITEBACK_PERIOD HZ
+
 struct queue {
 	unsigned nr_elts;
+	bool current_writeback_sentinels;
+	unsigned long next_writeback;
 	struct list_head qs[NR_QUEUE_LEVELS];
 	struct list_head sentinels[NR_SENTINELS];
 };
@@ -137,12 +142,21 @@ static void queue_init(struct queue *q)
 	unsigned i;
 
 	q->nr_elts = 0;
+	q->current_writeback_sentinels = false;
+	q->next_writeback = 0;
 	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
 		INIT_LIST_HEAD(q->qs + i);
 		INIT_LIST_HEAD(q->sentinels + i);
+		INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
+		INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
 	}
 }
 
+static unsigned queue_size(struct queue *q)
+{
+	return q->nr_elts;
+}
+
 static bool queue_empty(struct queue *q)
 {
 	return q->nr_elts == 0;
@@ -197,6 +211,27 @@ static struct list_head *queue_pop(struct queue *q)
 	return r;
 }
 
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct list_head *queue_pop_old(struct queue *q)
+{
+	unsigned level;
+	struct list_head *h;
+
+	for (level = 0; level < NR_QUEUE_LEVELS; level++)
+		list_for_each(h, q->qs + level) {
+			if (is_sentinel(q, h))
+				break;
+
+			q->nr_elts--;
+			list_del(h);
+			return h;
+		}
+
+	return NULL;
+}
+
 static struct list_head *list_pop(struct list_head *lh)
 {
 	struct list_head *r = lh->next;
@@ -207,6 +242,31 @@ static struct list_head *list_pop(struct list_head *lh)
 	return r;
 }
 
+static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
+{
+	if (q->current_writeback_sentinels)
+		return q->sentinels + NR_QUEUE_LEVELS + level;
+	else
+		return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
+}
+
+static void queue_update_writeback_sentinels(struct queue *q)
+{
+	unsigned i;
+	struct list_head *h;
+
+	if (time_after(jiffies, q->next_writeback)) {
+		for (i = 0; i < NR_QUEUE_LEVELS; i++) {
+			h = writeback_sentinel(q, i);
+			list_del(h);
+			list_add_tail(h, q->qs + i);
+		}
+
+		q->next_writeback = jiffies + WRITEBACK_PERIOD;
+		q->current_writeback_sentinels = !q->current_writeback_sentinels;
+	}
+}
+
 /*
  * Sometimes we want to iterate through entries that have been pushed since
  * a certain event.  We use sentinel entries on the queues to delimit these
@@ -540,6 +600,20 @@ static struct entry *pop(struct mq_policy *mq, struct queue *q)
 	return e;
 }
 
+static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
+{
+	struct entry *e;
+	struct list_head *h = queue_pop_old(q);
+
+	if (!h)
+		return NULL;
+
+	e = container_of(h, struct entry, list);
+	hash_remove(e);
+
+	return e;
+}
+
 static struct entry *peek(struct queue *q)
 {
 	struct list_head *h = queue_peek(q);
@@ -932,6 +1006,7 @@ static void copy_tick(struct mq_policy *mq)
 	queue_tick(&mq->pre_cache);
 	queue_tick(&mq->cache_dirty);
 	queue_tick(&mq->cache_clean);
+	queue_update_writeback_sentinels(&mq->cache_dirty);
 	spin_unlock_irqrestore(&mq->tick_lock, flags);
 }
 
@@ -1112,10 +1187,27 @@ static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
 	return r;
 }
 
+#define CLEAN_TARGET_PERCENTAGE 25
+
+static bool clean_target_met(struct mq_policy *mq)
+{
+	/*
+	 * Cache entries may not be populated.  So we're cannot rely on the
+	 * size of the clean queue.
+	 */
+	unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
+	unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;
+
+	return nr_clean >= target;
+}
+
 static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
 			      dm_cblock_t *cblock)
 {
-	struct entry *e = pop(mq, &mq->cache_dirty);
+	struct entry *e = pop_old(mq, &mq->cache_dirty);
+
+	if (!e && !clean_target_met(mq))
+		e = pop(mq, &mq->cache_dirty);
 
 	if (!e)
 		return -ENODATA;
-- 
cgit v1.2.1


From 09c2d53101da87f5ab4084643d2f8c718b3ab3cf Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 27 Feb 2015 22:25:26 -0500
Subject: dm: rename __dm_get_reserved_ios() helper to __dm_get_module_param()

__dm_get_module_param() could be useful for future DM module parameters
besides those related to "reserved_ios".

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8001fe9e3434..0e5f3441fcda 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -250,35 +250,35 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
  */
 static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 
-static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
+static unsigned __dm_get_module_param(unsigned *module_param,
 				      unsigned def, unsigned max)
 {
-	unsigned ios = ACCESS_ONCE(*reserved_ios);
-	unsigned modified_ios = 0;
+	unsigned param = ACCESS_ONCE(*module_param);
+	unsigned modified_param = 0;
 
-	if (!ios)
-		modified_ios = def;
-	else if (ios > max)
-		modified_ios = max;
+	if (!param)
+		modified_param = def;
+	else if (param > max)
+		modified_param = max;
 
-	if (modified_ios) {
-		(void)cmpxchg(reserved_ios, ios, modified_ios);
-		ios = modified_ios;
+	if (modified_param) {
+		(void)cmpxchg(module_param, param, modified_param);
+		param = modified_param;
 	}
 
-	return ios;
+	return param;
 }
 
 unsigned dm_get_reserved_bio_based_ios(void)
 {
-	return __dm_get_reserved_ios(&reserved_bio_based_ios,
+	return __dm_get_module_param(&reserved_bio_based_ios,
 				     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 
 unsigned dm_get_reserved_rq_based_ios(void)
 {
-	return __dm_get_reserved_ios(&reserved_rq_based_ios,
+	return __dm_get_module_param(&reserved_rq_based_ios,
 				     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
 }
 EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
-- 
cgit v1.2.1


From 52b09914af86fa3e728175c1125c91520e437b2f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 23 Feb 2015 16:36:41 -0500
Subject: dm: remove unnecessary wrapper around blk_lld_busy

There is no need for DM to export a wrapper around the already exported
blk_lld_busy().

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c | 2 +-
 drivers/md/dm.c       | 6 ------
 2 files changed, 1 insertion(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d376dc87716e..add6391f3f8e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1627,7 +1627,7 @@ static int __pgpath_busy(struct pgpath *pgpath)
 {
 	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
 
-	return dm_underlying_device_busy(q);
+	return blk_lld_busy(q);
 }
 
 /*
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0e5f3441fcda..e7095ebb8d64 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2006,12 +2006,6 @@ out:
 	dm_put_live_table(md, srcu_idx);
 }
 
-int dm_underlying_device_busy(struct request_queue *q)
-{
-	return blk_lld_busy(q);
-}
-EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
-
 static int dm_lld_busy(struct request_queue *q)
 {
 	int r;
-- 
cgit v1.2.1


From d56b9b28a4a5d9e61dd99154b986e760373e2392 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 23 Feb 2015 19:10:15 -0500
Subject: dm: remove request-based DM queue's lld_busy_fn hook

DM multipath is the only caller of blk_lld_busy() -- which calls a
queue's lld_busy_fn hook.  Request-based DM doesn't support stacking
multipath devices so there is no reason to register the lld_busy_fn hook
on a multipath device's queue using blk_queue_lld_busy().

As such, remove functions dm_lld_busy and dm_table_any_busy_target.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c | 14 --------------
 drivers/md/dm.c       | 17 -----------------
 drivers/md/dm.h       |  1 -
 3 files changed, 32 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 6554d9148927..057312048b68 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1677,20 +1677,6 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 	return r;
 }
 
-int dm_table_any_busy_target(struct dm_table *t)
-{
-	unsigned i;
-	struct dm_target *ti;
-
-	for (i = 0; i < t->num_targets; i++) {
-		ti = t->targets + i;
-		if (ti->type->busy && ti->type->busy(ti))
-			return 1;
-	}
-
-	return 0;
-}
-
 struct mapped_device *dm_table_get_md(struct dm_table *t)
 {
 	return t->md;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e7095ebb8d64..cc8aed2e3f88 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2006,22 +2006,6 @@ out:
 	dm_put_live_table(md, srcu_idx);
 }
 
-static int dm_lld_busy(struct request_queue *q)
-{
-	int r;
-	struct mapped_device *md = q->queuedata;
-	struct dm_table *map = dm_get_live_table_fast(md);
-
-	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
-		r = 1;
-	else
-		r = dm_table_any_busy_target(map);
-
-	dm_put_live_table_fast(md);
-
-	return r;
-}
-
 static int dm_any_congested(void *congested_data, int bdi_bits)
 {
 	int r = bdi_bits;
@@ -2545,7 +2529,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	dm_init_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
-	blk_queue_lld_busy(md->queue, dm_lld_busy);
 
 	/* Also initialize the request-based DM worker thread */
 	init_kthread_worker(&md->kworker);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 59f53e79db82..db495863fa5f 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -70,7 +70,6 @@ void dm_table_presuspend_undo_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
-int dm_table_any_busy_target(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
 struct target_type *dm_table_get_immutable_target_type(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
-- 
cgit v1.2.1


From 74672d069b298b03e9f657fd70915e055739882e Mon Sep 17 00:00:00 2001
From: Gu Zheng <guz.fnst@cn.fujitsu.com>
Date: Fri, 3 Apr 2015 08:44:47 +0800
Subject: md: fix md io stats accounting broken

Simon reported the md io stats accounting issue:
"
I'm seeing "iostat -x -k 1" print this after a RAID1 rebuild on 4.0-rc5.
It's not abnormal other than it's 3-disk, with one being SSD (sdc) and
the other two being write-mostly:

Device:         rrqm/s   wrqm/s     r/s     w/s    rkB/s    wkB/s avgrq-sz avgqu-sz   await r_await w_await  svctm  %util
sda               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00    0.00    0.00   0.00   0.00
sdb               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00    0.00    0.00   0.00   0.00
sdc               0.00     0.00    0.00    0.00     0.00     0.00     0.00     0.00    0.00    0.00    0.00   0.00   0.00
md0               0.00     0.00    0.00    0.00     0.00     0.00     0.00   345.00    0.00    0.00    0.00   0.00 100.00
md2               0.00     0.00    0.00    0.00     0.00     0.00     0.00 58779.00    0.00    0.00    0.00   0.00 100.00
md1               0.00     0.00    0.00    0.00     0.00     0.00     0.00    12.00    0.00    0.00    0.00   0.00 100.00
"
The cause is that commit 18c0b223cf9901727ef3b02da6711ac930b4e5d4 switched
to generic_start_io_acct() for accounting the disk stats instead of the
open-coded version, and that helper also increments .in_flight[rw], which md
does not need. So go back to the open-coded accounting here to fix it.

Reported-by: Simon Kirby <sim@hostway.ca>
Cc: <stable@vger.kernel.org> 3.19
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 717daad71fb1..e6178787ce3d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -249,6 +249,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 	const int rw = bio_data_dir(bio);
 	struct mddev *mddev = q->queuedata;
 	unsigned int sectors;
+	int cpu;
 
 	if (mddev == NULL || mddev->pers == NULL
 	    || !mddev->ready) {
@@ -284,7 +285,10 @@ static void md_make_request(struct request_queue *q, struct bio *bio)
 	sectors = bio_sectors(bio);
 	mddev->pers->make_request(mddev, bio);
 
-	generic_start_io_acct(rw, sectors, &mddev->gendisk->part0);
+	cpu = part_stat_lock();
+	part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
+	part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
+	part_stat_unlock();
 
 	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
 		wake_up(&mddev->sb_wait);
-- 
cgit v1.2.1


From 47d68979cc968535cb87f3e5f2e6a3533ea48fbd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 10 Apr 2015 13:19:04 +1000
Subject: md/raid0: fix bug with chunksize not a power of 2.

Since commit 20d0189b1012a37d2533a87fb451f7852f2418d1
in v3.14-rc1 RAID0 has performed incorrect calculations
when the chunksize is not a power of 2.

This happens because "sector_div()" modifies its first argument, but
this wasn't taken into account in the patch.

So restore that first arg before re-using the variable.

Reported-by: Joe Landman <joe.landman@gmail.com>
Reported-by: Dave Chinner <david@fromorbit.com>
Fixes: 20d0189b1012a37d2533a87fb451f7852f2418d1
Cc: stable@vger.kernel.org (3.14 and later).
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 3ed9f42ddca6..3b5d7f704aa3 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -313,7 +313,7 @@ static struct strip_zone *find_zone(struct r0conf *conf,
 
 /*
  * remaps the bio to the target device. we separate two flows.
- * power 2 flow and a general flow for the sake of perfromance
+ * power 2 flow and a general flow for the sake of performance
 */
 static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
 				sector_t sector, sector_t *sector_offset)
@@ -524,6 +524,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 			split = bio;
 		}
 
+		sector = bio->bi_iter.bi_sector;
 		zone = find_zone(mddev->private, &sector);
 		tmp_dev = map_sector(mddev, zone, sector, &sector);
 		split->bi_bdev = tmp_dev->bdev;
-- 
cgit v1.2.1


From ff36ab34583ae23250a4bf39805d69771e7e0131 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 23 Feb 2015 17:56:37 -0500
Subject: dm: remove request-based logic from make_request_fn wrapper

The old dm_request() method used for q->make_request_fn had a branch for
request-based DM support but it isn't needed given that
dm_init_request_based_queue() sets it to the standard blk_queue_bio()
anyway.

Clean up dm_init_md_queue() to be DM device-type agnostic and have
dm_setup_md_queue() properly finish queue setup based on DM device-type
(bio-based vs request-based).

A followup block patch can be made to remove the export for
blk_queue_bio() now that DM no longer calls it directly.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index cc8aed2e3f88..43e0d1a85a60 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1693,7 +1693,7 @@ out:
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
  */
-static void _dm_request(struct request_queue *q, struct bio *bio)
+static void dm_make_request(struct request_queue *q, struct bio *bio)
 {
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
@@ -1725,16 +1725,6 @@ int dm_request_based(struct mapped_device *md)
 	return blk_queue_stackable(md->queue);
 }
 
-static void dm_request(struct request_queue *q, struct bio *bio)
-{
-	struct mapped_device *md = q->queuedata;
-
-	if (dm_request_based(md))
-		blk_queue_bio(q, bio);
-	else
-		_dm_request(q, bio);
-}
-
 static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 {
 	int r;
@@ -2100,9 +2090,8 @@ static void dm_init_md_queue(struct mapped_device *md)
 	md->queue->queuedata = md;
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
-	blk_queue_make_request(md->queue, dm_request);
+
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
-	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 }
 
 /*
@@ -2335,7 +2324,7 @@ int dm_queue_merge_is_compulsory(struct request_queue *q)
 	if (!q->merge_bvec_fn)
 		return 0;
 
-	if (q->make_request_fn == dm_request) {
+	if (q->make_request_fn == dm_make_request) {
 		dev_md = q->queuedata;
 		if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
 			return 0;
@@ -2545,9 +2534,15 @@ static int dm_init_request_based_queue(struct mapped_device *md)
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
-	if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
-		DMWARN("Cannot initialize queue for request-based mapped device");
-		return -EINVAL;
+	if (dm_md_type_request_based(md)) {
+		if (!dm_init_request_based_queue(md)) {
+			DMWARN("Cannot initialize queue for request-based mapped device");
+			return -EINVAL;
+		}
+	} else {
+		/* bio-based specific initialization */
+		blk_queue_make_request(md->queue, dm_make_request);
+		blk_queue_merge_bvec(md->queue, dm_merge_bvec);
 	}
 
 	return 0;
-- 
cgit v1.2.1


From 9a0e609e3fd8a95c96629b9fbde6b8c5b9a1456a Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 24 Feb 2015 11:03:22 -0500
Subject: dm: only run the queue on completion if congested or no requests
 pending

On really fast storage it can be beneficial to delay running the
request_queue to allow the elevator more opportunity to merge requests.

Otherwise, it has been observed that requests are being sent to
q->request_fn much quicker than is ideal on IOPS-bound backends.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 43e0d1a85a60..7924c00e0716 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1024,10 +1024,13 @@ static void end_clone_bio(struct bio *clone, int error)
  */
 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 {
+	int nr_requests_pending;
+
 	atomic_dec(&md->pending[rw]);
 
 	/* nudge anyone waiting on suspend queue */
-	if (!md_in_flight(md))
+	nr_requests_pending = md_in_flight(md);
+	if (!nr_requests_pending)
 		wake_up(&md->wait);
 
 	/*
@@ -1036,8 +1039,11 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 	 * back into ->request_fn() could deadlock attempting to grab the
 	 * queue lock again.
 	 */
-	if (run_queue)
-		blk_run_queue_async(md->queue);
+	if (run_queue) {
+		if (!nr_requests_pending ||
+		    (nr_requests_pending >= md->queue->nr_congestion_on))
+			blk_run_queue_async(md->queue);
+	}
 
 	/*
 	 * dm_put() must be at the end of this function. See the comment above
-- 
cgit v1.2.1


From 9d1deb83d489364f8749a3a1ba1689efb07d94b0 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 24 Feb 2015 20:49:18 -0500
Subject: dm: don't schedule delayed run of the queue if nothing to do

In request-based DM's dm_request_fn(), if blk_peek_request() returns
NULL just return.  Avoids unnecessary blk_delay_queue().

Reported-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7924c00e0716..6f854287384b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1963,7 +1963,7 @@ static void dm_request_fn(struct request_queue *q)
 	while (!blk_queue_stopped(q)) {
 		rq = blk_peek_request(q);
 		if (!rq)
-			goto delay_and_out;
+			goto out;
 
 		/* always use block 0 to find the target for flushes for now */
 		pos = 0;
-- 
cgit v1.2.1


From d548b34b062b60b4f4df295a0b4823dfda1f1fc4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 5 Mar 2015 22:21:10 -0500
Subject: dm: reduce the queue delay used in dm_request_fn from 100ms to 10ms

Commit 7eaceaccab ("block: remove per-queue plugging") didn't justify
DM's use of a 100ms delay; such an extended delay is a liability when
there is reason to re-kick the queue.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 6f854287384b..98eb02d32e6e 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1997,7 +1997,7 @@ static void dm_request_fn(struct request_queue *q)
 	goto out;
 
 delay_and_out:
-	blk_delay_queue(q, HZ / 10);
+	blk_delay_queue(q, HZ / 100);
 out:
 	dm_put_live_table(md, srcu_idx);
 }
-- 
cgit v1.2.1


From de3ec86dff160d35c817bb70eeaeff6e392f44a4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 24 Feb 2015 21:58:21 -0500
Subject: dm: don't start current request if it would've merged with the
 previous

Request-based DM's dm_request_fn() is so fast to pull requests off the
queue that steps need to be taken to promote merging by avoiding request
processing if it makes sense.

If the current request would've merged with previous request let the
current request stay on the queue longer.

Suggested-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 98eb02d32e6e..2ae78b31e4c0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,7 @@
 #include <linux/delay.h>
 #include <linux/wait.h>
 #include <linux/kthread.h>
+#include <linux/elevator.h> /* for rq_end_sector() */
 
 #include <trace/events/block.h>
 
@@ -216,6 +217,10 @@ struct mapped_device {
 
 	struct kthread_worker kworker;
 	struct task_struct *kworker_task;
+
+	/* for request-based merge heuristic in dm_request_fn() */
+	sector_t last_rq_pos;
+	int last_rq_rw;
 };
 
 /*
@@ -1930,6 +1935,9 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 	blk_start_request(orig);
 	atomic_inc(&md->pending[rq_data_dir(orig)]);
 
+	md->last_rq_pos = rq_end_sector(orig);
+	md->last_rq_rw = rq_data_dir(orig);
+
 	/*
 	 * Hold the md reference here for the in-flight I/O.
 	 * We can't rely on the reference count by device opener,
@@ -1982,6 +1990,10 @@ static void dm_request_fn(struct request_queue *q)
 			continue;
 		}
 
+		if (md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+		    md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
+			goto delay_and_out;
+
 		if (ti->type->busy && ti->type->busy(ti))
 			goto delay_and_out;
 
-- 
cgit v1.2.1


From b898320d683d54c2bc17b748b9742d2b601ad453 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 27 Feb 2015 17:58:42 -0500
Subject: dm sysfs: introduce ability to add writable attributes

Add DM_ATTR_RW() macro and establish .store method in dm_sysfs_ops.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-sysfs.c | 32 +++++++++++++++++++++++++++-----
 1 file changed, 27 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index c62c5ab6aed5..1271c31709fd 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -11,7 +11,7 @@
 struct dm_sysfs_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct mapped_device *, char *);
-	ssize_t (*store)(struct mapped_device *, char *);
+	ssize_t (*store)(struct mapped_device *, const char *, size_t count);
 };
 
 #define DM_ATTR_RO(_name) \
@@ -39,6 +39,31 @@ static ssize_t dm_attr_show(struct kobject *kobj, struct attribute *attr,
 	return ret;
 }
 
+#define DM_ATTR_RW(_name) \
+struct dm_sysfs_attr dm_attr_##_name = \
+	__ATTR(_name, S_IRUGO | S_IWUSR, dm_attr_##_name##_show, dm_attr_##_name##_store)
+
+static ssize_t dm_attr_store(struct kobject *kobj, struct attribute *attr,
+			     const char *page, size_t count)
+{
+	struct dm_sysfs_attr *dm_attr;
+	struct mapped_device *md;
+	ssize_t ret;
+
+	dm_attr = container_of(attr, struct dm_sysfs_attr, attr);
+	if (!dm_attr->store)
+		return -EIO;
+
+	md = dm_get_from_kobject(kobj);
+	if (!md)
+		return -EINVAL;
+
+	ret = dm_attr->store(md, page, count);
+	dm_put(md);
+
+	return ret;
+}
+
 static ssize_t dm_attr_name_show(struct mapped_device *md, char *buf)
 {
 	if (dm_copy_name_and_uuid(md, buf, NULL))
@@ -77,12 +102,9 @@ static struct attribute *dm_attrs[] = {
 
 static const struct sysfs_ops dm_sysfs_ops = {
 	.show	= dm_attr_show,
+	.store	= dm_attr_store,
 };
 
-/*
- * dm kobject is embedded in mapped_device structure
- * no need to define release function here
- */
 static struct kobj_type dm_ktype = {
 	.sysfs_ops	= &dm_sysfs_ops,
 	.default_attrs	= dm_attrs,
-- 
cgit v1.2.1


From 0ce65797a77ee780f62909d3128bf08b9735718b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 26 Feb 2015 00:50:28 -0500
Subject: dm: impose configurable deadline for dm_request_fn's merge heuristic

Otherwise, for sequential workloads, the dm_request_fn can allow
excessive request merging at the expense of increased service time.

Add a per-device sysfs attribute to allow the user to control how long a
request, that is a reasonable merge candidate, can be queued on the
request queue.  The resolution of this request dispatch deadline is in
microseconds (ranging from 1 to 100000 usecs), to set a 20us deadline:
  echo 20 > /sys/block/dm-7/dm/rq_based_seq_io_merge_deadline

The dm_request_fn's merge heuristic and associated extra accounting is
disabled by default (rq_based_seq_io_merge_deadline is 0).

This sysfs attribute is not applicable to bio-based DM devices so it
will only ever report 0 for them.

By allowing a request to remain on the queue it will block other
requests on the queue.  But introducing a short dequeue delay has proven
very effective at enabling certain sequential IO workloads on really
fast, yet IOPS constrained, devices to build up slightly larger IOs --
yielding 90+% throughput improvements.  Having precise control over the
time taken to wait for larger requests to build affords control beyond
that of waiting for certain IO sizes to accumulate (which would require
a deadline anyway).  This knob will only ever make sense with sequential
IO workloads and the particular value used is storage configuration
specific.

Given the expected niche use-case for when this knob is useful it has
been deemed acceptable to expose this relatively crude method for
crafting optimal IO on specific storage -- especially given the solution
is simple yet effective.  In the context of DM multipath, it is
advisable to tune this sysfs attribute to a value that offers the best
performance for the common case (e.g. if 4 paths are expected active,
tune for that; if paths fail then performance may be slightly reduced).

Alternatives were explored to have request-based DM autotune this value
(e.g. if/when paths fail) but they were quickly deemed too fragile and
complex to warrant further design and development time.  If this problem
proves more common as faster storage emerges we'll have to look at
elevating a generic solution into the block core.

Tested-by: Shiva Krishna Merla <shivakrishna.merla@netapp.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-sysfs.c |  2 ++
 drivers/md/dm.c       | 57 +++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/dm.h       |  4 ++++
 3 files changed, 59 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index 1271c31709fd..f5bb3944f75e 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -92,11 +92,13 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
 static DM_ATTR_RO(name);
 static DM_ATTR_RO(uuid);
 static DM_ATTR_RO(suspended);
+static DM_ATTR_RW(rq_based_seq_io_merge_deadline);
 
 static struct attribute *dm_attrs[] = {
 	&dm_attr_name.attr,
 	&dm_attr_uuid.attr,
 	&dm_attr_suspended.attr,
+	&dm_attr_rq_based_seq_io_merge_deadline.attr,
 	NULL,
 };
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2ae78b31e4c0..5294e016e92b 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -21,6 +21,7 @@
 #include <linux/delay.h>
 #include <linux/wait.h>
 #include <linux/kthread.h>
+#include <linux/ktime.h>
 #include <linux/elevator.h> /* for rq_end_sector() */
 
 #include <trace/events/block.h>
@@ -219,8 +220,10 @@ struct mapped_device {
 	struct task_struct *kworker_task;
 
 	/* for request-based merge heuristic in dm_request_fn() */
-	sector_t last_rq_pos;
+	unsigned seq_rq_merge_deadline_usecs;
 	int last_rq_rw;
+	sector_t last_rq_pos;
+	ktime_t last_rq_start_time;
 };
 
 /*
@@ -1935,8 +1938,11 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 	blk_start_request(orig);
 	atomic_inc(&md->pending[rq_data_dir(orig)]);
 
-	md->last_rq_pos = rq_end_sector(orig);
-	md->last_rq_rw = rq_data_dir(orig);
+	if (md->seq_rq_merge_deadline_usecs) {
+		md->last_rq_pos = rq_end_sector(orig);
+		md->last_rq_rw = rq_data_dir(orig);
+		md->last_rq_start_time = ktime_get();
+	}
 
 	/*
 	 * Hold the md reference here for the in-flight I/O.
@@ -1948,6 +1954,45 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 	dm_get(md);
 }
 
+#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
+{
+	return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
+}
+
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+						     const char *buf, size_t count)
+{
+	unsigned deadline;
+
+	if (!dm_request_based(md))
+		return count;
+
+	if (kstrtouint(buf, 10, &deadline))
+		return -EINVAL;
+
+	if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
+		deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
+
+	md->seq_rq_merge_deadline_usecs = deadline;
+
+	return count;
+}
+
+static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
+{
+	ktime_t kt_deadline;
+
+	if (!md->seq_rq_merge_deadline_usecs)
+		return false;
+
+	kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
+	kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
+
+	return !ktime_after(ktime_get(), kt_deadline);
+}
+
 /*
  * q->request_fn for request-based dm.
  * Called with the queue lock held.
@@ -1990,7 +2035,8 @@ static void dm_request_fn(struct request_queue *q)
 			continue;
 		}
 
-		if (md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+		if (dm_request_peeked_before_merge_deadline(md) &&
+		    md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
 		    md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
 			goto delay_and_out;
 
@@ -2532,6 +2578,9 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	if (!q)
 		return 0;
 
+	/* disable dm_request_fn's merge heuristic by default */
+	md->seq_rq_merge_deadline_usecs = 0;
+
 	md->queue = q;
 	dm_init_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index db495863fa5f..5522422cc6c4 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -234,4 +234,8 @@ static inline bool dm_message_test_buffer_overflow(char *result, unsigned maxlen
 	return !maxlen || strlen(result) + 1 >= maxlen;
 }
 
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf);
+ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
+						     const char *buf, size_t count);
+
 #endif
-- 
cgit v1.2.1


From bfebd1cdb497a57757c83f5fbf1a29931591e2a4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Sun, 8 Mar 2015 00:51:47 -0500
Subject: dm: add full blk-mq support to request-based DM

Commit e5863d9ad ("dm: allocate requests in target when stacking on
blk-mq devices") served as the first step toward fully utilizing blk-mq
in request-based DM -- it enabled stacking an old-style (request_fn)
request_queue on top of the underlying blk-mq device(s).  That first step
didn't improve performance of DM multipath on top of fast blk-mq devices
(e.g. NVMe) because the top-level old-style request_queue was severely
limited by the queue_lock.

The second step offered here enables stacking a blk-mq request_queue
on top of the underlying blk-mq device(s).  This unlocks significant
performance gains on fast blk-mq devices.  Keith Busch tested on his NVMe
testbed and offered this really positive news:

 "Just providing a performance update. All my fio tests are getting
  roughly equal performance whether accessed through the raw block
  device or the multipath device mapper (~470k IOPS). I could only push
  ~20% of the raw iops through dm before this conversion, so this latest
  tree is looking really solid from a performance standpoint."

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Tested-by: Keith Busch <keith.busch@intel.com>
---
 drivers/md/dm-mpath.c |   2 +-
 drivers/md/dm-table.c |  11 +-
 drivers/md/dm.c       | 317 +++++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 259 insertions(+), 71 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index add6391f3f8e..c8f07e5a9a17 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1703,7 +1703,7 @@ out:
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 8, 0},
+	.version = {1, 9, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 057312048b68..66600cab9fa5 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -18,6 +18,7 @@
 #include <linux/mutex.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
+#include <linux/blk-mq.h>
 
 #define DM_MSG_PREFIX "table"
 
@@ -1695,9 +1696,13 @@ void dm_table_run_md_queue_async(struct dm_table *t)
 	md = dm_table_get_md(t);
 	queue = dm_get_md_queue(md);
 	if (queue) {
-		spin_lock_irqsave(queue->queue_lock, flags);
-		blk_run_queue_async(queue);
-		spin_unlock_irqrestore(queue->queue_lock, flags);
+		if (queue->mq_ops)
+			blk_mq_run_hw_queues(queue, true);
+		else {
+			spin_lock_irqsave(queue->queue_lock, flags);
+			blk_run_queue_async(queue);
+			spin_unlock_irqrestore(queue->queue_lock, flags);
+		}
 	}
 }
 EXPORT_SYMBOL(dm_table_run_md_queue_async);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5294e016e92b..3a66baac76ed 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -23,6 +23,7 @@
 #include <linux/kthread.h>
 #include <linux/ktime.h>
 #include <linux/elevator.h> /* for rq_end_sector() */
+#include <linux/blk-mq.h>
 
 #include <trace/events/block.h>
 
@@ -224,6 +225,9 @@ struct mapped_device {
 	int last_rq_rw;
 	sector_t last_rq_pos;
 	ktime_t last_rq_start_time;
+
+	/* for blk-mq request-based DM support */
+	struct blk_mq_tag_set tag_set;
 };
 
 /*
@@ -1025,6 +1029,11 @@ static void end_clone_bio(struct bio *clone, int error)
 	blk_update_request(tio->orig, 0, nr_bytes);
 }
 
+static struct dm_rq_target_io *tio_from_request(struct request *rq)
+{
+	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
+}
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -1048,8 +1057,10 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 	 * queue lock again.
 	 */
 	if (run_queue) {
-		if (!nr_requests_pending ||
-		    (nr_requests_pending >= md->queue->nr_congestion_on))
+		if (md->queue->mq_ops)
+			blk_mq_run_hw_queues(md->queue, true);
+		else if (!nr_requests_pending ||
+			 (nr_requests_pending >= md->queue->nr_congestion_on))
 			blk_run_queue_async(md->queue);
 	}
 
@@ -1062,13 +1073,17 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 static void free_rq_clone(struct request *clone)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
 
 	blk_rq_unprep_clone(clone);
+
 	if (clone->q && clone->q->mq_ops)
 		tio->ti->type->release_clone_rq(clone);
 	else
-		free_clone_request(tio->md, clone);
-	free_rq_tio(tio);
+		free_clone_request(md, clone);
+
+	if (!md->queue->mq_ops)
+		free_rq_tio(tio);
 }
 
 /*
@@ -1097,17 +1112,22 @@ static void dm_end_request(struct request *clone, int error)
 	}
 
 	free_rq_clone(clone);
-	blk_end_request_all(rq, error);
+	if (!rq->q->mq_ops)
+		blk_end_request_all(rq, error);
+	else
+		blk_mq_end_request(rq, error);
 	rq_completed(md, rw, true);
 }
 
 static void dm_unprep_request(struct request *rq)
 {
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
 	struct request *clone = tio->clone;
 
-	rq->special = NULL;
-	rq->cmd_flags &= ~REQ_DONTPREP;
+	if (!rq->q->mq_ops) {
+		rq->special = NULL;
+		rq->cmd_flags &= ~REQ_DONTPREP;
+	}
 
 	if (clone)
 		free_rq_clone(clone);
@@ -1116,18 +1136,29 @@ static void dm_unprep_request(struct request *rq)
 /*
  * Requeue the original request of a clone.
  */
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
-						 struct request *rq)
+static void old_requeue_request(struct request *rq)
 {
-	int rw = rq_data_dir(rq);
 	struct request_queue *q = rq->q;
 	unsigned long flags;
 
-	dm_unprep_request(rq);
-
 	spin_lock_irqsave(q->queue_lock, flags);
 	blk_requeue_request(q, rq);
 	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_requeue_unmapped_original_request(struct mapped_device *md,
+						 struct request *rq)
+{
+	int rw = rq_data_dir(rq);
+
+	dm_unprep_request(rq);
+
+	if (!rq->q->mq_ops)
+		old_requeue_request(rq);
+	else {
+		blk_mq_requeue_request(rq);
+		blk_mq_kick_requeue_list(rq->q);
+	}
 
 	rq_completed(md, rw, false);
 }
@@ -1139,35 +1170,44 @@ static void dm_requeue_unmapped_request(struct request *clone)
 	dm_requeue_unmapped_original_request(tio->md, tio->orig);
 }
 
-static void __stop_queue(struct request_queue *q)
-{
-	blk_stop_queue(q);
-}
-
-static void stop_queue(struct request_queue *q)
+static void old_stop_queue(struct request_queue *q)
 {
 	unsigned long flags;
 
+	if (blk_queue_stopped(q))
+		return;
+
 	spin_lock_irqsave(q->queue_lock, flags);
-	__stop_queue(q);
+	blk_stop_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void __start_queue(struct request_queue *q)
+static void stop_queue(struct request_queue *q)
 {
-	if (blk_queue_stopped(q))
-		blk_start_queue(q);
+	if (!q->mq_ops)
+		old_stop_queue(q);
+	else
+		blk_mq_stop_hw_queues(q);
 }
 
-static void start_queue(struct request_queue *q)
+static void old_start_queue(struct request_queue *q)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	__start_queue(q);
+	if (blk_queue_stopped(q))
+		blk_start_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+static void start_queue(struct request_queue *q)
+{
+	if (!q->mq_ops)
+		old_start_queue(q);
+	else
+		blk_mq_start_stopped_hw_queues(q, true);
+}
+
 static void dm_done(struct request *clone, int error, bool mapped)
 {
 	int r = error;
@@ -1206,13 +1246,20 @@ static void dm_done(struct request *clone, int error, bool mapped)
 static void dm_softirq_done(struct request *rq)
 {
 	bool mapped = true;
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
 	struct request *clone = tio->clone;
+	int rw;
 
 	if (!clone) {
-		blk_end_request_all(rq, tio->error);
-		rq_completed(tio->md, rq_data_dir(rq), false);
-		free_rq_tio(tio);
+		rw = rq_data_dir(rq);
+		if (!rq->q->mq_ops) {
+			blk_end_request_all(rq, tio->error);
+			rq_completed(tio->md, rw, false);
+			free_rq_tio(tio);
+		} else {
+			blk_mq_end_request(rq, tio->error);
+			rq_completed(tio->md, rw, false);
+		}
 		return;
 	}
 
@@ -1228,7 +1275,7 @@ static void dm_softirq_done(struct request *rq)
  */
 static void dm_complete_request(struct request *rq, int error)
 {
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_rq_target_io *tio = tio_from_request(rq);
 
 	tio->error = error;
 	blk_complete_request(rq);
@@ -1247,7 +1294,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
 }
 
 /*
- * Called with the clone's queue lock held
+ * Called with the clone's queue lock held (for non-blk-mq)
  */
 static void end_clone_request(struct request *clone, int error)
 {
@@ -1808,6 +1855,18 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 
 static void map_tio_request(struct kthread_work *work);
 
+static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
+		     struct mapped_device *md)
+{
+	tio->md = md;
+	tio->ti = NULL;
+	tio->clone = NULL;
+	tio->orig = rq;
+	tio->error = 0;
+	memset(&tio->info, 0, sizeof(tio->info));
+	init_kthread_work(&tio->work, map_tio_request);
+}
+
 static struct dm_rq_target_io *prep_tio(struct request *rq,
 					struct mapped_device *md, gfp_t gfp_mask)
 {
@@ -1819,13 +1878,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
 	if (!tio)
 		return NULL;
 
-	tio->md = md;
-	tio->ti = NULL;
-	tio->clone = NULL;
-	tio->orig = rq;
-	tio->error = 0;
-	memset(&tio->info, 0, sizeof(tio->info));
-	init_kthread_work(&tio->work, map_tio_request);
+	init_tio(tio, rq, md);
 
 	table = dm_get_live_table(md, &srcu_idx);
 	if (!dm_table_mq_request_based(table)) {
@@ -1869,11 +1922,11 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
  * DM_MAPIO_REQUEUE : the original request needs to be requeued
  * < 0              : the request was completed due to failure
  */
-static int map_request(struct dm_target *ti, struct request *rq,
+static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		       struct mapped_device *md)
 {
 	int r;
-	struct dm_rq_target_io *tio = rq->special;
+	struct dm_target *ti = tio->ti;
 	struct request *clone = NULL;
 
 	if (tio->clone) {
@@ -1888,7 +1941,7 @@ static int map_request(struct dm_target *ti, struct request *rq,
 		}
 		if (IS_ERR(clone))
 			return DM_MAPIO_REQUEUE;
-		if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
+		if (setup_clone(clone, rq, tio, GFP_NOIO)) {
 			/* -ENOMEM */
 			ti->type->release_clone_rq(clone);
 			return DM_MAPIO_REQUEUE;
@@ -1929,13 +1982,16 @@ static void map_tio_request(struct kthread_work *work)
 	struct request *rq = tio->orig;
 	struct mapped_device *md = tio->md;
 
-	if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
+	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
 		dm_requeue_unmapped_original_request(md, rq);
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
 {
-	blk_start_request(orig);
+	if (!orig->q->mq_ops)
+		blk_start_request(orig);
+	else
+		blk_mq_start_request(orig);
 	atomic_inc(&md->pending[rq_data_dir(orig)]);
 
 	if (md->seq_rq_merge_deadline_usecs) {
@@ -2045,7 +2101,7 @@ static void dm_request_fn(struct request_queue *q)
 
 		dm_start_request(md, rq);
 
-		tio = rq->special;
+		tio = tio_from_request(rq);
 		/* Establish tio->ti before queuing work (map_tio_request) */
 		tio->ti = ti;
 		queue_kthread_work(&md->kworker, &tio->work);
@@ -2142,7 +2198,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 {
 	/*
 	 * Request-based dm devices cannot be stacked on top of bio-based dm
-	 * devices.  The type of this dm device has not been decided yet.
+	 * devices.  The type of this dm device may not have been decided yet.
 	 * The type is decided at the first table loading time.
 	 * To prevent problematic device stacking, clear the queue flag
 	 * for request stacking support until then.
@@ -2150,7 +2206,15 @@ static void dm_init_md_queue(struct mapped_device *md)
 	 * This queue is new, so no concurrency on the queue_flags.
 	 */
 	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+}
+
+static void dm_init_old_md_queue(struct mapped_device *md)
+{
+	dm_init_md_queue(md);
 
+	/*
+	 * Initialize aspects of queue that aren't relevant for blk-mq
+	 */
 	md->queue->queuedata = md;
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
@@ -2273,6 +2337,7 @@ static void unlock_fs(struct mapped_device *md);
 static void free_dev(struct mapped_device *md)
 {
 	int minor = MINOR(disk_devt(md->disk));
+	bool using_blk_mq = !!md->queue->mq_ops;
 
 	unlock_fs(md);
 	destroy_workqueue(md->wq);
@@ -2298,6 +2363,8 @@ static void free_dev(struct mapped_device *md)
 	del_gendisk(md->disk);
 	put_disk(md->disk);
 	blk_cleanup_queue(md->queue);
+	if (using_blk_mq)
+		blk_mq_free_tag_set(&md->tag_set);
 	bdput(md->bdev);
 	free_minor(minor);
 
@@ -2457,7 +2524,7 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
 	 * This must be done before setting the queue restrictions,
 	 * because request-based dm may be run just after the setting.
 	 */
-	if (dm_table_request_based(t) && !blk_queue_stopped(q))
+	if (dm_table_request_based(t))
 		stop_queue(q);
 
 	__bind_mempools(md, t);
@@ -2539,14 +2606,6 @@ unsigned dm_get_md_type(struct mapped_device *md)
 	return md->type;
 }
 
-static bool dm_md_type_request_based(struct mapped_device *md)
-{
-	unsigned table_type = dm_get_md_type(md);
-
-	return (table_type == DM_TYPE_REQUEST_BASED ||
-		table_type == DM_TYPE_MQ_REQUEST_BASED);
-}
-
 struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
 {
 	return md->immutable_target_type;
@@ -2563,6 +2622,14 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
 }
 EXPORT_SYMBOL_GPL(dm_get_queue_limits);
 
+static void init_rq_based_worker_thread(struct mapped_device *md)
+{
+	/* Initialize the request-based DM worker thread */
+	init_kthread_worker(&md->kworker);
+	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
+				       "kdmwork-%s", dm_device_name(md));
+}
+
 /*
  * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
  */
@@ -2571,29 +2638,131 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 	struct request_queue *q = NULL;
 
 	if (md->queue->elevator)
-		return 1;
+		return 0;
 
 	/* Fully initialize the queue */
 	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
 	if (!q)
-		return 0;
+		return -EINVAL;
 
 	/* disable dm_request_fn's merge heuristic by default */
 	md->seq_rq_merge_deadline_usecs = 0;
 
 	md->queue = q;
-	dm_init_md_queue(md);
+	dm_init_old_md_queue(md);
 	blk_queue_softirq_done(md->queue, dm_softirq_done);
 	blk_queue_prep_rq(md->queue, dm_prep_fn);
 
-	/* Also initialize the request-based DM worker thread */
-	init_kthread_worker(&md->kworker);
-	md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
-				       "kdmwork-%s", dm_device_name(md));
+	init_rq_based_worker_thread(md);
 
 	elv_register_queue(md->queue);
 
-	return 1;
+	return 0;
+}
+
+static int dm_mq_init_request(void *data, struct request *rq,
+			      unsigned int hctx_idx, unsigned int request_idx,
+			      unsigned int numa_node)
+{
+	struct mapped_device *md = data;
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+
+	/*
+	 * Must initialize md member of tio, otherwise it won't
+	 * be available in dm_mq_queue_rq.
+	 */
+	tio->md = md;
+
+	return 0;
+}
+
+static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+			  const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
+	struct mapped_device *md = tio->md;
+	int srcu_idx;
+	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+	struct dm_target *ti;
+	sector_t pos;
+
+	/* always use block 0 to find the target for flushes for now */
+	pos = 0;
+	if (!(rq->cmd_flags & REQ_FLUSH))
+		pos = blk_rq_pos(rq);
+
+	ti = dm_table_find_target(map, pos);
+	if (!dm_target_is_valid(ti)) {
+		dm_put_live_table(md, srcu_idx);
+		DMERR_LIMIT("request attempted access beyond the end of device");
+		/*
+		 * Must perform setup, that rq_completed() requires,
+		 * before returning BLK_MQ_RQ_QUEUE_ERROR
+		 */
+		dm_start_request(md, rq);
+		return BLK_MQ_RQ_QUEUE_ERROR;
+	}
+	dm_put_live_table(md, srcu_idx);
+
+	if (ti->type->busy && ti->type->busy(ti))
+		return BLK_MQ_RQ_QUEUE_BUSY;
+
+	dm_start_request(md, rq);
+
+	/* Init tio using md established in .init_request */
+	init_tio(tio, rq, md);
+
+	/* Establish tio->ti before queuing work (map_tio_request) */
+	tio->ti = ti;
+	queue_kthread_work(&md->kworker, &tio->work);
+
+	return BLK_MQ_RQ_QUEUE_OK;
+}
+
+static struct blk_mq_ops dm_mq_ops = {
+	.queue_rq = dm_mq_queue_rq,
+	.map_queue = blk_mq_map_queue,
+	.complete = dm_softirq_done,
+	.init_request = dm_mq_init_request,
+};
+
+static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
+{
+	struct request_queue *q;
+	int err;
+
+	memset(&md->tag_set, 0, sizeof(md->tag_set));
+	md->tag_set.ops = &dm_mq_ops;
+	md->tag_set.queue_depth = BLKDEV_MAX_RQ;
+	md->tag_set.numa_node = NUMA_NO_NODE;
+	md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	md->tag_set.nr_hw_queues = 1;
+	md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
+	md->tag_set.driver_data = md;
+
+	err = blk_mq_alloc_tag_set(&md->tag_set);
+	if (err)
+		return err;
+
+	q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
+	if (IS_ERR(q)) {
+		err = PTR_ERR(q);
+		goto out_tag_set;
+	}
+	md->queue = q;
+	dm_init_md_queue(md);
+
+	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
+	blk_mq_register_disk(md->disk);
+
+	init_rq_based_worker_thread(md);
+
+	return 0;
+
+out_tag_set:
+	blk_mq_free_tag_set(&md->tag_set);
+	return err;
 }
 
 /*
@@ -2601,15 +2770,29 @@ static int dm_init_request_based_queue(struct mapped_device *md)
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
-	if (dm_md_type_request_based(md)) {
-		if (!dm_init_request_based_queue(md)) {
+	int r;
+	unsigned md_type = dm_get_md_type(md);
+
+	switch (md_type) {
+	case DM_TYPE_REQUEST_BASED:
+		r = dm_init_request_based_queue(md);
+		if (r) {
 			DMWARN("Cannot initialize queue for request-based mapped device");
-			return -EINVAL;
+			return r;
 		}
-	} else {
-		/* bio-based specific initialization */
+		break;
+	case DM_TYPE_MQ_REQUEST_BASED:
+		r = dm_init_request_based_blk_mq_queue(md);
+		if (r) {
+			DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+			return r;
+		}
+		break;
+	case DM_TYPE_BIO_BASED:
+		dm_init_old_md_queue(md);
 		blk_queue_make_request(md->queue, dm_make_request);
 		blk_queue_merge_bvec(md->queue, dm_merge_bvec);
+		break;
 	}
 
 	return 0;
-- 
cgit v1.2.1


From 022333427a8aa4ccb318a9db90cea4e69ca1826b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 10 Mar 2015 23:49:26 -0400
Subject: dm: optimize dm_mq_queue_rq to _not_ use kthread if using pure blk-mq

dm_mq_queue_rq() is in atomic context so care must be taken to not
sleep -- as such GFP_ATOMIC is used for the md->bs bioset allocations
and dm-mpath's call to blk_get_request().  In the future the bioset
allocations will hopefully go away (by removing support for partial
completions of bios in a cloned request).

Also prepare for supporting DM blk-mq on top of old-style request_fn
device(s) if a new dm-mod 'use_blk_mq' parameter is set.  The kthread
will still be used to queue work if blk-mq is used on top of old-style
request_fn device(s).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-mpath.c |  2 +-
 drivers/md/dm.c       | 64 +++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 50 insertions(+), 16 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index c8f07e5a9a17..63953477a07c 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -428,7 +428,7 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 	} else {
 		/* blk-mq request-based interface */
 		*__clone = blk_get_request(bdev_get_queue(bdev),
-					   rq_data_dir(rq), GFP_KERNEL);
+					   rq_data_dir(rq), GFP_ATOMIC);
 		if (IS_ERR(*__clone))
 			/* ENOMEM, requeue */
 			return r;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3a66baac76ed..55cadb1a2735 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1077,9 +1077,10 @@ static void free_rq_clone(struct request *clone)
 
 	blk_rq_unprep_clone(clone);
 
-	if (clone->q && clone->q->mq_ops)
+	if (clone->q->mq_ops)
 		tio->ti->type->release_clone_rq(clone);
-	else
+	else if (!md->queue->mq_ops)
+		/* request_fn queue stacked on request_fn queue(s) */
 		free_clone_request(md, clone);
 
 	if (!md->queue->mq_ops)
@@ -1838,15 +1839,25 @@ static int setup_clone(struct request *clone, struct request *rq,
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 				struct dm_rq_target_io *tio, gfp_t gfp_mask)
 {
-	struct request *clone = alloc_clone_request(md, gfp_mask);
+	/*
+	 * Do not allocate a clone if tio->clone was already set
+	 * (see: dm_mq_queue_rq).
+	 */
+	bool alloc_clone = !tio->clone;
+	struct request *clone;
 
-	if (!clone)
-		return NULL;
+	if (alloc_clone) {
+		clone = alloc_clone_request(md, gfp_mask);
+		if (!clone)
+			return NULL;
+	} else
+		clone = tio->clone;
 
 	blk_rq_init(NULL, clone);
 	if (setup_clone(clone, rq, tio, gfp_mask)) {
 		/* -ENOMEM */
-		free_clone_request(md, clone);
+		if (alloc_clone)
+			free_clone_request(md, clone);
 		return NULL;
 	}
 
@@ -1864,7 +1875,8 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
 	tio->orig = rq;
 	tio->error = 0;
 	memset(&tio->info, 0, sizeof(tio->info));
-	init_kthread_work(&tio->work, map_tio_request);
+	if (md->kworker_task)
+		init_kthread_work(&tio->work, map_tio_request);
 }
 
 static struct dm_rq_target_io *prep_tio(struct request *rq,
@@ -1941,7 +1953,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		}
 		if (IS_ERR(clone))
 			return DM_MAPIO_REQUEUE;
-		if (setup_clone(clone, rq, tio, GFP_NOIO)) {
+		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
 			/* -ENOMEM */
 			ti->type->release_clone_rq(clone);
 			return DM_MAPIO_REQUEUE;
@@ -2408,7 +2420,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 	p->bs = NULL;
 
 out:
-	/* mempool bind completed, now no need any mempools in the table */
+	/* mempool bind completed, no longer need any mempools in the table */
 	dm_table_free_md_mempools(t);
 }
 
@@ -2713,9 +2725,24 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	/* Init tio using md established in .init_request */
 	init_tio(tio, rq, md);
 
-	/* Establish tio->ti before queuing work (map_tio_request) */
+	/*
+	 * Establish tio->ti before queuing work (map_tio_request)
+	 * or making direct call to map_request().
+	 */
 	tio->ti = ti;
-	queue_kthread_work(&md->kworker, &tio->work);
+
+	/* Clone the request if underlying devices aren't blk-mq */
+	if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
+		/* clone request is allocated at the end of the pdu */
+		tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
+		if (!clone_rq(rq, md, tio, GFP_ATOMIC))
+			return BLK_MQ_RQ_QUEUE_BUSY;
+		queue_kthread_work(&md->kworker, &tio->work);
+	} else {
+		/* Direct call is fine since .queue_rq allows allocations */
+		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
+			dm_requeue_unmapped_original_request(md, rq);
+	}
 
 	return BLK_MQ_RQ_QUEUE_OK;
 }
@@ -2729,6 +2756,7 @@ static struct blk_mq_ops dm_mq_ops = {
 
 static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
 {
+	unsigned md_type = dm_get_md_type(md);
 	struct request_queue *q;
 	int err;
 
@@ -2738,7 +2766,11 @@ static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
 	md->tag_set.numa_node = NUMA_NO_NODE;
 	md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
 	md->tag_set.nr_hw_queues = 1;
-	md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
+	if (md_type == DM_TYPE_REQUEST_BASED) {
+		/* make the memory for non-blk-mq clone part of the pdu */
+		md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
+	} else
+		md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
 	md->tag_set.driver_data = md;
 
 	err = blk_mq_alloc_tag_set(&md->tag_set);
@@ -2756,7 +2788,8 @@ static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
 	/* backfill 'mq' sysfs registration normally done in blk_register_queue */
 	blk_mq_register_disk(md->disk);
 
-	init_rq_based_worker_thread(md);
+	if (md_type == DM_TYPE_REQUEST_BASED)
+		init_rq_based_worker_thread(md);
 
 	return 0;
 
@@ -2876,7 +2909,7 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 	set_bit(DMF_FREEING, &md->flags);
 	spin_unlock(&_minor_lock);
 
-	if (dm_request_based(md))
+	if (dm_request_based(md) && md->kworker_task)
 		flush_kthread_worker(&md->kworker);
 
 	/*
@@ -3130,7 +3163,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 	 */
 	if (dm_request_based(md)) {
 		stop_queue(md->queue);
-		flush_kthread_worker(&md->kworker);
+		if (md->kworker_task)
+			flush_kthread_worker(&md->kworker);
 	}
 
 	flush_workqueue(md->wq);
-- 
cgit v1.2.1


From 17e149b8f73ba116e71e25930dd6f2eb3828792d Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 11 Mar 2015 15:01:09 -0400
Subject: dm: add 'use_blk_mq' module param and expose in per-device ro sysfs
 attr

Request-based DM's blk-mq support defaults to off; but a user can easily
change the default using the dm_mod.use_blk_mq module/boot option.

Also, you can check what mode a given request-based DM device is using
with: cat /sys/block/dm-X/dm/use_blk_mq

This change enabled further cleanup and reduced work (e.g. the
md->io_pool and md->rq_pool aren't created if using blk-mq).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/Kconfig    | 11 +++++++++++
 drivers/md/dm-sysfs.c |  9 +++++++++
 drivers/md/dm-table.c |  6 +++---
 drivers/md/dm.c       | 53 +++++++++++++++++++++++++++++++++++++++------------
 drivers/md/dm.h       |  5 ++++-
 5 files changed, 68 insertions(+), 16 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 63e05e32b462..109f9dcc9cab 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -196,6 +196,17 @@ config BLK_DEV_DM
 
 	  If unsure, say N.
 
+config DM_MQ_DEFAULT
+	bool "request-based DM: use blk-mq I/O path by default"
+	depends on BLK_DEV_DM
+	---help---
+	  This option enables the blk-mq based I/O path for request-based
+	  DM devices by default.  With the option the dm_mod.use_blk_mq
+	  module/boot option defaults to Y, without it to N, but it can
+	  still be overridden either way.
+
+	  If unsure say N.
+
 config DM_DEBUG
 	bool "Device mapper debugging support"
 	depends on BLK_DEV_DM
diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index f5bb3944f75e..7e818f5f1dc4 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -89,15 +89,24 @@ static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
 	return strlen(buf);
 }
 
+static ssize_t dm_attr_use_blk_mq_show(struct mapped_device *md, char *buf)
+{
+	sprintf(buf, "%d\n", dm_use_blk_mq(md));
+
+	return strlen(buf);
+}
+
 static DM_ATTR_RO(name);
 static DM_ATTR_RO(uuid);
 static DM_ATTR_RO(suspended);
+static DM_ATTR_RO(use_blk_mq);
 static DM_ATTR_RW(rq_based_seq_io_merge_deadline);
 
 static struct attribute *dm_attrs[] = {
 	&dm_attr_name.attr,
 	&dm_attr_uuid.attr,
 	&dm_attr_suspended.attr,
+	&dm_attr_use_blk_mq.attr,
 	&dm_attr_rq_based_seq_io_merge_deadline.attr,
 	NULL,
 };
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 66600cab9fa5..8d025f33de92 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -940,7 +940,7 @@ bool dm_table_mq_request_based(struct dm_table *t)
 	return dm_table_get_type(t) == DM_TYPE_MQ_REQUEST_BASED;
 }
 
-static int dm_table_alloc_md_mempools(struct dm_table *t)
+static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *md)
 {
 	unsigned type = dm_table_get_type(t);
 	unsigned per_bio_data_size = 0;
@@ -958,7 +958,7 @@ static int dm_table_alloc_md_mempools(struct dm_table *t)
 			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
 		}
 
-	t->mempools = dm_alloc_md_mempools(type, t->integrity_supported, per_bio_data_size);
+	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
 
@@ -1128,7 +1128,7 @@ int dm_table_complete(struct dm_table *t)
 		return r;
 	}
 
-	r = dm_table_alloc_md_mempools(t);
+	r = dm_table_alloc_md_mempools(t, t->md);
 	if (r)
 		DMERR("unable to allocate mempools");
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 55cadb1a2735..944cdb322708 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -228,8 +228,20 @@ struct mapped_device {
 
 	/* for blk-mq request-based DM support */
 	struct blk_mq_tag_set tag_set;
+	bool use_blk_mq;
 };
 
+#ifdef CONFIG_DM_MQ_DEFAULT
+static bool use_blk_mq = true;
+#else
+static bool use_blk_mq = false;
+#endif
+
+bool dm_use_blk_mq(struct mapped_device *md)
+{
+	return md->use_blk_mq;
+}
+
 /*
  * For mempools pre-allocation at the table loading time.
  */
@@ -2034,7 +2046,7 @@ ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
 {
 	unsigned deadline;
 
-	if (!dm_request_based(md))
+	if (!dm_request_based(md) || md->use_blk_mq)
 		return count;
 
 	if (kstrtouint(buf, 10, &deadline))
@@ -2222,6 +2234,7 @@ static void dm_init_md_queue(struct mapped_device *md)
 
 static void dm_init_old_md_queue(struct mapped_device *md)
 {
+	md->use_blk_mq = false;
 	dm_init_md_queue(md);
 
 	/*
@@ -2263,6 +2276,7 @@ static struct mapped_device *alloc_dev(int minor)
 	if (r < 0)
 		goto bad_io_barrier;
 
+	md->use_blk_mq = use_blk_mq;
 	md->type = DM_TYPE_NONE;
 	mutex_init(&md->suspend_lock);
 	mutex_init(&md->type_lock);
@@ -2349,7 +2363,6 @@ static void unlock_fs(struct mapped_device *md);
 static void free_dev(struct mapped_device *md)
 {
 	int minor = MINOR(disk_devt(md->disk));
-	bool using_blk_mq = !!md->queue->mq_ops;
 
 	unlock_fs(md);
 	destroy_workqueue(md->wq);
@@ -2375,7 +2388,7 @@ static void free_dev(struct mapped_device *md)
 	del_gendisk(md->disk);
 	put_disk(md->disk);
 	blk_cleanup_queue(md->queue);
-	if (using_blk_mq)
+	if (md->use_blk_mq)
 		blk_mq_free_tag_set(&md->tag_set);
 	bdput(md->bdev);
 	free_minor(minor);
@@ -2388,7 +2401,7 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
 	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-	if (md->io_pool && md->bs) {
+	if (md->bs) {
 		/* The md already has necessary mempools. */
 		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
 			/*
@@ -2798,13 +2811,21 @@ out_tag_set:
 	return err;
 }
 
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+	if (type == DM_TYPE_BIO_BASED)
+		return type;
+
+	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
+}
+
 /*
  * Setup the DM device's queue based on md's type
  */
 int dm_setup_md_queue(struct mapped_device *md)
 {
 	int r;
-	unsigned md_type = dm_get_md_type(md);
+	unsigned md_type = filter_md_type(dm_get_md_type(md), md);
 
 	switch (md_type) {
 	case DM_TYPE_REQUEST_BASED:
@@ -3509,16 +3530,19 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size)
 {
 	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	struct kmem_cache *cachep;
+	struct kmem_cache *cachep = NULL;
 	unsigned int pool_size = 0;
 	unsigned int front_pad;
 
 	if (!pools)
 		return NULL;
 
+	type = filter_md_type(type, md);
+
 	switch (type) {
 	case DM_TYPE_BIO_BASED:
 		cachep = _io_cache;
@@ -3526,13 +3550,13 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
 		break;
 	case DM_TYPE_REQUEST_BASED:
+		cachep = _rq_tio_cache;
 		pool_size = dm_get_reserved_rq_based_ios();
 		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
 		if (!pools->rq_pool)
 			goto out;
 		/* fall through to setup remaining rq-based pools */
 	case DM_TYPE_MQ_REQUEST_BASED:
-		cachep = _rq_tio_cache;
 		if (!pool_size)
 			pool_size = dm_get_reserved_rq_based_ios();
 		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
@@ -3540,12 +3564,14 @@ struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, u
 		WARN_ON(per_bio_data_size != 0);
 		break;
 	default:
-		goto out;
+		BUG();
 	}
 
-	pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-	if (!pools->io_pool)
-		goto out;
+	if (cachep) {
+		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+		if (!pools->io_pool)
+			goto out;
+	}
 
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3602,6 +3628,9 @@ MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
 
+module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+
 MODULE_DESCRIPTION(DM_NAME " driver");
 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
 MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 5522422cc6c4..6123c2bf9150 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -211,6 +211,8 @@ int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 void dm_internal_suspend(struct mapped_device *md);
 void dm_internal_resume(struct mapped_device *md);
 
+bool dm_use_blk_mq(struct mapped_device *md);
+
 int dm_io_init(void);
 void dm_io_exit(void);
 
@@ -220,7 +222,8 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
-- 
cgit v1.2.1


From 644bda6f346038bce7ad3ed48f7044c10dde6d47 Mon Sep 17 00:00:00 2001
From: Dan Ehrenberg <dehrenberg@chromium.org>
Date: Tue, 10 Feb 2015 15:20:51 -0800
Subject: dm table: fall back to getting device using name_to_dev_t()

If a device is used as the root filesystem, it can't be built
off of devices which are within the root filesystem (just like
command line arguments to root=).  For this reason, Linux has a
pseudo-filesystem for root= and MD initialization (based on the
function name_to_dev_t) which handles different ways of specifying
devices including PARTUUID and major:minor.

Switch to using name_to_dev_t() in dm_get_device().  Rather than
having DM assume that all things which are not major:minor are paths in
an already-mounted filesystem, change dm_get_device() to first attempt
to look up the device in the filesystem, and if not found it will fall
back to using name_to_dev_t().

In terms of backwards compatibility, there are some cases where
behavior will be different:
- If you have a file in the current working directory named 1:2 and
  you initialize DM there, then it will try to use that file rather
  than the disk with that major:minor pair as a backing device.
- Similarly for other bdev types which name_to_dev_t() knows how to
  interpret, the previous behavior was to repeatedly check for the
  existence of the file (e.g., while waiting for rootfs to come up)
  but the new behavior is to use the name_to_dev_t() interpretation.
  For example, if you have a file named /dev/ubiblock0_0 which is
  a symlink to /dev/sda3, but it is not yet present when DM starts
  to initialize, then the name_to_dev_t() interpretation will take
  precedence.

These incompatibilities would only show up in really strange setups
with bad practices so we shouldn't have to worry about them.

Signed-off-by: Dan Ehrenberg <dehrenberg@chromium.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 8d025f33de92..e0f618b43c25 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -19,6 +19,7 @@
 #include <linux/delay.h>
 #include <linux/atomic.h>
 #include <linux/blk-mq.h>
+#include <linux/mount.h>
 
 #define DM_MSG_PREFIX "table"
 
@@ -373,23 +374,18 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 	int r;
 	dev_t uninitialized_var(dev);
 	struct dm_dev_internal *dd;
-	unsigned int major, minor;
 	struct dm_table *t = ti->table;
-	char dummy;
+	struct block_device *bdev;
 
 	BUG_ON(!t);
 
-	if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
-		/* Extract the major/minor numbers */
-		dev = MKDEV(major, minor);
-		if (MAJOR(dev) != major || MINOR(dev) != minor)
-			return -EOVERFLOW;
+	/* convert the path to a device */
+	bdev = lookup_bdev(path);
+	if (IS_ERR(bdev)) {
+		dev = name_to_dev_t(path);
+		if (!dev)
+			return -ENODEV;
 	} else {
-		/* convert the path to a device */
-		struct block_device *bdev = lookup_bdev(path);
-
-		if (IS_ERR(bdev))
-			return PTR_ERR(bdev);
 		dev = bdev->bd_dev;
 		bdput(bdev);
 	}
-- 
cgit v1.2.1


From c32a512fdf62de260ee8298436558ea50b94dfcb Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Sun, 15 Mar 2015 13:09:10 -0400
Subject: dm log userspace transfer: match wait_for_completion_timeout return
 type

Return type of wait_for_completion_timeout() is unsigned long not int.
An appropriately named unsigned long is added and the assignment fixed.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-log-userspace-transfer.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
index 39ad9664d397..fdf8ec304f8d 100644
--- a/drivers/md/dm-log-userspace-transfer.c
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -172,6 +172,7 @@ int dm_consult_userspace(const char *uuid, uint64_t luid, int request_type,
 			 char *rdata, size_t *rdata_size)
 {
 	int r = 0;
+	unsigned long tmo;
 	size_t dummy = 0;
 	int overhead_size = sizeof(struct dm_ulog_request) + sizeof(struct cn_msg);
 	struct dm_ulog_request *tfr = prealloced_ulog_tfr;
@@ -236,11 +237,11 @@ resend:
 		goto out;
 	}
 
-	r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
+	tmo = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
 	spin_lock(&receiving_list_lock);
 	list_del_init(&(pkg.list));
 	spin_unlock(&receiving_list_lock);
-	if (!r) {
+	if (!tmo) {
 		DMWARN("[%s] Request timed out: [%u/%u] - retrying",
 		       (strlen(uuid) > 8) ?
 		       (uuid + (strlen(uuid) - 8)) : (uuid),
-- 
cgit v1.2.1


From 18cc980ac8cf8c727d1f7d581b4576ed64bd78a6 Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Wed, 18 Mar 2015 18:59:02 -0400
Subject: dm log userspace base: fix compile warning

This fixes up a compile warning [-Wunused-but-set-variable] - given the
comment in userspace_set_region_sync() the non-reporting of errors is
intentional so the return value can be dropped to make gcc happy.

Also, fix typo in comment.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-log-userspace-base.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
index 39fa00733431..058256d2eeea 100644
--- a/drivers/md/dm-log-userspace-base.c
+++ b/drivers/md/dm-log-userspace-base.c
@@ -741,7 +741,6 @@ static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
 static void userspace_set_region_sync(struct dm_dirty_log *log,
 				      region_t region, int in_sync)
 {
-	int r;
 	struct log_c *lc = log->context;
 	struct {
 		region_t r;
@@ -751,12 +750,12 @@ static void userspace_set_region_sync(struct dm_dirty_log *log,
 	pkg.r = region;
 	pkg.i = (int64_t)in_sync;
 
-	r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
-				 (char *)&pkg, sizeof(pkg), NULL, NULL);
+	(void) userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
+				    (char *)&pkg, sizeof(pkg), NULL, NULL);
 
 	/*
 	 * It would be nice to be able to report failures.
-	 * However, it is easy emough to detect and resolve.
+	 * However, it is easy enough to detect and resolve.
 	 */
 	return;
 }
-- 
cgit v1.2.1


From aca607ba242dde316794c0b607048fd5b688a520 Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire <hofrat@osadl.org>
Date: Tue, 17 Mar 2015 07:47:58 -0400
Subject: dm delay: use msecs_to_jiffies for time conversion

Converting milliseconds to jiffies by "val * HZ / 1000" is technically
OK but msecs_to_jiffies(val) is the cleaner solution and handles all
corner cases correctly.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-delay.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 42c3a27a14cc..57b6a1901c91 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -236,7 +236,7 @@ static int delay_bio(struct delay_c *dc, int delay, struct bio *bio)
 	delayed = dm_per_bio_data(bio, sizeof(struct dm_delay_info));
 
 	delayed->context = dc;
-	delayed->expires = expires = jiffies + (delay * HZ / 1000);
+	delayed->expires = expires = jiffies + msecs_to_jiffies(delay);
 
 	mutex_lock(&delayed_bios_lock);
 
-- 
cgit v1.2.1


From 65ff5b7ddf0541f2b6e5cc59c47bfbf6cbcd91b8 Mon Sep 17 00:00:00 2001
From: Sami Tolvanen <samitolvanen@google.com>
Date: Wed, 18 Mar 2015 15:52:14 +0000
Subject: dm verity: add error handling modes for corrupted blocks

Add device specific modes to dm-verity to specify how corrupted
blocks should be handled.  The following modes are defined:

  - DM_VERITY_MODE_EIO is the default behavior, where reading a
    corrupted block results in -EIO.

  - DM_VERITY_MODE_LOGGING only logs corrupted blocks, but does
    not block the read.

  - DM_VERITY_MODE_RESTART calls kernel_restart when a corrupted
    block is discovered.

In addition, each mode sends a uevent to notify userspace of
corruption and to allow further recovery actions.

The driver defaults to previous behavior (DM_VERITY_MODE_EIO)
and other modes can be enabled with an additional parameter to
the verity table.

Signed-off-by: Sami Tolvanen <samitolvanen@google.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-verity.c | 147 +++++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/dm.c        |   1 +
 2 files changed, 136 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 7a7bab8947ae..66616db33e6f 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -18,20 +18,39 @@
 
 #include <linux/module.h>
 #include <linux/device-mapper.h>
+#include <linux/reboot.h>
 #include <crypto/hash.h>
 
 #define DM_MSG_PREFIX			"verity"
 
+#define DM_VERITY_ENV_LENGTH		42
+#define DM_VERITY_ENV_VAR_NAME		"DM_VERITY_ERR_BLOCK_NR"
+
 #define DM_VERITY_IO_VEC_INLINE		16
 #define DM_VERITY_MEMPOOL_SIZE		4
 #define DM_VERITY_DEFAULT_PREFETCH_SIZE	262144
 
 #define DM_VERITY_MAX_LEVELS		63
+#define DM_VERITY_MAX_CORRUPTED_ERRS	100
+
+#define DM_VERITY_OPT_LOGGING		"ignore_corruption"
+#define DM_VERITY_OPT_RESTART		"restart_on_corruption"
 
 static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
 
 module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
 
+enum verity_mode {
+	DM_VERITY_MODE_EIO,
+	DM_VERITY_MODE_LOGGING,
+	DM_VERITY_MODE_RESTART
+};
+
+enum verity_block_type {
+	DM_VERITY_BLOCK_TYPE_DATA,
+	DM_VERITY_BLOCK_TYPE_METADATA
+};
+
 struct dm_verity {
 	struct dm_dev *data_dev;
 	struct dm_dev *hash_dev;
@@ -54,6 +73,8 @@ struct dm_verity {
 	unsigned digest_size;	/* digest size for the current hash algorithm */
 	unsigned shash_descsize;/* the size of temporary space for crypto */
 	int hash_failed;	/* set to 1 if hash of any block failed */
+	enum verity_mode mode;	/* mode for handling verification errors */
+	unsigned corrupted_errs;/* Number of errors for corrupted blocks */
 
 	mempool_t *vec_mempool;	/* mempool of bio vector */
 
@@ -174,6 +195,57 @@ static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
 		*offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
 }
 
+/*
+ * Handle verification errors.
+ */
+static int verity_handle_err(struct dm_verity *v, enum verity_block_type type,
+			     unsigned long long block)
+{
+	char verity_env[DM_VERITY_ENV_LENGTH];
+	char *envp[] = { verity_env, NULL };
+	const char *type_str = "";
+	struct mapped_device *md = dm_table_get_md(v->ti->table);
+
+	/* Corruption should be visible in device status in all modes */
+	v->hash_failed = 1;
+
+	if (v->corrupted_errs >= DM_VERITY_MAX_CORRUPTED_ERRS)
+		goto out;
+
+	v->corrupted_errs++;
+
+	switch (type) {
+	case DM_VERITY_BLOCK_TYPE_DATA:
+		type_str = "data";
+		break;
+	case DM_VERITY_BLOCK_TYPE_METADATA:
+		type_str = "metadata";
+		break;
+	default:
+		BUG();
+	}
+
+	DMERR("%s: %s block %llu is corrupted", v->data_dev->name, type_str,
+		block);
+
+	if (v->corrupted_errs == DM_VERITY_MAX_CORRUPTED_ERRS)
+		DMERR("%s: reached maximum errors", v->data_dev->name);
+
+	snprintf(verity_env, DM_VERITY_ENV_LENGTH, "%s=%d,%llu",
+		DM_VERITY_ENV_VAR_NAME, type, block);
+
+	kobject_uevent_env(&disk_to_dev(dm_disk(md))->kobj, KOBJ_CHANGE, envp);
+
+out:
+	if (v->mode == DM_VERITY_MODE_LOGGING)
+		return 0;
+
+	if (v->mode == DM_VERITY_MODE_RESTART)
+		kernel_restart("dm-verity device corrupted");
+
+	return 1;
+}
+
 /*
  * Verify hash of a metadata block pertaining to the specified data block
  * ("block" argument) at a specified level ("level" argument).
@@ -251,11 +323,11 @@ static int verity_verify_level(struct dm_verity_io *io, sector_t block,
 			goto release_ret_r;
 		}
 		if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
-			DMERR_LIMIT("metadata block %llu is corrupted",
-				(unsigned long long)hash_block);
-			v->hash_failed = 1;
-			r = -EIO;
-			goto release_ret_r;
+			if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_METADATA,
+					      hash_block)) {
+				r = -EIO;
+				goto release_ret_r;
+			}
 		} else
 			aux->hash_verified = 1;
 	}
@@ -367,10 +439,9 @@ test_block_hash:
 			return r;
 		}
 		if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
-			DMERR_LIMIT("data block %llu is corrupted",
-				(unsigned long long)(io->block + b));
-			v->hash_failed = 1;
-			return -EIO;
+			if (verity_handle_err(v, DM_VERITY_BLOCK_TYPE_DATA,
+					      io->block + b))
+				return -EIO;
 		}
 	}
 
@@ -546,6 +617,19 @@ static void verity_status(struct dm_target *ti, status_type_t type,
 		else
 			for (x = 0; x < v->salt_size; x++)
 				DMEMIT("%02x", v->salt[x]);
+		if (v->mode != DM_VERITY_MODE_EIO) {
+			DMEMIT(" 1 ");
+			switch (v->mode) {
+			case DM_VERITY_MODE_LOGGING:
+				DMEMIT(DM_VERITY_OPT_LOGGING);
+				break;
+			case DM_VERITY_MODE_RESTART:
+				DMEMIT(DM_VERITY_OPT_RESTART);
+				break;
+			default:
+				BUG();
+			}
+		}
 		break;
 	}
 }
@@ -647,13 +731,19 @@ static void verity_dtr(struct dm_target *ti)
 static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
 	struct dm_verity *v;
-	unsigned num;
+	struct dm_arg_set as;
+	const char *opt_string;
+	unsigned int num, opt_params;
 	unsigned long long num_ll;
 	int r;
 	int i;
 	sector_t hash_position;
 	char dummy;
 
+	static struct dm_arg _args[] = {
+		{0, 1, "Invalid number of feature args"},
+	};
+
 	v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
 	if (!v) {
 		ti->error = "Cannot allocate verity structure";
@@ -668,8 +758,8 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		goto bad;
 	}
 
-	if (argc != 10) {
-		ti->error = "Invalid argument count: exactly 10 arguments required";
+	if (argc < 10) {
+		ti->error = "Not enough arguments";
 		r = -EINVAL;
 		goto bad;
 	}
@@ -790,6 +880,39 @@ static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 		}
 	}
 
+	argv += 10;
+	argc -= 10;
+
+	/* Optional parameters */
+	if (argc) {
+		as.argc = argc;
+		as.argv = argv;
+
+		r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
+		if (r)
+			goto bad;
+
+		while (opt_params) {
+			opt_params--;
+			opt_string = dm_shift_arg(&as);
+			if (!opt_string) {
+				ti->error = "Not enough feature arguments";
+				r = -EINVAL;
+				goto bad;
+			}
+
+			if (!strcasecmp(opt_string, DM_VERITY_OPT_LOGGING))
+				v->mode = DM_VERITY_MODE_LOGGING;
+			else if (!strcasecmp(opt_string, DM_VERITY_OPT_RESTART))
+				v->mode = DM_VERITY_MODE_RESTART;
+			else {
+				ti->error = "Invalid feature arguments";
+				r = -EINVAL;
+				goto bad;
+			}
+		}
+	}
+
 	v->hash_per_block_bits =
 		__fls((1 << v->hash_dev_block_bits) / v->digest_size);
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 944cdb322708..f8c7ca3e8947 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -3483,6 +3483,7 @@ struct gendisk *dm_disk(struct mapped_device *md)
 {
 	return md->disk;
 }
+EXPORT_SYMBOL_GPL(dm_disk);
 
 struct kobject *dm_kobject(struct mapped_device *md)
 {
-- 
cgit v1.2.1


From 7f61f5a022101e0c38c3cff2ef9ace9c9c86dbfb Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Mon, 30 Mar 2015 10:43:18 -0700
Subject: dm table: use bool function return values of true/false not 1/0

Use the normal return values for bool functions.

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e0f618b43c25..d9b00b8565c6 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1336,14 +1336,14 @@ static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
 			continue;
 
 		if (ti->flush_supported)
-			return 1;
+			return true;
 
 		if (ti->type->iterate_devices &&
 		    ti->type->iterate_devices(ti, device_flush_capable, &flush))
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
 static bool dm_table_discard_zeroes_data(struct dm_table *t)
@@ -1356,10 +1356,10 @@ static bool dm_table_discard_zeroes_data(struct dm_table *t)
 		ti = dm_table_get_target(t, i++);
 
 		if (ti->discard_zeroes_data_unsupported)
-			return 0;
+			return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
@@ -1405,10 +1405,10 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
 
 		if (!ti->type->iterate_devices ||
 		    !ti->type->iterate_devices(ti, func, NULL))
-			return 0;
+			return false;
 	}
 
-	return 1;
+	return true;
 }
 
 static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
@@ -1465,14 +1465,14 @@ static bool dm_table_supports_discards(struct dm_table *t)
 			continue;
 
 		if (ti->discards_supported)
-			return 1;
+			return true;
 
 		if (ti->type->iterate_devices &&
 		    ti->type->iterate_devices(ti, device_discard_capable, NULL))
-			return 1;
+			return true;
 	}
 
-	return 0;
+	return false;
 }
 
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
-- 
cgit v1.2.1


From 0e9cebe724597a76ab1b0ebc0a21e16f7db11b47 Mon Sep 17 00:00:00 2001
From: Josef Bacik <jbacik@fb.com>
Date: Fri, 20 Mar 2015 10:50:37 -0400
Subject: dm: add log writes target

Introduce a new target that is meant for file system developers to test file
system integrity at particular points in the life of a file system.  We capture
all write requests and associated data and log them to a separate device
for later replay.  There is a userspace utility to do this replay.  The
idea behind this is to give file system developers a tool to verify that
the file system is always consistent.

Signed-off-by: Josef Bacik <jbacik@fb.com>
Reviewed-by: Zach Brown <zab@zabbo.net>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/Kconfig         |  16 +
 drivers/md/Makefile        |   1 +
 drivers/md/dm-log-writes.c | 825 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 842 insertions(+)
 create mode 100644 drivers/md/dm-log-writes.c

(limited to 'drivers/md')

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 109f9dcc9cab..6ddc983417d5 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -443,4 +443,20 @@ config DM_SWITCH
 
 	  If unsure, say N.
 
+config DM_LOG_WRITES
+	tristate "Log writes target support"
+	depends on BLK_DEV_DM
+	---help---
+	  This device-mapper target takes two devices, one device to use
+	  normally, one to log all write operations done to the first device.
+	  This is for use by file system developers wishing to verify that
+	  their fs is writing a consistent file system at all times by allowing
+	  them to replay the log in a variety of ways and to check the
+	  contents.
+
+	  To compile this code as a module, choose M here: the module will
+	  be called dm-log-writes.
+
+	  If unsure, say N.
+
 endif # MD
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index a2da532b1c2b..1863feaa5846 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
 obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
 obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
+obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
new file mode 100644
index 000000000000..93e08446a87d
--- /dev/null
+++ b/drivers/md/dm-log-writes.c
@@ -0,0 +1,825 @@
+/*
+ * Copyright (C) 2014 Facebook. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/device-mapper.h>
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#define DM_MSG_PREFIX "log-writes"
+
+/*
+ * This target will sequentially log all writes to the target device onto the
+ * log device.  This is helpful for replaying writes to check for fs consistency
+ * at all times.  This target provides a mechanism to mark specific events to
+ * check data at a later time.  So for example you would:
+ *
+ * write data
+ * fsync
+ * dmsetup message /dev/whatever mark mymark
+ * unmount /mnt/test
+ *
+ * Then replay the log up to mymark and check the contents of the replay to
+ * verify it matches what was written.
+ *
+ * We log writes only after they have been flushed, this makes the log describe
+ * close to the order in which the data hits the actual disk, not its cache.  So
+ * for example the following sequence (W means write, C means complete)
+ *
+ * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
+ *
+ * Would result in the log looking like this:
+ *
+ * c,a,flush,fuad,b,<other writes>,<next flush>
+ *
+ * This is meant to help expose problems where file systems do not properly wait
+ * on data being written before invoking a FLUSH.  FUA bypasses cache so once it
+ * completes it is added to the log as it should be on disk.
+ *
+ * We treat DISCARDs as if they don't bypass cache so that they are logged in
+ * order of completion along with the normal writes.  If we didn't do it this
+ * way we would process all the discards first and then write all the data, when
+ * in fact we want to do the data and the discard in the order that they
+ * completed.
+ */
+#define LOG_FLUSH_FLAG (1 << 0)
+#define LOG_FUA_FLAG (1 << 1)
+#define LOG_DISCARD_FLAG (1 << 2)
+#define LOG_MARK_FLAG (1 << 3)
+
+#define WRITE_LOG_VERSION 1
+#define WRITE_LOG_MAGIC 0x6a736677736872
+
+/*
+ * The disk format for this is braindead simple.
+ *
+ * At byte 0 we have our super, followed by the following sequence for
+ * nr_entries:
+ *
+ * [   1 sector    ][  entry->nr_sectors ]
+ * [log_write_entry][    data written    ]
+ *
+ * The log_write_entry takes up a full sector so we can have arbitrary length
+ * marks and it leaves us room for extra content in the future.
+ */
+
+/*
+ * Basic info about the log for userspace.
+ */
+struct log_write_super {
+	__le64 magic;
+	__le64 version;
+	__le64 nr_entries;
+	__le32 sectorsize;
+};
+
+/*
+ * sector - the sector we wrote.
+ * nr_sectors - the number of sectors we wrote.
+ * flags - flags for this log entry.
+ * data_len - the size of the data in this log entry, this is for private log
+ * entry stuff, the MARK data provided by userspace for example.
+ */
+struct log_write_entry {
+	__le64 sector;
+	__le64 nr_sectors;
+	__le64 flags;
+	__le64 data_len;
+};
+
+struct log_writes_c {
+	struct dm_dev *dev;
+	struct dm_dev *logdev;
+	u64 logged_entries;
+	u32 sectorsize;
+	atomic_t io_blocks;
+	atomic_t pending_blocks;
+	sector_t next_sector;
+	sector_t end_sector;
+	bool logging_enabled;
+	bool device_supports_discard;
+	spinlock_t blocks_lock;
+	struct list_head unflushed_blocks;
+	struct list_head logging_blocks;
+	wait_queue_head_t wait;
+	struct task_struct *log_kthread;
+};
+
+struct pending_block {
+	int vec_cnt;
+	u64 flags;
+	sector_t sector;
+	sector_t nr_sectors;
+	char *data;
+	u32 datalen;
+	struct list_head list;
+	struct bio_vec vecs[0];
+};
+
+struct per_bio_data {
+	struct pending_block *block;
+};
+
+static void put_pending_block(struct log_writes_c *lc)
+{
+	if (atomic_dec_and_test(&lc->pending_blocks)) {
+		smp_mb__after_atomic();
+		if (waitqueue_active(&lc->wait))
+			wake_up(&lc->wait);
+	}
+}
+
+static void put_io_block(struct log_writes_c *lc)
+{
+	if (atomic_dec_and_test(&lc->io_blocks)) {
+		smp_mb__after_atomic();
+		if (waitqueue_active(&lc->wait))
+			wake_up(&lc->wait);
+	}
+}
+
+static void log_end_io(struct bio *bio, int err)
+{
+	struct log_writes_c *lc = bio->bi_private;
+	struct bio_vec *bvec;
+	int i;
+
+	if (err) {
+		unsigned long flags;
+
+		DMERR("Error writing log block, error=%d", err);
+		spin_lock_irqsave(&lc->blocks_lock, flags);
+		lc->logging_enabled = false;
+		spin_unlock_irqrestore(&lc->blocks_lock, flags);
+	}
+
+	bio_for_each_segment_all(bvec, bio, i)
+		__free_page(bvec->bv_page);
+
+	put_io_block(lc);
+	bio_put(bio);
+}
+
+/*
+ * Meant to be called if there is an error, it will free all the pages
+ * associated with the block.
+ */
+static void free_pending_block(struct log_writes_c *lc,
+			       struct pending_block *block)
+{
+	int i;
+
+	for (i = 0; i < block->vec_cnt; i++) {
+		if (block->vecs[i].bv_page)
+			__free_page(block->vecs[i].bv_page);
+	}
+	kfree(block->data);
+	kfree(block);
+	put_pending_block(lc);
+}
+
+static int write_metadata(struct log_writes_c *lc, void *entry,
+			  size_t entrylen, void *data, size_t datalen,
+			  sector_t sector)
+{
+	struct bio *bio;
+	struct page *page;
+	void *ptr;
+	size_t ret;
+
+	bio = bio_alloc(GFP_KERNEL, 1);
+	if (!bio) {
+		DMERR("Couldn't alloc log bio");
+		goto error;
+	}
+	bio->bi_iter.bi_size = 0;
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_bdev = lc->logdev->bdev;
+	bio->bi_end_io = log_end_io;
+	bio->bi_private = lc;
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		DMERR("Couldn't alloc log page");
+		bio_put(bio);
+		goto error;
+	}
+
+	ptr = kmap_atomic(page);
+	memcpy(ptr, entry, entrylen);
+	if (datalen)
+		memcpy(ptr + entrylen, data, datalen);
+	memset(ptr + entrylen + datalen, 0,
+	       lc->sectorsize - entrylen - datalen);
+	kunmap_atomic(ptr);
+
+	ret = bio_add_page(bio, page, lc->sectorsize, 0);
+	if (ret != lc->sectorsize) {
+		DMERR("Couldn't add page to the log block");
+		goto error_bio;
+	}
+	submit_bio(WRITE, bio);
+	return 0;
+error_bio:
+	bio_put(bio);
+	__free_page(page);
+error:
+	put_io_block(lc);
+	return -1;
+}
+
+static int log_one_block(struct log_writes_c *lc,
+			 struct pending_block *block, sector_t sector)
+{
+	struct bio *bio;
+	struct log_write_entry entry;
+	size_t ret;
+	int i;
+
+	entry.sector = cpu_to_le64(block->sector);
+	entry.nr_sectors = cpu_to_le64(block->nr_sectors);
+	entry.flags = cpu_to_le64(block->flags);
+	entry.data_len = cpu_to_le64(block->datalen);
+	if (write_metadata(lc, &entry, sizeof(entry), block->data,
+			   block->datalen, sector)) {
+		free_pending_block(lc, block);
+		return -1;
+	}
+
+	if (!block->vec_cnt)
+		goto out;
+	sector++;
+
+	bio = bio_alloc(GFP_KERNEL, block->vec_cnt);
+	if (!bio) {
+		DMERR("Couldn't alloc log bio");
+		goto error;
+	}
+	atomic_inc(&lc->io_blocks);
+	bio->bi_iter.bi_size = 0;
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_bdev = lc->logdev->bdev;
+	bio->bi_end_io = log_end_io;
+	bio->bi_private = lc;
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+	for (i = 0; i < block->vec_cnt; i++) {
+		/*
+		 * The page offset is always 0 because we allocate a new page
+		 * for every bvec in the original bio for simplicity sake.
+		 */
+		ret = bio_add_page(bio, block->vecs[i].bv_page,
+				   block->vecs[i].bv_len, 0);
+		if (ret != block->vecs[i].bv_len) {
+			atomic_inc(&lc->io_blocks);
+			submit_bio(WRITE, bio);
+			bio = bio_alloc(GFP_KERNEL, block->vec_cnt - i);
+			if (!bio) {
+				DMERR("Couldn't alloc log bio");
+				goto error;
+			}
+			bio->bi_iter.bi_size = 0;
+			bio->bi_iter.bi_sector = sector;
+			bio->bi_bdev = lc->logdev->bdev;
+			bio->bi_end_io = log_end_io;
+			bio->bi_private = lc;
+			set_bit(BIO_UPTODATE, &bio->bi_flags);
+
+			ret = bio_add_page(bio, block->vecs[i].bv_page,
+					   block->vecs[i].bv_len, 0);
+			if (ret != block->vecs[i].bv_len) {
+				DMERR("Couldn't add page on new bio?");
+				bio_put(bio);
+				goto error;
+			}
+		}
+		sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
+	}
+	submit_bio(WRITE, bio);
+out:
+	kfree(block->data);
+	kfree(block);
+	put_pending_block(lc);
+	return 0;
+error:
+	free_pending_block(lc, block);
+	put_io_block(lc);
+	return -1;
+}
+
+static int log_super(struct log_writes_c *lc)
+{
+	struct log_write_super super;
+
+	super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
+	super.version = cpu_to_le64(WRITE_LOG_VERSION);
+	super.nr_entries = cpu_to_le64(lc->logged_entries);
+	super.sectorsize = cpu_to_le32(lc->sectorsize);
+
+	if (write_metadata(lc, &super, sizeof(super), NULL, 0, 0)) {
+		DMERR("Couldn't write super");
+		return -1;
+	}
+
+	return 0;
+}
+
+static inline sector_t logdev_last_sector(struct log_writes_c *lc)
+{
+	return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
+}
+
+static int log_writes_kthread(void *arg)
+{
+	struct log_writes_c *lc = (struct log_writes_c *)arg;
+	sector_t sector = 0;
+
+	while (!kthread_should_stop()) {
+		bool super = false;
+		bool logging_enabled;
+		struct pending_block *block = NULL;
+		int ret;
+
+		spin_lock_irq(&lc->blocks_lock);
+		if (!list_empty(&lc->logging_blocks)) {
+			block = list_first_entry(&lc->logging_blocks,
+						 struct pending_block, list);
+			list_del_init(&block->list);
+			if (!lc->logging_enabled)
+				goto next;
+
+			sector = lc->next_sector;
+			if (block->flags & LOG_DISCARD_FLAG)
+				lc->next_sector++;
+			else
+				lc->next_sector += block->nr_sectors + 1;
+
+			/*
+			 * Apparently the size of the device may not be known
+			 * right away, so handle this properly.
+			 */
+			if (!lc->end_sector)
+				lc->end_sector = logdev_last_sector(lc);
+			if (lc->end_sector &&
+			    lc->next_sector >= lc->end_sector) {
+				DMERR("Ran out of space on the logdev");
+				lc->logging_enabled = false;
+				goto next;
+			}
+			lc->logged_entries++;
+			atomic_inc(&lc->io_blocks);
+
+			super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
+			if (super)
+				atomic_inc(&lc->io_blocks);
+		}
+next:
+		logging_enabled = lc->logging_enabled;
+		spin_unlock_irq(&lc->blocks_lock);
+		if (block) {
+			if (logging_enabled) {
+				ret = log_one_block(lc, block, sector);
+				if (!ret && super)
+					ret = log_super(lc);
+				if (ret) {
+					spin_lock_irq(&lc->blocks_lock);
+					lc->logging_enabled = false;
+					spin_unlock_irq(&lc->blocks_lock);
+				}
+			} else
+				free_pending_block(lc, block);
+			continue;
+		}
+
+		if (!try_to_freeze()) {
+			set_current_state(TASK_INTERRUPTIBLE);
+			if (!kthread_should_stop() &&
+			    !atomic_read(&lc->pending_blocks))
+				schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+	}
+	return 0;
+}
+
+/*
+ * Construct a log-writes mapping: log-writes <dev_path> <log_dev_path>
+ * Returns 0 on success; -ENOMEM or -EINVAL on failure with ti->error set.
+ */
+static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct log_writes_c *lc;
+	struct dm_arg_set as = { .argc = argc, .argv = argv };
+	const char *devname, *logdevname;
+
+	if (argc < 2) {
+		ti->error = "Invalid argument count";
+		return -EINVAL;
+	}
+
+	lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
+	if (!lc) {
+		ti->error = "Cannot allocate context";
+		return -ENOMEM;
+	}
+	spin_lock_init(&lc->blocks_lock);
+	INIT_LIST_HEAD(&lc->unflushed_blocks);
+	INIT_LIST_HEAD(&lc->logging_blocks);
+	init_waitqueue_head(&lc->wait);
+	lc->sectorsize = 1 << SECTOR_SHIFT;
+	atomic_set(&lc->io_blocks, 0);
+	atomic_set(&lc->pending_blocks, 0);
+
+	devname = dm_shift_arg(&as);
+	if (dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev)) {
+		ti->error = "Device lookup failed";
+		goto bad;
+	}
+
+	logdevname = dm_shift_arg(&as);
+	if (dm_get_device(ti, logdevname, dm_table_get_mode(ti->table), &lc->logdev)) {
+		ti->error = "Log device lookup failed";
+		dm_put_device(ti, lc->dev);
+		goto bad;
+	}
+
+	/*
+	 * kthread_run() returns an ERR_PTR() on failure, never NULL.
+	 */
+	lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
+	if (IS_ERR(lc->log_kthread)) {
+		ti->error = "Couldn't alloc kthread";
+		dm_put_device(ti, lc->dev);
+		dm_put_device(ti, lc->logdev);
+		goto bad;
+	}
+
+	/* We put the super at sector 0, start logging at sector 1 */
+	lc->next_sector = 1;
+	lc->logging_enabled = true;
+	lc->end_sector = logdev_last_sector(lc);
+	lc->device_supports_discard = true;
+
+	ti->num_flush_bios = 1;
+	ti->flush_supported = true;
+	ti->num_discard_bios = 1;
+	ti->discards_supported = true;
+	ti->per_bio_data_size = sizeof(struct per_bio_data);
+	ti->private = lc;
+	return 0;
+
+bad:
+	kfree(lc);
+	return -EINVAL;
+}
+
+static int log_mark(struct log_writes_c *lc, char *data)
+{
+	struct pending_block *block;
+	size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);
+
+	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
+	if (!block) {
+		DMERR("Error allocating pending block");
+		return -ENOMEM;
+	}
+
+	block->data = kstrndup(data, maxsize, GFP_KERNEL);
+	if (!block->data) {
+		DMERR("Error copying mark data");
+		kfree(block);
+		return -ENOMEM;
+	}
+	atomic_inc(&lc->pending_blocks);
+	block->datalen = strlen(block->data);
+	block->flags |= LOG_MARK_FLAG;
+	spin_lock_irq(&lc->blocks_lock);
+	list_add_tail(&block->list, &lc->logging_blocks);
+	spin_unlock_irq(&lc->blocks_lock);
+	wake_up_process(lc->log_kthread);
+	return 0;
+}
+
+static void log_writes_dtr(struct dm_target *ti)
+{
+	struct log_writes_c *lc = ti->private;
+
+	spin_lock_irq(&lc->blocks_lock);
+	list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
+	spin_unlock_irq(&lc->blocks_lock);
+
+	/*
+	 * This is just nice to have since it'll update the super to include the
+	 * unflushed blocks, if it fails we don't really care.
+	 */
+	log_mark(lc, "dm-log-writes-end");
+	wake_up_process(lc->log_kthread);
+	wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
+		   !atomic_read(&lc->pending_blocks));
+	kthread_stop(lc->log_kthread);
+
+	WARN_ON(!list_empty(&lc->logging_blocks));
+	WARN_ON(!list_empty(&lc->unflushed_blocks));
+	dm_put_device(ti, lc->dev);
+	dm_put_device(ti, lc->logdev);
+	kfree(lc);
+}
+
+static void normal_map_bio(struct dm_target *ti, struct bio *bio)
+{
+	struct log_writes_c *lc = ti->private;
+
+	bio->bi_bdev = lc->dev->bdev;
+}
+
+static int log_writes_map(struct dm_target *ti, struct bio *bio)
+{
+	struct log_writes_c *lc = ti->private;
+	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+	struct pending_block *block;
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	size_t alloc_size;
+	int i = 0;
+	bool flush_bio = (bio->bi_rw & REQ_FLUSH);
+	bool fua_bio = (bio->bi_rw & REQ_FUA);
+	bool discard_bio = (bio->bi_rw & REQ_DISCARD);
+
+	pb->block = NULL;
+
+	/* Don't bother doing anything if logging has been disabled */
+	if (!lc->logging_enabled)
+		goto map_bio;
+
+	/*
+	 * Map reads as normal.
+	 */
+	if (bio_data_dir(bio) == READ)
+		goto map_bio;
+
+	/* No sectors and not a flush?  Don't care */
+	if (!bio_sectors(bio) && !flush_bio)
+		goto map_bio;
+
+	/*
+	 * Discards will have bi_size set but there's no actual data, so just
+	 * allocate the size of the pending block.
+	 */
+	if (discard_bio)
+		alloc_size = sizeof(struct pending_block);
+	else
+		alloc_size = sizeof(struct pending_block) + sizeof(struct bio_vec) * bio_segments(bio);
+
+	block = kzalloc(alloc_size, GFP_NOIO);
+	if (!block) {
+		DMERR("Error allocating pending block");
+		spin_lock_irq(&lc->blocks_lock);
+		lc->logging_enabled = false;
+		spin_unlock_irq(&lc->blocks_lock);
+		return -ENOMEM;
+	}
+	INIT_LIST_HEAD(&block->list);
+	pb->block = block;
+	atomic_inc(&lc->pending_blocks);
+
+	if (flush_bio)
+		block->flags |= LOG_FLUSH_FLAG;
+	if (fua_bio)
+		block->flags |= LOG_FUA_FLAG;
+	if (discard_bio)
+		block->flags |= LOG_DISCARD_FLAG;
+
+	block->sector = bio->bi_iter.bi_sector;
+	block->nr_sectors = bio_sectors(bio);
+
+	/* We don't need the data, just submit */
+	if (discard_bio) {
+		WARN_ON(flush_bio || fua_bio);
+		if (lc->device_supports_discard)
+			goto map_bio;
+		bio_endio(bio, 0);
+		return DM_MAPIO_SUBMITTED;
+	}
+
+	/* Flush bio, splice the unflushed blocks onto this list and submit */
+	if (flush_bio && !bio_sectors(bio)) {
+		spin_lock_irq(&lc->blocks_lock);
+		list_splice_init(&lc->unflushed_blocks, &block->list);
+		spin_unlock_irq(&lc->blocks_lock);
+		goto map_bio;
+	}
+
+	/*
+	 * We will write this bio somewhere else way later so we need to copy
+	 * the actual contents into new pages so we know the data will always be
+	 * there.
+	 *
+	 * We do this because this could be a bio from O_DIRECT in which case we
+	 * can't just hold onto the page until some later point, we have to
+	 * manually copy the contents.
+	 */
+	bio_for_each_segment(bv, bio, iter) {
+		struct page *page;
+		void *src, *dst;
+
+		page = alloc_page(GFP_NOIO);
+		if (!page) {
+			DMERR("Error allocing page");
+			free_pending_block(lc, block);
+			spin_lock_irq(&lc->blocks_lock);
+			lc->logging_enabled = false;
+			spin_unlock_irq(&lc->blocks_lock);
+			return -ENOMEM;
+		}
+
+		src = kmap_atomic(bv.bv_page);
+		dst = kmap_atomic(page);
+		memcpy(dst, src + bv.bv_offset, bv.bv_len);
+		kunmap_atomic(dst);
+		kunmap_atomic(src);
+		block->vecs[i].bv_page = page;
+		block->vecs[i].bv_len = bv.bv_len;
+		block->vec_cnt++;
+		i++;
+	}
+
+	/* Had a flush with data in it, weird */
+	if (flush_bio) {
+		spin_lock_irq(&lc->blocks_lock);
+		list_splice_init(&lc->unflushed_blocks, &block->list);
+		spin_unlock_irq(&lc->blocks_lock);
+	}
+map_bio:
+	normal_map_bio(ti, bio);
+	return DM_MAPIO_REMAPPED;
+}
+
+static int normal_end_io(struct dm_target *ti, struct bio *bio, int error)
+{
+	struct log_writes_c *lc = ti->private;
+	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
+
+	if (bio_data_dir(bio) == WRITE && pb->block) {
+		struct pending_block *block = pb->block;
+		unsigned long flags;
+
+		spin_lock_irqsave(&lc->blocks_lock, flags);
+		if (block->flags & LOG_FLUSH_FLAG) {
+			list_splice_tail_init(&block->list, &lc->logging_blocks);
+			list_add_tail(&block->list, &lc->logging_blocks);
+			wake_up_process(lc->log_kthread);
+		} else if (block->flags & LOG_FUA_FLAG) {
+			list_add_tail(&block->list, &lc->logging_blocks);
+			wake_up_process(lc->log_kthread);
+		} else
+			list_add_tail(&block->list, &lc->unflushed_blocks);
+		spin_unlock_irqrestore(&lc->blocks_lock, flags);
+	}
+
+	return error;
+}
+
+/*
+ * INFO format: <logged entries> <highest allocated sector>
+ */
+static void log_writes_status(struct dm_target *ti, status_type_t type,
+			      unsigned status_flags, char *result,
+			      unsigned maxlen)
+{
+	unsigned sz = 0;
+	struct log_writes_c *lc = ti->private;
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		DMEMIT("%llu %llu", lc->logged_entries,
+		       (unsigned long long)lc->next_sector - 1);
+		if (!lc->logging_enabled)
+			DMEMIT(" logging_disabled");
+		break;
+
+	case STATUSTYPE_TABLE:
+		DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
+		break;
+	}
+}
+
+static int log_writes_ioctl(struct dm_target *ti, unsigned int cmd,
+			    unsigned long arg)
+{
+	struct log_writes_c *lc = ti->private;
+	struct dm_dev *dev = lc->dev;
+	int r = 0;
+
+	/*
+	 * Only pass ioctls through if the device sizes match exactly.
+	 */
+	if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
+		r = scsi_verify_blk_ioctl(NULL, cmd);
+
+	return r ? : __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
+}
+
+static int log_writes_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+			    struct bio_vec *biovec, int max_size)
+{
+	struct log_writes_c *lc = ti->private;
+	struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	bvm->bi_bdev = lc->dev->bdev;
+	bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+static int log_writes_iterate_devices(struct dm_target *ti,
+				      iterate_devices_callout_fn fn,
+				      void *data)
+{
+	struct log_writes_c *lc = ti->private;
+
+	return fn(ti, lc->dev, 0, ti->len, data);
+}
+
+/*
+ * Messages supported:
+ *   mark <mark data> - specify the marked data.
+ */
+static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv)
+{
+	int r = -EINVAL;
+	struct log_writes_c *lc = ti->private;
+
+	if (argc != 2) {
+		DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
+		return r;
+	}
+
+	if (!strcasecmp(argv[0], "mark"))
+		r = log_mark(lc, argv[1]);
+	else
+		DMWARN("Unrecognised log writes target message received: %s", argv[0]);
+
+	return r;
+}
+
+static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct log_writes_c *lc = ti->private;
+	struct request_queue *q = bdev_get_queue(lc->dev->bdev);
+
+	if (!q || !blk_queue_discard(q)) {
+		lc->device_supports_discard = false;
+		limits->discard_granularity = 1 << SECTOR_SHIFT;
+		limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
+	}
+}
+
+static struct target_type log_writes_target = {
+	.name   = "log-writes",
+	.version = {1, 0, 0},
+	.module = THIS_MODULE,
+	.ctr    = log_writes_ctr,
+	.dtr    = log_writes_dtr,
+	.map    = log_writes_map,
+	.end_io = normal_end_io,
+	.status = log_writes_status,
+	.ioctl	= log_writes_ioctl,
+	.merge	= log_writes_merge,
+	.message = log_writes_message,
+	.iterate_devices = log_writes_iterate_devices,
+	.io_hints = log_writes_io_hints,
+};
+
+static int __init dm_log_writes_init(void)
+{
+	int r = dm_register_target(&log_writes_target);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	return r;
+}
+
+static void __exit dm_log_writes_exit(void)
+{
+	dm_unregister_target(&log_writes_target);
+}
+
+module_init(dm_log_writes_init);
+module_exit(dm_log_writes_exit);
+
+MODULE_DESCRIPTION(DM_NAME " log writes target");
+MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.1


From e44f23b32dc7916b2bc12817e2f723fefa21ba41 Mon Sep 17 00:00:00 2001
From: Milan Broz <gmazyland@gmail.com>
Date: Sun, 5 Apr 2015 18:03:10 +0200
Subject: dm crypt: update URLs to new cryptsetup project page

Cryptsetup home page moved to GitLab.
Also remove link to abandoned TrueCrypt page.

Signed-off-by: Milan Broz <gmazyland@gmail.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 713a96237a80..ea09d5464a9f 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -228,7 +228,7 @@ static struct crypto_ablkcipher *any_tfm(struct crypt_config *cc)
  *
  * tcw:  Compatible implementation of the block chaining mode used
  *       by the TrueCrypt device encryption system (prior to version 4.1).
- *       For more info see: http://www.truecrypt.org
+ *       For more info see: https://gitlab.com/cryptsetup/cryptsetup/wikis/TrueCryptOnDiskFormat
  *       It operates on full 512 byte sectors and uses CBC
  *       with an IV derived from initial key and the sector number.
  *       In addition, whitening value is applied on every sector, whitening
-- 
cgit v1.2.1


From 5977907937afa2b5584a874d44ba6c0f56aeaa9c Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 9 Apr 2015 16:53:24 -0400
Subject: dm crypt: leverage immutable biovecs when decrypting on read

Commit 003b5c571 ("block: Convert drivers to immutable biovecs")
stopped short of changing dm-crypt to leverage the fact that the biovec
array of a bio will no longer be modified.

Switch to using bio_clone_fast() when cloning bios for decryption after
read.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index ea09d5464a9f..aa1238facbeb 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1124,15 +1124,15 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone)
 static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp)
 {
 	struct crypt_config *cc = io->cc;
-	struct bio *base_bio = io->base_bio;
 	struct bio *clone;
 
 	/*
-	 * The block layer might modify the bvec array, so always
-	 * copy the required bvecs because we need the original
-	 * one in order to decrypt the whole bio data *afterwards*.
+	 * We need the original biovec array in order to decrypt
+	 * the whole bio data *afterwards* -- thanks to immutable
+	 * biovecs we don't need to worry about the block layer
+	 * modifying the biovec array; so leverage bio_clone_fast().
 	 */
-	clone = bio_clone_bioset(base_bio, gfp, cc->bs);
+	clone = bio_clone_fast(io->base_bio, gfp, cc->bs);
 	if (!clone)
 		return 1;
 
-- 
cgit v1.2.1


From 0618764cb25f6fa9fb31152995de42a8a0496475 Mon Sep 17 00:00:00 2001
From: Ben Collins <ben.c@servergy.com>
Date: Fri, 3 Apr 2015 16:09:46 +0000
Subject: dm crypt: fix deadlock when async crypto algorithm returns -EBUSY

I suspect this doesn't show up for most anyone because software
algorithms typically don't have a sense of being too busy.  However,
when working with the Freescale CAAM driver it will return -EBUSY on
occasion under heavy load -- which resulted in dm-crypt deadlock.

After checking the logic in some other drivers, the scheme for
crypt_convert() and its callback, kcryptd_async_done(), were not
correctly laid out to properly handle -EBUSY or -EINPROGRESS.

Fix this by using the completion for both -EBUSY and -EINPROGRESS.  Now
crypt_convert()'s use of completion is comparable to
af_alg_wait_for_completion().  Similarly, kcryptd_async_done() follows
the pattern used in af_alg_complete().

Before this fix dm-crypt would lock up within 1-2 minutes running with
the CAAM driver.  Fix was regression tested against software algorithms
on PPC32 and x86_64, and things seem perfectly happy there as well.

Signed-off-by: Ben Collins <ben.c@servergy.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/dm-crypt.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index aa1238facbeb..9b5e1eb0ffcf 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -925,11 +925,10 @@ static int crypt_convert(struct crypt_config *cc,
 
 		switch (r) {
 		/* async */
+		case -EINPROGRESS:
 		case -EBUSY:
 			wait_for_completion(&ctx->restart);
 			reinit_completion(&ctx->restart);
-			/* fall through*/
-		case -EINPROGRESS:
 			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
@@ -1346,10 +1345,8 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
 	struct crypt_config *cc = io->cc;
 
-	if (error == -EINPROGRESS) {
-		complete(&ctx->restart);
+	if (error == -EINPROGRESS)
 		return;
-	}
 
 	if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
 		error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
@@ -1360,12 +1357,15 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
 	if (!atomic_dec_and_test(&ctx->cc_pending))
-		return;
+		goto done;
 
 	if (bio_data_dir(io->base_bio) == READ)
 		kcryptd_crypt_read_done(io);
 	else
 		kcryptd_crypt_write_io_submit(io, 1);
+done:
+	if (!completion_done(&ctx->restart))
+		complete(&ctx->restart);
 }
 
 static void kcryptd_crypt(struct work_struct *work)
-- 
cgit v1.2.1


From 44c144f9c8e8fbd73ede2848da8253b3aae42ec2 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Date: Thu, 16 Apr 2015 22:00:50 -0400
Subject: dm crypt: fix missing error code return from crypt_ctr error path

Fix to return a negative error code from crypt_ctr()'s optional
parameter processing error path.

Signed-off-by: Wei Yongjun <yongjun_wei@trendmicro.com.cn>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9b5e1eb0ffcf..9eeea196328a 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1816,6 +1816,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		if (ret)
 			goto bad;
 
+		ret = -EINVAL;
 		while (opt_params--) {
 			opt_string = dm_shift_arg(&as);
 			if (!opt_string) {
-- 
cgit v1.2.1


From 8c58f02e244d5b35fa38aa308007715d4957d4c7 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Tue, 21 Apr 2015 11:25:52 -0500
Subject: md-cluster: correct the num for comparison

Since md-cluster node numbers start from zero, while
cinfo->slot_number holds the (one-based) slot number from dlm,
there is no need to reject the case where they are equal.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md-cluster.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index ae8bb547f94d..10c44a3a9d6a 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -612,9 +612,9 @@ static int join(struct mddev *mddev, int nodes)
 	if (ret)
 		goto err;
 	wait_for_completion(&cinfo->completion);
-	if (nodes <= cinfo->slot_number) {
-		pr_err("md-cluster: Slot allotted(%d) greater than available slots(%d)", cinfo->slot_number - 1,
-			nodes);
+	if (nodes < cinfo->slot_number) {
+		pr_err("md-cluster: Slot allotted(%d) is greater than available slots(%d).",
+			cinfo->slot_number, nodes);
 		ret = -ERANGE;
 		goto err;
 	}
-- 
cgit v1.2.1


From fb56dfef4e31f214cfbfa0eb8a1949591c20b118 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
Date: Tue, 14 Apr 2015 10:43:24 -0500
Subject: md: Export and rename kick_rdev_from_array

This export is required by the clustering module in order to
coordinate removing/re-adding an rdev on all nodes.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 17 +++++++++--------
 drivers/md/md.h |  1 +
 2 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index eb6f92e57ab6..bc1e43014292 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2168,11 +2168,12 @@ static void export_rdev(struct md_rdev *rdev)
 	kobject_put(&rdev->kobj);
 }
 
-static void kick_rdev_from_array(struct md_rdev *rdev)
+void md_kick_rdev_from_array(struct md_rdev *rdev)
 {
 	unbind_rdev_from_array(rdev);
 	export_rdev(rdev);
 }
+EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
 
 static void export_array(struct mddev *mddev)
 {
@@ -2181,7 +2182,7 @@ static void export_array(struct mddev *mddev)
 	while (!list_empty(&mddev->disks)) {
 		rdev = list_first_entry(&mddev->disks, struct md_rdev,
 					same_set);
-		kick_rdev_from_array(rdev);
+		md_kick_rdev_from_array(rdev);
 	}
 	mddev->raid_disks = 0;
 	mddev->major_version = 0;
@@ -2476,7 +2477,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 			struct mddev *mddev = rdev->mddev;
 			if (mddev_is_clustered(mddev))
 				md_cluster_ops->metadata_update_start(mddev);
-			kick_rdev_from_array(rdev);
+			md_kick_rdev_from_array(rdev);
 			if (mddev->pers)
 				md_update_sb(mddev, 1);
 			md_new_event(mddev);
@@ -3134,7 +3135,7 @@ static void analyze_sbs(struct mddev *mddev)
 				"md: fatal superblock inconsistency in %s"
 				" -- removing from array\n",
 				bdevname(rdev->bdev,b));
-			kick_rdev_from_array(rdev);
+			md_kick_rdev_from_array(rdev);
 		}
 
 	super_types[mddev->major_version].
@@ -3149,7 +3150,7 @@ static void analyze_sbs(struct mddev *mddev)
 			       "md: %s: %s: only %d devices permitted\n",
 			       mdname(mddev), bdevname(rdev->bdev, b),
 			       mddev->max_disks);
-			kick_rdev_from_array(rdev);
+			md_kick_rdev_from_array(rdev);
 			continue;
 		}
 		if (rdev != freshest) {
@@ -3158,7 +3159,7 @@ static void analyze_sbs(struct mddev *mddev)
 				printk(KERN_WARNING "md: kicking non-fresh %s"
 					" from array!\n",
 					bdevname(rdev->bdev,b));
-				kick_rdev_from_array(rdev);
+				md_kick_rdev_from_array(rdev);
 				continue;
 			}
 			/* No device should have a Candidate flag
@@ -3167,7 +3168,7 @@ static void analyze_sbs(struct mddev *mddev)
 			if (test_bit(Candidate, &rdev->flags)) {
 				pr_info("md: kicking Cluster Candidate %s from array!\n",
 					bdevname(rdev->bdev, b));
-				kick_rdev_from_array(rdev);
+				md_kick_rdev_from_array(rdev);
 			}
 		}
 		if (mddev->level == LEVEL_MULTIPATH) {
@@ -5966,7 +5967,7 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 	if (rdev->raid_disk >= 0)
 		goto busy;
 
-	kick_rdev_from_array(rdev);
+	md_kick_rdev_from_array(rdev);
 	md_update_sb(mddev, 1);
 	md_new_event(mddev);
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 6dc0ce09f50c..d98c0d764d8f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -671,6 +671,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
 extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
 extern void md_reload_sb(struct mddev *mddev);
 extern void md_update_sb(struct mddev *mddev, int force);
+extern void md_kick_rdev_from_array(struct md_rdev * rdev);
 static inline int mddev_check_plugged(struct mddev *mddev)
 {
 	return !!blk_check_plugged(md_unplug, mddev,
-- 
cgit v1.2.1


From 57d051dccaef395e0d8c0fff02cfc3a77bacc88c Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
Date: Tue, 14 Apr 2015 10:43:55 -0500
Subject: md: Export and rename find_rdev_nr_rcu

This is required by the clustering module (patches to follow) to
find the device to remove or re-add.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 9 +++++----
 drivers/md/md.h | 1 +
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index bc1e43014292..d406a79f9140 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -642,7 +642,7 @@ void mddev_unlock(struct mddev *mddev)
 }
 EXPORT_SYMBOL_GPL(mddev_unlock);
 
-static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
+struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
 {
 	struct md_rdev *rdev;
 
@@ -652,6 +652,7 @@ static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
 
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
 
 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
 {
@@ -2049,11 +2050,11 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 		int choice = 0;
 		if (mddev->pers)
 			choice = mddev->raid_disks;
-		while (find_rdev_nr_rcu(mddev, choice))
+		while (md_find_rdev_nr_rcu(mddev, choice))
 			choice++;
 		rdev->desc_nr = choice;
 	} else {
-		if (find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
+		if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
 			rcu_read_unlock();
 			return -EBUSY;
 		}
@@ -5721,7 +5722,7 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
 		return -EFAULT;
 
 	rcu_read_lock();
-	rdev = find_rdev_nr_rcu(mddev, info.number);
+	rdev = md_find_rdev_nr_rcu(mddev, info.number);
 	if (rdev) {
 		info.major = MAJOR(rdev->bdev->bd_dev);
 		info.minor = MINOR(rdev->bdev->bd_dev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d98c0d764d8f..ecdce36ec6b8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -672,6 +672,7 @@ extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
 extern void md_reload_sb(struct mddev *mddev);
 extern void md_update_sb(struct mddev *mddev, int force);
 extern void md_kick_rdev_from_array(struct md_rdev * rdev);
+struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
 static inline int mddev_check_plugged(struct mddev *mddev)
 {
 	return !!blk_check_plugged(md_unplug, mddev,
-- 
cgit v1.2.1


From 88bcfef7be513e8bf5448e0025330fdd97c4c708 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
Date: Tue, 14 Apr 2015 10:44:44 -0500
Subject: md-cluster: remove capabilities

This adds "remove" capabilities for the clustered environment.
When a user initiates removal of a device from the array, a
REMOVE message with disk number in the array is sent to all
the nodes which kick the respective device in their own array.

This facilitates the removal of failed devices.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md-cluster.c | 30 ++++++++++++++++++++++++++++++
 drivers/md/md-cluster.h |  1 +
 drivers/md/md.c         |  7 ++++++-
 3 files changed, 37 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 10c44a3a9d6a..30b41b70db17 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -72,6 +72,7 @@ enum msg_type {
 	METADATA_UPDATED = 0,
 	RESYNCING,
 	NEWDISK,
+	REMOVE,
 };
 
 struct cluster_msg {
@@ -401,6 +402,16 @@ static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg
 	dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
 }
 
+static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
+{
+	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+
+	if (rdev)
+		md_kick_rdev_from_array(rdev);
+	else
+		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
+}
+
 static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 {
 	switch (msg->type) {
@@ -419,6 +430,15 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 		pr_info("%s: %d Received message: NEWDISK from %d\n",
 			__func__, __LINE__, msg->slot);
 		process_add_new_disk(mddev, msg);
+		break;
+	case REMOVE:
+		pr_info("%s: %d Received REMOVE from %d\n",
+			__func__, __LINE__, msg->slot);
+		process_remove_disk(mddev, msg);
+		break;
+	default:
+		pr_warn("%s:%d Received unknown message from %d\n",
+			__func__, __LINE__, msg->slot);
 	}
 }
 
@@ -854,6 +874,15 @@ static int new_disk_ack(struct mddev *mddev, bool ack)
 	return 0;
 }
 
+static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
+{
+	struct cluster_msg cmsg;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	cmsg.type = REMOVE;
+	cmsg.raid_slot = rdev->desc_nr;
+	return __sendmsg(cinfo, &cmsg);
+}
+
 static struct md_cluster_operations cluster_ops = {
 	.join   = join,
 	.leave  = leave,
@@ -868,6 +897,7 @@ static struct md_cluster_operations cluster_ops = {
 	.add_new_disk_start = add_new_disk_start,
 	.add_new_disk_finish = add_new_disk_finish,
 	.new_disk_ack = new_disk_ack,
+	.remove_disk = remove_disk,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 7417133c4295..71e51432c1f4 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -22,6 +22,7 @@ struct md_cluster_operations {
 	int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
 	int (*add_new_disk_finish)(struct mddev *mddev);
 	int (*new_disk_ack)(struct mddev *mddev, bool ack);
+	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d406a79f9140..ca011d1d1de7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2477,8 +2477,10 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 		else {
 			struct mddev *mddev = rdev->mddev;
 			if (mddev_is_clustered(mddev))
-				md_cluster_ops->metadata_update_start(mddev);
+				md_cluster_ops->remove_disk(mddev, rdev);
 			md_kick_rdev_from_array(rdev);
+			if (mddev_is_clustered(mddev))
+				md_cluster_ops->metadata_update_start(mddev);
 			if (mddev->pers)
 				md_update_sb(mddev, 1);
 			md_new_event(mddev);
@@ -5968,6 +5970,9 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
 	if (rdev->raid_disk >= 0)
 		goto busy;
 
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->remove_disk(mddev, rdev);
+
 	md_kick_rdev_from_array(rdev);
 	md_update_sb(mddev, 1);
 	md_new_event(mddev);
-- 
cgit v1.2.1


From a6da4ef85cef0382244fc588c901e133a2ec5109 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
Date: Tue, 14 Apr 2015 10:45:22 -0500
Subject: md: re-add a failed disk

This adds the capability of re-adding a failed disk by
writing "re-add" to /sys/block/mdXX/md/dev-YYY/state.

This facilitates adding disks which have encountered a temporary
error such as a network disconnection/hiccup in an iSCSI device,
or a SAN cable disconnection which has been restored. In such
a situation, you do not need to remove and re-add the device.
Writing re-add to the failed device's state would add it again
to the array and perform the recovery of only the blocks which
were written after the device failed.

This works for generic md, and is not related to clustering. However,
this patch is to ease re-add operations listed above in clustering
environments.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 57 +++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 20 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index ca011d1d1de7..429e95e9a942 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2375,6 +2375,36 @@ repeat:
 }
 EXPORT_SYMBOL(md_update_sb);
 
+static int add_bound_rdev(struct md_rdev *rdev)
+{
+	struct mddev *mddev = rdev->mddev;
+	int err = 0;
+
+	if (!mddev->pers->hot_remove_disk) {
+		/* If there is hot_add_disk but no hot_remove_disk
+		 * then added disks for geometry changes,
+		 * and should be added immediately.
+		 */
+		super_types[mddev->major_version].
+			validate_super(mddev, rdev);
+		err = mddev->pers->hot_add_disk(mddev, rdev);
+		if (err) {
+			unbind_rdev_from_array(rdev);
+			export_rdev(rdev);
+			return err;
+		}
+	}
+	sysfs_notify_dirent_safe(rdev->sysfs_state);
+
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	if (mddev->degraded)
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
+	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+	md_new_event(mddev);
+	md_wakeup_thread(mddev->thread);
+	return 0;
+}
+
 /* words written to sysfs files may, or may not, be \n terminated.
  * We want to accept with case. For this we use cmd_match.
  */
@@ -2564,6 +2594,12 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 			clear_bit(Replacement, &rdev->flags);
 			err = 0;
 		}
+	} else if (cmd_match(buf, "re-add")) {
+		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
+			clear_bit(Faulty, &rdev->flags);
+			err = add_bound_rdev(rdev);
+		} else
+			err = -EBUSY;
 	}
 	if (!err)
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -5875,29 +5911,10 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
 
 		rdev->raid_disk = -1;
 		err = bind_rdev_to_array(rdev, mddev);
-		if (!err && !mddev->pers->hot_remove_disk) {
-			/* If there is hot_add_disk but no hot_remove_disk
-			 * then added disks for geometry changes,
-			 * and should be added immediately.
-			 */
-			super_types[mddev->major_version].
-				validate_super(mddev, rdev);
-			err = mddev->pers->hot_add_disk(mddev, rdev);
-			if (err)
-				unbind_rdev_from_array(rdev);
-		}
 		if (err)
 			export_rdev(rdev);
 		else
-			sysfs_notify_dirent_safe(rdev->sysfs_state);
-
-		set_bit(MD_CHANGE_DEVS, &mddev->flags);
-		if (mddev->degraded)
-			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
-		if (!err)
-			md_new_event(mddev);
-		md_wakeup_thread(mddev->thread);
+			err = add_bound_rdev(rdev);
 		if (mddev_is_clustered(mddev) &&
 				(info->state & (1 << MD_DISK_CLUSTER_ADD)))
 			md_cluster_ops->add_new_disk_finish(mddev);
-- 
cgit v1.2.1


From 97f6cd39da227459cb46ed4088d37d5d8db51c50 Mon Sep 17 00:00:00 2001
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
Date: Tue, 14 Apr 2015 10:45:42 -0500
Subject: md-cluster: re-add capabilities

When "re-add" is written to /sys/block/mdXX/md/dev-YYY/state,
the clustered md:

1. Sends RE_ADD message with the desc_nr. Nodes receiving the message
   clear the Faulty bit in their respective rdev->flags.
2. The node initiating re-add, gathers the bitmaps of all nodes
   and copies them into the local bitmap. It does not clear the bitmap
   from which it is copying.
3. Initiating node schedules a md recovery to sync the devices.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c     | 20 +++++++++++---------
 drivers/md/bitmap.h     |  2 +-
 drivers/md/md-cluster.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
 drivers/md/md-cluster.h |  1 +
 drivers/md/md.c         | 13 +++++++++++--
 5 files changed, 71 insertions(+), 13 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index e98db04eb4f9..2bc56e2a3526 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1851,7 +1851,7 @@ EXPORT_SYMBOL_GPL(bitmap_load);
  * to our bitmap
  */
 int bitmap_copy_from_slot(struct mddev *mddev, int slot,
-		sector_t *low, sector_t *high)
+		sector_t *low, sector_t *high, bool clear_bits)
 {
 	int rv = 0, i, j;
 	sector_t block, lo = 0, hi = 0;
@@ -1882,14 +1882,16 @@ int bitmap_copy_from_slot(struct mddev *mddev, int slot,
 		}
 	}
 
-	bitmap_update_sb(bitmap);
-	/* Setting this for the ev_page should be enough.
-	 * And we do not require both write_all and PAGE_DIRT either
-	 */
-	for (i = 0; i < bitmap->storage.file_pages; i++)
-		set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
-	bitmap_write_all(bitmap);
-	bitmap_unplug(bitmap);
+	if (clear_bits) {
+		bitmap_update_sb(bitmap);
+		/* Setting this for the ev_page should be enough.
+		 * And we do not require both write_all and PAGE_DIRT either
+		 */
+		for (i = 0; i < bitmap->storage.file_pages; i++)
+			set_page_attr(bitmap, i, BITMAP_PAGE_DIRTY);
+		bitmap_write_all(bitmap);
+		bitmap_unplug(bitmap);
+	}
 	*low = lo;
 	*high = hi;
 err:
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index 4aabc74ef7b9..f1f4dd01090d 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -263,7 +263,7 @@ void bitmap_daemon_work(struct mddev *mddev);
 int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
 		  int chunksize, int init);
 int bitmap_copy_from_slot(struct mddev *mddev, int slot,
-				sector_t *lo, sector_t *hi);
+				sector_t *lo, sector_t *hi, bool clear_bits);
 #endif
 
 #endif
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 30b41b70db17..fcfc4b9b2672 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -73,6 +73,7 @@ enum msg_type {
 	RESYNCING,
 	NEWDISK,
 	REMOVE,
+	RE_ADD,
 };
 
 struct cluster_msg {
@@ -253,7 +254,7 @@ static void recover_bitmaps(struct md_thread *thread)
 					str, ret);
 			goto clear_bit;
 		}
-		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi);
+		ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi, true);
 		if (ret) {
 			pr_err("md-cluster: Could not copy data from bitmap %d\n", slot);
 			goto dlm_unlock;
@@ -412,6 +413,16 @@ static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
 		pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
 }
 
+static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
+{
+	struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
+
+	if (rdev && test_bit(Faulty, &rdev->flags))
+		clear_bit(Faulty, &rdev->flags);
+	else
+		pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
+}
+
 static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 {
 	switch (msg->type) {
@@ -436,6 +447,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 			__func__, __LINE__, msg->slot);
 		process_remove_disk(mddev, msg);
 		break;
+	case RE_ADD:
+		pr_info("%s: %d Received RE_ADD from %d\n",
+			__func__, __LINE__, msg->slot);
+		process_readd_disk(mddev, msg);
+		break;
 	default:
 		pr_warn("%s:%d Received unknown message from %d\n",
 			__func__, __LINE__, msg->slot);
@@ -883,6 +899,35 @@ static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 	return __sendmsg(cinfo, &cmsg);
 }
 
+static int gather_bitmaps(struct md_rdev *rdev)
+{
+	int sn, err;
+	sector_t lo, hi;
+	struct cluster_msg cmsg;
+	struct mddev *mddev = rdev->mddev;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	cmsg.type = RE_ADD;
+	cmsg.raid_slot = rdev->desc_nr;
+	err = sendmsg(cinfo, &cmsg);
+	if (err)
+		goto out;
+
+	for (sn = 0; sn < mddev->bitmap_info.nodes; sn++) {
+		if (sn == (cinfo->slot_number - 1))
+			continue;
+		err = bitmap_copy_from_slot(mddev, sn, &lo, &hi, false);
+		if (err) {
+			pr_warn("md-cluster: Could not gather bitmaps from slot %d", sn);
+			goto out;
+		}
+		if ((hi > 0) && (lo < mddev->recovery_cp))
+			mddev->recovery_cp = lo;
+	}
+out:
+	return err;
+}
+
 static struct md_cluster_operations cluster_ops = {
 	.join   = join,
 	.leave  = leave,
@@ -898,6 +943,7 @@ static struct md_cluster_operations cluster_ops = {
 	.add_new_disk_finish = add_new_disk_finish,
 	.new_disk_ack = new_disk_ack,
 	.remove_disk = remove_disk,
+	.gather_bitmaps = gather_bitmaps,
 };
 
 static int __init cluster_init(void)
diff --git a/drivers/md/md-cluster.h b/drivers/md/md-cluster.h
index 71e51432c1f4..6817ee00e053 100644
--- a/drivers/md/md-cluster.h
+++ b/drivers/md/md-cluster.h
@@ -23,6 +23,7 @@ struct md_cluster_operations {
 	int (*add_new_disk_finish)(struct mddev *mddev);
 	int (*new_disk_ack)(struct mddev *mddev, bool ack);
 	int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
+	int (*gather_bitmaps)(struct md_rdev *rdev);
 };
 
 #endif /* _MD_CLUSTER_H */
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 429e95e9a942..d9cac48db2fc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2596,8 +2596,17 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
 		}
 	} else if (cmd_match(buf, "re-add")) {
 		if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1)) {
-			clear_bit(Faulty, &rdev->flags);
-			err = add_bound_rdev(rdev);
+			/* clear_bit is performed _after_ all the devices
+			 * have their local Faulty bit cleared. If any writes
+			 * happen in the meantime in the local node, they
+			 * will land in the local bitmap, which will be synced
+			 * by this node eventually
+			 */
+			if (!mddev_is_clustered(rdev->mddev) ||
+			    (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
+				clear_bit(Faulty, &rdev->flags);
+				err = add_bound_rdev(rdev);
+			}
 		} else
 			err = -EBUSY;
 	}
-- 
cgit v1.2.1


From 50c37b136a3807eda44afe16529b5af701ec49f5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 23 Mar 2015 17:36:38 +1100
Subject: md: don't require sync_min to be a multiple of chunk_size.

There is really no need for sync_min to be a multiple of
chunk_size, and values read from here often aren't.
That means you cannot read a value and expect to be able
to write it back later.

So remove the chunk_size check, and round down to a multiple
of 4K, to be sure everything works with 4K-sector devices.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0d8968535976..3724a29eaf0e 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4427,7 +4427,6 @@ min_sync_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	unsigned long long min;
 	int err;
-	int chunk;
 
 	if (kstrtoull(buf, 10, &min))
 		return -EINVAL;
@@ -4441,16 +4440,8 @@ min_sync_store(struct mddev *mddev, const char *buf, size_t len)
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		goto out_unlock;
 
-	/* Must be a multiple of chunk_size */
-	chunk = mddev->chunk_sectors;
-	if (chunk) {
-		sector_t temp = min;
-
-		err = -EINVAL;
-		if (sector_div(temp, chunk))
-			goto out_unlock;
-	}
-	mddev->resync_min = min;
+	/* Round down to multiple of 4K for safety */
+	mddev->resync_min = round_down(min, 8);
 	err = 0;
 
 out_unlock:
-- 
cgit v1.2.1


From 09314799e4f0589e52bafcd0ca3556c60468bc0e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 19 Feb 2015 16:04:40 +1100
Subject: md: remove 'go_faster' option from ->sync_request()

This option is not well justified and testing suggests that
it hardly ever makes any difference.

The comment suggests there might be a need to wait for non-resync
activity indicated by ->nr_waiting, however raise_barrier()
already waits for all of that.

So just remove it to simplify reasoning about speed limiting.

This allows us to remove a 'FIXME' comment from raid5.c as that
never used the flag.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c     | 5 ++---
 drivers/md/md.h     | 2 +-
 drivers/md/raid1.c  | 9 +--------
 drivers/md/raid10.c | 8 +-------
 drivers/md/raid5.c  | 3 +--
 5 files changed, 6 insertions(+), 21 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3724a29eaf0e..3b9b032aa006 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7820,8 +7820,7 @@ void md_do_sync(struct md_thread *thread)
 		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
 			break;
 
-		sectors = mddev->pers->sync_request(mddev, j, &skipped,
-						  currspeed < speed_min(mddev));
+		sectors = mddev->pers->sync_request(mddev, j, &skipped);
 		if (sectors == 0) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			break;
@@ -7898,7 +7897,7 @@ void md_do_sync(struct md_thread *thread)
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
 	/* tell personality that we are finished */
-	mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
+	mddev->pers->sync_request(mddev, max_sectors, &skipped);
 
 	if (mddev_is_clustered(mddev))
 		md_cluster_ops->resync_finish(mddev);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ecdce36ec6b8..4046a6c6f223 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -506,7 +506,7 @@ struct md_personality
 	int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
 	int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
 	int (*spare_active) (struct mddev *mddev);
-	sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
+	sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped);
 	int (*resize) (struct mddev *mddev, sector_t sectors);
 	sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks);
 	int (*check_reshape) (struct mddev *mddev);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4efa50186a2a..9157a29c8dbf 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2480,7 +2480,7 @@ static int init_resync(struct r1conf *conf)
  * that can be installed to exclude normal IO requests.
  */
 
-static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 {
 	struct r1conf *conf = mddev->private;
 	struct r1bio *r1_bio;
@@ -2533,13 +2533,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
 		*skipped = 1;
 		return sync_blocks;
 	}
-	/*
-	 * If there is non-resync activity waiting for a turn,
-	 * and resync is going fast enough,
-	 * then let it though before starting on this new sync request.
-	 */
-	if (!go_faster && conf->nr_waiting)
-		msleep_interruptible(1000);
 
 	bitmap_cond_end_sync(mddev->bitmap, sector_nr);
 	r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a7196c49d15d..e793ab6b3570 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2889,7 +2889,7 @@ static int init_resync(struct r10conf *conf)
  */
 
 static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
-			     int *skipped, int go_faster)
+			     int *skipped)
 {
 	struct r10conf *conf = mddev->private;
 	struct r10bio *r10_bio;
@@ -2994,12 +2994,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 	if (conf->geo.near_copies < conf->geo.raid_disks &&
 	    max_sector > (sector_nr | chunk_mask))
 		max_sector = (sector_nr | chunk_mask) + 1;
-	/*
-	 * If there is non-resync activity waiting for us then
-	 * put in a delay to throttle resync.
-	 */
-	if (!go_faster && conf->nr_waiting)
-		msleep_interruptible(1000);
 
 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cd2f96b2c572..022a0d99e110 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5050,8 +5050,7 @@ ret:
 	return reshape_sectors;
 }
 
-/* FIXME go_faster isn't used */
-static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster)
+static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)
 {
 	struct r5conf *conf = mddev->private;
 	struct stripe_head *sh;
-- 
cgit v1.2.1


From ac8fa4196d205ac8fff3f8932bddbad4f16e4110 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 19 Feb 2015 16:55:00 +1100
Subject: md: allow resync to go faster when there is competing IO.

When md notices non-sync IO happening while it is trying
to resync (or reshape or recover) it slows down to the
set minimum.

The default minimum might have made sense many years ago
but the drives have become faster.  Changing the default
to match the times isn't really a long term solution.

This patch changes the code so that instead of waiting until the speed
has dropped to the target, it just waits until pending requests
have completed.
This means that the delay inserted is a function of the speed
of the devices.

Testing shows that:
 - for some loads, the resync speed is unchanged.  For those loads
   increasing the minimum doesn't change the speed either.
   So this is a good result.  To increase resync speed under such
   loads we would probably need to increase the resync window
   size.

 - for other loads, resync speed does increase to a reasonable
   fraction (e.g. 20%) of maximum possible, and throughput of
   the load only drops a little bit (e.g. 10%)

 - for other loads, throughput of the non-sync load drops quite a bit
   more.  These seem to be latency-sensitive loads.

So it isn't a perfect solution, but it is mostly an improvement.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3b9b032aa006..d4f31e195e26 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7880,11 +7880,18 @@ void md_do_sync(struct md_thread *thread)
 			/((jiffies-mddev->resync_mark)/HZ +1) +1;
 
 		if (currspeed > speed_min(mddev)) {
-			if ((currspeed > speed_max(mddev)) ||
-					!is_mddev_idle(mddev, 0)) {
+			if (currspeed > speed_max(mddev)) {
 				msleep(500);
 				goto repeat;
 			}
+			if (!is_mddev_idle(mddev, 0)) {
+				/*
+				 * Give other IO more of a chance.
+				 * The faster the devices, the less we wait.
+				 */
+				wait_event(mddev->recovery_wait,
+					   !atomic_read(&mddev->recovery_active));
+			}
 		}
 	}
 	printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
-- 
cgit v1.2.1


From 753f2856cda2a130d38ebc3db97bff66c1ef3ca7 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 13 Feb 2015 19:48:01 +0100
Subject: md raid0: access mddev->queue (request queue member) conditionally
 because it is not set when accessed from dm-raid

The patch makes 3 references to mddev->queue in the raid0 personality
conditional in order to allow for it to be accessed from dm-raid.
Mandatory, because md instances underneath dm-raid don't manage
a request queue of their own which'd lead to oopses without the patch.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Tested-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 48 +++++++++++++++++++++++++++---------------------
 1 file changed, 27 insertions(+), 21 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 3b5d7f704aa3..2cb59a641cd2 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -271,14 +271,16 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		goto abort;
 	}
 
-	blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
-	blk_queue_io_opt(mddev->queue,
-			 (mddev->chunk_sectors << 9) * mddev->raid_disks);
-
-	if (!discard_supported)
-		queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
-	else
-		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+	if (mddev->queue) {
+		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+		blk_queue_io_opt(mddev->queue,
+				 (mddev->chunk_sectors << 9) * mddev->raid_disks);
+
+		if (!discard_supported)
+			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+		else
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+	}
 
 	pr_debug("md/raid0:%s: done.\n", mdname(mddev));
 	*private_conf = conf;
@@ -429,9 +431,12 @@ static int raid0_run(struct mddev *mddev)
 	}
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
-	blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
-	blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
-	blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+
+	if (mddev->queue) {
+		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+	}
 
 	/* if private is not null, we are here after takeover */
 	if (mddev->private == NULL) {
@@ -448,16 +453,17 @@ static int raid0_run(struct mddev *mddev)
 	printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n",
 	       mdname(mddev),
 	       (unsigned long long)mddev->array_sectors);
-	/* calculate the max read-ahead size.
-	 * For read-ahead of large files to be effective, we need to
-	 * readahead at least twice a whole stripe. i.e. number of devices
-	 * multiplied by chunk size times 2.
-	 * If an individual device has an ra_pages greater than the
-	 * chunk size, then we will not drive that device as hard as it
-	 * wants.  We consider this a configuration error: a larger
-	 * chunksize should be used in that case.
-	 */
-	{
+
+	if (mddev->queue) {
+		/* calculate the max read-ahead size.
+		 * For read-ahead of large files to be effective, we need to
+		 * readahead at least twice a whole stripe. i.e. number of devices
+		 * multiplied by chunk size times 2.
+		 * If an individual device has an ra_pages greater than the
+		 * chunk size, then we will not drive that device as hard as it
+		 * wants.  We consider this a configuration error: a larger
+		 * chunksize should be used in that case.
+		 */
 		int stripe = mddev->raid_disks *
 			(mddev->chunk_sectors << 9) / PAGE_SIZE;
 		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
-- 
cgit v1.2.1


From 46d5b785621ad10a373e292f9101ccfc626466e0 Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:02 +1100
Subject: raid5: use flex_array for scribble data

Use flex_array for scribble data. Next patch will batch several stripes
together, so scribble data should be able to cover several stripes, so this
patch also allocates scribble data for stripes across a chunk.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 89 ++++++++++++++++++++++++++++++++++--------------------
 drivers/md/raid5.h |  6 +---
 2 files changed, 57 insertions(+), 38 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 022a0d99e110..7fb510e54548 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -54,6 +54,7 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/nodemask.h>
+#include <linux/flex_array.h>
 #include <trace/events/block.h>
 
 #include "md.h"
@@ -1109,16 +1110,28 @@ static void ops_complete_compute(void *stripe_head_ref)
 
 /* return a pointer to the address conversion region of the scribble buffer */
 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
-				 struct raid5_percpu *percpu)
+				 struct raid5_percpu *percpu, int i)
 {
-	return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
+	void *addr;
+
+	addr = flex_array_get(percpu->scribble, i);
+	return addr + sizeof(struct page *) * (sh->disks + 2);
+}
+
+/* return a pointer to the page pointer list at the start of scribble element i */
+static struct page **to_addr_page(struct raid5_percpu *percpu, int i)
+{
+	void *addr;
+
+	addr = flex_array_get(percpu->scribble, i);
+	return addr;
 }
 
 static struct dma_async_tx_descriptor *
 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
 	int disks = sh->disks;
-	struct page **xor_srcs = percpu->scribble;
+	struct page **xor_srcs = to_addr_page(percpu, 0);
 	int target = sh->ops.target;
 	struct r5dev *tgt = &sh->dev[target];
 	struct page *xor_dest = tgt->page;
@@ -1138,7 +1151,7 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 	atomic_inc(&sh->count);
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
-			  ops_complete_compute, sh, to_addr_conv(sh, percpu));
+			  ops_complete_compute, sh, to_addr_conv(sh, percpu, 0));
 	if (unlikely(count == 1))
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 	else
@@ -1183,7 +1196,7 @@ static struct dma_async_tx_descriptor *
 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 {
 	int disks = sh->disks;
-	struct page **blocks = percpu->scribble;
+	struct page **blocks = to_addr_page(percpu, 0);
 	int target;
 	int qd_idx = sh->qd_idx;
 	struct dma_async_tx_descriptor *tx;
@@ -1216,7 +1229,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 		BUG_ON(blocks[count+1] != dest); /* q should already be set */
 		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
 				  ops_complete_compute, sh,
-				  to_addr_conv(sh, percpu));
+				  to_addr_conv(sh, percpu, 0));
 		tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
 	} else {
 		/* Compute any data- or p-drive using XOR */
@@ -1229,7 +1242,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 
 		init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
 				  NULL, ops_complete_compute, sh,
-				  to_addr_conv(sh, percpu));
+				  to_addr_conv(sh, percpu, 0));
 		tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
 	}
 
@@ -1248,7 +1261,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 	struct r5dev *tgt = &sh->dev[target];
 	struct r5dev *tgt2 = &sh->dev[target2];
 	struct dma_async_tx_descriptor *tx;
-	struct page **blocks = percpu->scribble;
+	struct page **blocks = to_addr_page(percpu, 0);
 	struct async_submit_ctl submit;
 
 	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
@@ -1290,7 +1303,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 			/* Missing P+Q, just recompute */
 			init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
 					  ops_complete_compute, sh,
-					  to_addr_conv(sh, percpu));
+					  to_addr_conv(sh, percpu, 0));
 			return async_gen_syndrome(blocks, 0, syndrome_disks+2,
 						  STRIPE_SIZE, &submit);
 		} else {
@@ -1314,21 +1327,21 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 			init_async_submit(&submit,
 					  ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
 					  NULL, NULL, NULL,
-					  to_addr_conv(sh, percpu));
+					  to_addr_conv(sh, percpu, 0));
 			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
 				       &submit);
 
 			count = set_syndrome_sources(blocks, sh);
 			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
 					  ops_complete_compute, sh,
-					  to_addr_conv(sh, percpu));
+					  to_addr_conv(sh, percpu, 0));
 			return async_gen_syndrome(blocks, 0, count+2,
 						  STRIPE_SIZE, &submit);
 		}
 	} else {
 		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
 				  ops_complete_compute, sh,
-				  to_addr_conv(sh, percpu));
+				  to_addr_conv(sh, percpu, 0));
 		if (failb == syndrome_disks) {
 			/* We're missing D+P. */
 			return async_raid6_datap_recov(syndrome_disks+2,
@@ -1356,7 +1369,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	       struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
-	struct page **xor_srcs = percpu->scribble;
+	struct page **xor_srcs = to_addr_page(percpu, 0);
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct async_submit_ctl submit;
 
@@ -1374,7 +1387,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	}
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
-			  ops_complete_prexor, sh, to_addr_conv(sh, percpu));
+			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
 	tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
 
 	return tx;
@@ -1478,7 +1491,7 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 		     struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
-	struct page **xor_srcs = percpu->scribble;
+	struct page **xor_srcs = to_addr_page(percpu, 0);
 	struct async_submit_ctl submit;
 	int count = 0, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
@@ -1531,7 +1544,7 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	atomic_inc(&sh->count);
 
 	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
-			  to_addr_conv(sh, percpu));
+			  to_addr_conv(sh, percpu, 0));
 	if (unlikely(count == 1))
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 	else
@@ -1543,7 +1556,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 		     struct dma_async_tx_descriptor *tx)
 {
 	struct async_submit_ctl submit;
-	struct page **blocks = percpu->scribble;
+	struct page **blocks = to_addr_page(percpu, 0);
 	int count, i;
 
 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
@@ -1567,7 +1580,7 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 	atomic_inc(&sh->count);
 
 	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
-			  sh, to_addr_conv(sh, percpu));
+			  sh, to_addr_conv(sh, percpu, 0));
 	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
 }
 
@@ -1589,7 +1602,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 	int pd_idx = sh->pd_idx;
 	int qd_idx = sh->qd_idx;
 	struct page *xor_dest;
-	struct page **xor_srcs = percpu->scribble;
+	struct page **xor_srcs = to_addr_page(percpu, 0);
 	struct dma_async_tx_descriptor *tx;
 	struct async_submit_ctl submit;
 	int count;
@@ -1608,7 +1621,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 	}
 
 	init_async_submit(&submit, 0, NULL, NULL, NULL,
-			  to_addr_conv(sh, percpu));
+			  to_addr_conv(sh, percpu, 0));
 	tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
 			   &sh->ops.zero_sum_result, &submit);
 
@@ -1619,7 +1632,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 
 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
 {
-	struct page **srcs = percpu->scribble;
+	struct page **srcs = to_addr_page(percpu, 0);
 	struct async_submit_ctl submit;
 	int count;
 
@@ -1632,7 +1645,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 
 	atomic_inc(&sh->count);
 	init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
-			  sh, to_addr_conv(sh, percpu));
+			  sh, to_addr_conv(sh, percpu, 0));
 	async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
 			   &sh->ops.zero_sum_result, percpu->spare_page, &submit);
 }
@@ -1772,13 +1785,21 @@ static int grow_stripes(struct r5conf *conf, int num)
  * calculate over all devices (not just the data blocks), using zeros in place
  * of the P and Q blocks.
  */
-static size_t scribble_len(int num)
+static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
 {
+	struct flex_array *ret;
 	size_t len;
 
 	len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
-
-	return len;
+	ret = flex_array_alloc(len, cnt, flags);
+	if (!ret)
+		return NULL;
+	/* always prealloc all elements, so no locking is required */
+	if (flex_array_prealloc(ret, 0, cnt, flags)) {
+		flex_array_free(ret);
+		return NULL;
+	}
+	return ret;
 }
 
 static int resize_stripes(struct r5conf *conf, int newsize)
@@ -1896,16 +1917,16 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		err = -ENOMEM;
 
 	get_online_cpus();
-	conf->scribble_len = scribble_len(newsize);
 	for_each_present_cpu(cpu) {
 		struct raid5_percpu *percpu;
-		void *scribble;
+		struct flex_array *scribble;
 
 		percpu = per_cpu_ptr(conf->percpu, cpu);
-		scribble = kmalloc(conf->scribble_len, GFP_NOIO);
+		scribble = scribble_alloc(newsize, conf->chunk_sectors /
+			STRIPE_SECTORS, GFP_NOIO);
 
 		if (scribble) {
-			kfree(percpu->scribble);
+			flex_array_free(percpu->scribble);
 			percpu->scribble = scribble;
 		} else {
 			err = -ENOMEM;
@@ -5698,7 +5719,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 static void free_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu)
 {
 	safe_put_page(percpu->spare_page);
-	kfree(percpu->scribble);
+	if (percpu->scribble)
+		flex_array_free(percpu->scribble);
 	percpu->spare_page = NULL;
 	percpu->scribble = NULL;
 }
@@ -5708,7 +5730,9 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
 	if (conf->level == 6 && !percpu->spare_page)
 		percpu->spare_page = alloc_page(GFP_KERNEL);
 	if (!percpu->scribble)
-		percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
+		percpu->scribble = scribble_alloc(max(conf->raid_disks,
+			conf->previous_raid_disks), conf->chunk_sectors /
+			STRIPE_SECTORS, GFP_KERNEL);
 
 	if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
 		free_scratch_buffer(conf, percpu);
@@ -5878,7 +5902,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	else
 		conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
 	max_disks = max(conf->raid_disks, conf->previous_raid_disks);
-	conf->scribble_len = scribble_len(max_disks);
 
 	conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
 			      GFP_KERNEL);
@@ -5906,6 +5929,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 		INIT_LIST_HEAD(conf->temp_inactive_list + i);
 
 	conf->level = mddev->new_level;
+	conf->chunk_sectors = mddev->new_chunk_sectors;
 	if (raid5_alloc_percpu(conf) != 0)
 		goto abort;
 
@@ -5938,7 +5962,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 			conf->fullsync = 1;
 	}
 
-	conf->chunk_sectors = mddev->new_chunk_sectors;
 	conf->level = mddev->new_level;
 	if (conf->level == 6)
 		conf->max_degraded = 2;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 983e18a83db1..1d0f241d7d3b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -458,15 +458,11 @@ struct r5conf {
 	/* per cpu variables */
 	struct raid5_percpu {
 		struct page	*spare_page; /* Used when checking P/Q in raid6 */
-		void		*scribble;   /* space for constructing buffer
+		struct flex_array *scribble;   /* space for constructing buffer
 					      * lists and performing address
 					      * conversions
 					      */
 	} __percpu *percpu;
-	size_t			scribble_len; /* size of scribble region must be
-					       * associated with conf to handle
-					       * cpu hotplug while reshaping
-					       */
 #ifdef CONFIG_HOTPLUG_CPU
 	struct notifier_block	cpu_notify;
 #endif
-- 
cgit v1.2.1


From da41ba65972532a04f73927c903029a7ec3bc2ed Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:03 +1100
Subject: raid5: add a new flag to track if a stripe can be batched

A freshly initialised stripe that has only had write requests added can be
batched. Whenever the stripe is handled, or a read (or a request against the
previous geometry) is queued, the flag is cleared.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 12 +++++++++---
 drivers/md/raid5.h |  1 +
 2 files changed, 10 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7fb510e54548..49b0f23dbad2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -555,6 +555,7 @@ retry:
 		goto retry;
 	insert_hash(conf, sh);
 	sh->cpu = smp_processor_id();
+	set_bit(STRIPE_BATCH_READY, &sh->state);
 }
 
 static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
@@ -2645,7 +2646,8 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
  * toread/towrite point to the first in a chain.
  * The bi_next chain must be in order.
  */
-static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
+static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
+			  int forwrite, int previous)
 {
 	struct bio **bip;
 	struct r5conf *conf = sh->raid_conf;
@@ -2678,6 +2680,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
 	if (*bip && (*bip)->bi_iter.bi_sector < bio_end_sector(bi))
 		goto overlap;
 
+	if (!forwrite || previous)
+		clear_bit(STRIPE_BATCH_READY, &sh->state);
+
 	BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
 	if (*bip)
 		bi->bi_next = *bip;
@@ -3824,6 +3829,7 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
+	clear_bit(STRIPE_BATCH_READY, &sh->state);
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 		spin_lock(&sh->stripe_lock);
 		/* Cannot process 'sync' concurrently with 'discard' */
@@ -4793,7 +4799,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			}
 
 			if (test_bit(STRIPE_EXPANDING, &sh->state) ||
-			    !add_stripe_bio(sh, bi, dd_idx, rw)) {
+			    !add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
 				/* Stripe is busy expanding or
 				 * add failed due to overlap.  Flush everything
 				 * and wait a while
@@ -5206,7 +5212,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 			return handled;
 		}
 
-		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
+		if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
 			release_stripe(sh);
 			raid5_set_bi_processed_stripes(raid_bio, scnt);
 			conf->retry_read_aligned = raid_bio;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 1d0f241d7d3b..37644e3d5293 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -327,6 +327,7 @@ enum {
 	STRIPE_ON_UNPLUG_LIST,
 	STRIPE_DISCARD,
 	STRIPE_ON_RELEASE_LIST,
+	STRIPE_BATCH_READY,
 };
 
 /*
-- 
cgit v1.2.1


From 7a87f43405e91ca12b8770eb689dd9886f217091 Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:03 +1100
Subject: raid5: track overwrite disk count

Track overwrite disk count, so we can know if a stripe is a full stripe write.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 14 +++++++++++++-
 drivers/md/raid5.h |  4 ++++
 2 files changed, 17 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 49b0f23dbad2..e801c6669c6d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -553,6 +553,7 @@ retry:
 	}
 	if (read_seqcount_retry(&conf->gen_lock, seq))
 		goto retry;
+	sh->overwrite_disks = 0;
 	insert_hash(conf, sh);
 	sh->cpu = smp_processor_id();
 	set_bit(STRIPE_BATCH_READY, &sh->state);
@@ -710,6 +711,12 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 	return sh;
 }
 
+static bool is_full_stripe_write(struct stripe_head *sh)
+{
+	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
+	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
+}
+
 /* Determine if 'data_offset' or 'new_data_offset' should be used
  * in this stripe_head.
  */
@@ -1413,6 +1420,7 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
+			sh->overwrite_disks = 0;
 			BUG_ON(dev->written);
 			wbi = dev->written = chosen;
 			spin_unlock_irq(&sh->stripe_lock);
@@ -2700,7 +2708,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 				sector = bio_end_sector(bi);
 		}
 		if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
-			set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
+			if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags))
+				sh->overwrite_disks++;
 	}
 
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
@@ -2772,6 +2781,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		/* fail all writes first */
 		bi = sh->dev[i].towrite;
 		sh->dev[i].towrite = NULL;
+		sh->overwrite_disks = 0;
 		spin_unlock_irq(&sh->stripe_lock);
 		if (bi)
 			bitmap_end = 1;
@@ -4630,12 +4640,14 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		}
 		set_bit(STRIPE_DISCARD, &sh->state);
 		finish_wait(&conf->wait_for_overlap, &w);
+		sh->overwrite_disks = 0;
 		for (d = 0; d < conf->raid_disks; d++) {
 			if (d == sh->pd_idx || d == sh->qd_idx)
 				continue;
 			sh->dev[d].towrite = bi;
 			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
 			raid5_inc_bi_active_stripes(bi);
+			sh->overwrite_disks++;
 		}
 		spin_unlock_irq(&sh->stripe_lock);
 		if (conf->mddev->bitmap) {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 37644e3d5293..4cc1a48127c7 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -210,6 +210,10 @@ struct stripe_head {
 	atomic_t		count;	      /* nr of active thread/requests */
 	int			bm_seq;	/* sequence number for bitmap flushes */
 	int			disks;		/* disks in stripe */
+	int			overwrite_disks; /* total overwrite disks in stripe,
+						  * this is only checked when stripe
+						  * has STRIPE_BATCH_READY
+						  */
 	enum check_states	check_state;
 	enum reconstruct_states reconstruct_state;
 	spinlock_t		stripe_lock;
-- 
cgit v1.2.1


From 59fc630b8b5f9f21c8ce3ba153341c107dce1b0c Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:03 +1100
Subject: RAID5: batch adjacent full stripe write

The stripe cache works in 4k units, so even adjacent full stripe writes are
handled 4k at a time. Ideally we should use a bigger size for adjacent full
stripe writes. A bigger unit means fewer stripes running in the state machine,
which reduces CPU overhead, and it also allows bigger IOs to be dispatched to
the underlying disks.

With this patch, adjacent full stripe writes are automatically batched
together. Such stripes are added to a batch list. Only the first stripe
of the list is put on handle_list and so runs handle_stripe(). Some steps
of handle_stripe() are extended to cover all stripes of the list, including
ops_run_io, ops_run_biodrain and so on. With this patch, fewer stripes run
in handle_stripe() and the IO for the whole stripe list is sent together to
increase IO size.

Stripes added to a batch list have some limitations. A batch list can only
include full stripe writes and can't cross a chunk boundary, which makes sure
the stripes have the same parity disks. Stripes in a batch list must be in the
same state (no written, toread and so on). If a stripe is in a batch list, any
new read/write passed to add_stripe_bio will be blocked as an overlap conflict
until the batch list is handled. These limitations make sure that stripes in a
batch list are in exactly the same state throughout their life cycle.

I tested a 160k randwrite workload on a RAID5 array with 32k chunk size and 6
PCIe SSDs. This patch improves performance by around 30%, and the IO size sent
to the underlying disks is exactly 32k. I also ran a 4k randwrite test on the
same array to make sure performance isn't changed by the patch.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 357 +++++++++++++++++++++++++++++++++++++++++++++++++----
 drivers/md/raid5.h |   4 +
 2 files changed, 336 insertions(+), 25 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e801c6669c6d..717189e74243 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -526,6 +526,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
 	BUG_ON(atomic_read(&sh->count) != 0);
 	BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
 	BUG_ON(stripe_operations_active(sh));
+	BUG_ON(sh->batch_head);
 
 	pr_debug("init_stripe called, stripe %llu\n",
 		(unsigned long long)sector);
@@ -717,6 +718,124 @@ static bool is_full_stripe_write(struct stripe_head *sh)
 	return sh->overwrite_disks == (sh->disks - sh->raid_conf->max_degraded);
 }
 
+static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+	local_irq_disable();
+	if (sh1 > sh2) {
+		spin_lock(&sh2->stripe_lock);
+		spin_lock_nested(&sh1->stripe_lock, 1);
+	} else {
+		spin_lock(&sh1->stripe_lock);
+		spin_lock_nested(&sh2->stripe_lock, 1);
+	}
+}
+
+static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
+{
+	spin_unlock(&sh1->stripe_lock);
+	spin_unlock(&sh2->stripe_lock);
+	local_irq_enable();
+}
+
+/* Only a fresh stripe holding a normal full stripe write can be added to a batch list */
+static bool stripe_can_batch(struct stripe_head *sh)
+{
+	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+		is_full_stripe_write(sh);
+}
+
+/* we only do back search */
+static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh)
+{
+	struct stripe_head *head;
+	sector_t head_sector, tmp_sec;
+	int hash;
+	int dd_idx;
+
+	if (!stripe_can_batch(sh))
+		return;
+	/* Don't cross chunks, so stripe pd_idx/qd_idx is the same */
+	tmp_sec = sh->sector;
+	if (!sector_div(tmp_sec, conf->chunk_sectors))
+		return;
+	head_sector = sh->sector - STRIPE_SECTORS;
+
+	hash = stripe_hash_locks_hash(head_sector);
+	spin_lock_irq(conf->hash_locks + hash);
+	head = __find_stripe(conf, head_sector, conf->generation);
+	if (head && !atomic_inc_not_zero(&head->count)) {
+		spin_lock(&conf->device_lock);
+		if (!atomic_read(&head->count)) {
+			if (!test_bit(STRIPE_HANDLE, &head->state))
+				atomic_inc(&conf->active_stripes);
+			BUG_ON(list_empty(&head->lru) &&
+			       !test_bit(STRIPE_EXPANDING, &head->state));
+			list_del_init(&head->lru);
+			if (head->group) {
+				head->group->stripes_cnt--;
+				head->group = NULL;
+			}
+		}
+		atomic_inc(&head->count);
+		spin_unlock(&conf->device_lock);
+	}
+	spin_unlock_irq(conf->hash_locks + hash);
+
+	if (!head)
+		return;
+	if (!stripe_can_batch(head))
+		goto out;
+
+	lock_two_stripes(head, sh);
+	/* clear_batch_ready clear the flag */
+	if (!stripe_can_batch(head) || !stripe_can_batch(sh))
+		goto unlock_out;
+
+	if (sh->batch_head)
+		goto unlock_out;
+
+	dd_idx = 0;
+	while (dd_idx == sh->pd_idx || dd_idx == sh->qd_idx)
+		dd_idx++;
+	if (head->dev[dd_idx].towrite->bi_rw != sh->dev[dd_idx].towrite->bi_rw)
+		goto unlock_out;
+
+	if (head->batch_head) {
+		spin_lock(&head->batch_head->batch_lock);
+		/* This batch list is already running */
+		if (!stripe_can_batch(head)) {
+			spin_unlock(&head->batch_head->batch_lock);
+			goto unlock_out;
+		}
+
+		/*
+		 * at this point, head's BATCH_READY could be cleared, but we
+		 * can still add the stripe to batch list
+		 */
+		list_add(&sh->batch_list, &head->batch_list);
+		spin_unlock(&head->batch_head->batch_lock);
+
+		sh->batch_head = head->batch_head;
+	} else {
+		head->batch_head = head;
+		sh->batch_head = head->batch_head;
+		spin_lock(&head->batch_lock);
+		list_add_tail(&sh->batch_list, &head->batch_list);
+		spin_unlock(&head->batch_lock);
+	}
+
+	if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
+		if (atomic_dec_return(&conf->preread_active_stripes)
+		    < IO_THRESHOLD)
+			md_wakeup_thread(conf->mddev->thread);
+
+	atomic_inc(&sh->count);
+unlock_out:
+	unlock_two_stripes(head, sh);
+out:
+	release_stripe(head);
+}
+
 /* Determine if 'data_offset' or 'new_data_offset' should be used
  * in this stripe_head.
  */
@@ -747,6 +866,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 {
 	struct r5conf *conf = sh->raid_conf;
 	int i, disks = sh->disks;
+	struct stripe_head *head_sh = sh;
 
 	might_sleep();
 
@@ -755,6 +875,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		int replace_only = 0;
 		struct bio *bi, *rbi;
 		struct md_rdev *rdev, *rrdev = NULL;
+
+		sh = head_sh;
 		if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
 			if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
 				rw = WRITE_FUA;
@@ -773,6 +895,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		if (test_and_clear_bit(R5_SyncIO, &sh->dev[i].flags))
 			rw |= REQ_SYNC;
 
+again:
 		bi = &sh->dev[i].req;
 		rbi = &sh->dev[i].rreq; /* For writing to replacement */
 
@@ -791,7 +914,7 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				/* We raced and saw duplicates */
 				rrdev = NULL;
 		} else {
-			if (test_bit(R5_ReadRepl, &sh->dev[i].flags) && rrdev)
+			if (test_bit(R5_ReadRepl, &head_sh->dev[i].flags) && rrdev)
 				rdev = rrdev;
 			rrdev = NULL;
 		}
@@ -862,13 +985,15 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				__func__, (unsigned long long)sh->sector,
 				bi->bi_rw, i);
 			atomic_inc(&sh->count);
+			if (sh != head_sh)
+				atomic_inc(&head_sh->count);
 			if (use_new_offset(conf, sh))
 				bi->bi_iter.bi_sector = (sh->sector
 						 + rdev->new_data_offset);
 			else
 				bi->bi_iter.bi_sector = (sh->sector
 						 + rdev->data_offset);
-			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
+			if (test_bit(R5_ReadNoMerge, &head_sh->dev[i].flags))
 				bi->bi_rw |= REQ_NOMERGE;
 
 			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
@@ -912,6 +1037,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				__func__, (unsigned long long)sh->sector,
 				rbi->bi_rw, i);
 			atomic_inc(&sh->count);
+			if (sh != head_sh)
+				atomic_inc(&head_sh->count);
 			if (use_new_offset(conf, sh))
 				rbi->bi_iter.bi_sector = (sh->sector
 						  + rrdev->new_data_offset);
@@ -945,6 +1072,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
+
+		if (!head_sh->batch_head)
+			continue;
+		sh = list_first_entry(&sh->batch_list, struct stripe_head,
+				      batch_list);
+		if (sh != head_sh)
+			goto again;
 	}
 }
 
@@ -1060,6 +1194,7 @@ static void ops_run_biofill(struct stripe_head *sh)
 	struct async_submit_ctl submit;
 	int i;
 
+	BUG_ON(sh->batch_head);
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
@@ -1148,6 +1283,8 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 	struct async_submit_ctl submit;
 	int i;
 
+	BUG_ON(sh->batch_head);
+
 	pr_debug("%s: stripe %llu block: %d\n",
 		__func__, (unsigned long long)sh->sector, target);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
@@ -1214,6 +1351,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 	int i;
 	int count;
 
+	BUG_ON(sh->batch_head);
 	if (sh->ops.target < 0)
 		target = sh->ops.target2;
 	else if (sh->ops.target2 < 0)
@@ -1272,6 +1410,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 	struct page **blocks = to_addr_page(percpu, 0);
 	struct async_submit_ctl submit;
 
+	BUG_ON(sh->batch_head);
 	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
 		 __func__, (unsigned long long)sh->sector, target, target2);
 	BUG_ON(target < 0 || target2 < 0);
@@ -1384,6 +1523,7 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	/* existing parity data subtracted */
 	struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 
+	BUG_ON(sh->batch_head);
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
@@ -1406,17 +1546,21 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
 	int i;
+	struct stripe_head *head_sh = sh;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
 	for (i = disks; i--; ) {
-		struct r5dev *dev = &sh->dev[i];
+		struct r5dev *dev;
 		struct bio *chosen;
 
-		if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
+		sh = head_sh;
+		if (test_and_clear_bit(R5_Wantdrain, &head_sh->dev[i].flags)) {
 			struct bio *wbi;
 
+again:
+			dev = &sh->dev[i];
 			spin_lock_irq(&sh->stripe_lock);
 			chosen = dev->towrite;
 			dev->towrite = NULL;
@@ -1445,6 +1589,15 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 				}
 				wbi = r5_next_bio(wbi, dev->sector);
 			}
+
+			if (head_sh->batch_head) {
+				sh = list_first_entry(&sh->batch_list,
+						      struct stripe_head,
+						      batch_list);
+				if (sh == head_sh)
+					continue;
+				goto again;
+			}
 		}
 	}
 
@@ -1500,12 +1653,15 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 		     struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
-	struct page **xor_srcs = to_addr_page(percpu, 0);
+	struct page **xor_srcs;
 	struct async_submit_ctl submit;
-	int count = 0, pd_idx = sh->pd_idx, i;
+	int count, pd_idx = sh->pd_idx, i;
 	struct page *xor_dest;
 	int prexor = 0;
 	unsigned long flags;
+	int j = 0;
+	struct stripe_head *head_sh = sh;
+	int last_stripe;
 
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
@@ -1522,15 +1678,18 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 		ops_complete_reconstruct(sh);
 		return;
 	}
+again:
+	count = 0;
+	xor_srcs = to_addr_page(percpu, j);
 	/* check if prexor is active which means only process blocks
 	 * that are part of a read-modify-write (written)
 	 */
-	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+	if (head_sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
 		prexor = 1;
 		xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (dev->written)
+			if (head_sh->dev[i].written)
 				xor_srcs[count++] = dev->page;
 		}
 	} else {
@@ -1547,17 +1706,32 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
 	 * for the synchronous xor case
 	 */
-	flags = ASYNC_TX_ACK |
-		(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
-
-	atomic_inc(&sh->count);
+	last_stripe = !head_sh->batch_head ||
+		list_first_entry(&sh->batch_list,
+				 struct stripe_head, batch_list) == head_sh;
+	if (last_stripe) {
+		flags = ASYNC_TX_ACK |
+			(prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
+
+		atomic_inc(&head_sh->count);
+		init_async_submit(&submit, flags, tx, ops_complete_reconstruct, head_sh,
+				  to_addr_conv(sh, percpu, j));
+	} else {
+		flags = prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST;
+		init_async_submit(&submit, flags, tx, NULL, NULL,
+				  to_addr_conv(sh, percpu, j));
+	}
 
-	init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
-			  to_addr_conv(sh, percpu, 0));
 	if (unlikely(count == 1))
 		tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
 	else
 		tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
+	if (!last_stripe) {
+		j++;
+		sh = list_first_entry(&sh->batch_list, struct stripe_head,
+				      batch_list);
+		goto again;
+	}
 }
 
 static void
@@ -1565,8 +1739,10 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 		     struct dma_async_tx_descriptor *tx)
 {
 	struct async_submit_ctl submit;
-	struct page **blocks = to_addr_page(percpu, 0);
-	int count, i;
+	struct page **blocks;
+	int count, i, j = 0;
+	struct stripe_head *head_sh = sh;
+	int last_stripe;
 
 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
@@ -1584,13 +1760,27 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 		return;
 	}
 
+again:
+	blocks = to_addr_page(percpu, j);
 	count = set_syndrome_sources(blocks, sh);
-
-	atomic_inc(&sh->count);
-
-	init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
-			  sh, to_addr_conv(sh, percpu, 0));
+	last_stripe = !head_sh->batch_head ||
+		list_first_entry(&sh->batch_list,
+				 struct stripe_head, batch_list) == head_sh;
+
+	if (last_stripe) {
+		atomic_inc(&head_sh->count);
+		init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+				  head_sh, to_addr_conv(sh, percpu, j));
+	} else
+		init_async_submit(&submit, 0, tx, NULL, NULL,
+				  to_addr_conv(sh, percpu, j));
 	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
+	if (!last_stripe) {
+		j++;
+		sh = list_first_entry(&sh->batch_list, struct stripe_head,
+				      batch_list);
+		goto again;
+	}
 }
 
 static void ops_complete_check(void *stripe_head_ref)
@@ -1620,6 +1810,7 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	BUG_ON(sh->batch_head);
 	count = 0;
 	xor_dest = sh->dev[pd_idx].page;
 	xor_srcs[count++] = xor_dest;
@@ -1648,6 +1839,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
 		(unsigned long long)sh->sector, checkp);
 
+	BUG_ON(sh->batch_head);
 	count = set_syndrome_sources(srcs, sh);
 	if (!checkp)
 		srcs[count] = NULL;
@@ -1715,7 +1907,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			BUG();
 	}
 
-	if (overlap_clear)
+	if (overlap_clear && !sh->batch_head)
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
 			if (test_and_clear_bit(R5_Overlap, &dev->flags))
@@ -1745,6 +1937,10 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
 	atomic_set(&sh->count, 1);
 	atomic_inc(&conf->active_stripes);
 	INIT_LIST_HEAD(&sh->lru);
+
+	spin_lock_init(&sh->batch_lock);
+	INIT_LIST_HEAD(&sh->batch_list);
+	sh->batch_head = NULL;
 	release_stripe(sh);
 	return 1;
 }
@@ -2188,6 +2384,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
 		clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
+
+	if (sh->batch_head && sh != sh->batch_head)
+		release_stripe(sh->batch_head);
 }
 
 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
@@ -2674,6 +2873,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	 * protect it.
 	 */
 	spin_lock_irq(&sh->stripe_lock);
+	/* Don't allow new IO added to stripes in batch list */
+	if (sh->batch_head)
+		goto overlap;
 	if (forwrite) {
 		bip = &sh->dev[dd_idx].towrite;
 		if (*bip == NULL)
@@ -2723,6 +2925,9 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 		sh->bm_seq = conf->seq_flush+1;
 		set_bit(STRIPE_BIT_DELAY, &sh->state);
 	}
+
+	if (stripe_can_batch(sh))
+		stripe_add_to_batch_list(conf, sh);
 	return 1;
 
  overlap:
@@ -2755,6 +2960,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct bio **return_bi)
 {
 	int i;
+	BUG_ON(sh->batch_head);
 	for (i = disks; i--; ) {
 		struct bio *bi;
 		int bitmap_end = 0;
@@ -2870,6 +3076,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh,
 	int abort = 0;
 	int i;
 
+	BUG_ON(sh->batch_head);
 	clear_bit(STRIPE_SYNCING, &sh->state);
 	if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags))
 		wake_up(&conf->wait_for_overlap);
@@ -3100,6 +3307,7 @@ static void handle_stripe_fill(struct stripe_head *sh,
 {
 	int i;
 
+	BUG_ON(sh->batch_head);
 	/* look for blocks to read/compute, skip this if a compute
 	 * is already in flight, or if the stripe contents are in the
 	 * midst of changing due to a write
@@ -3123,6 +3331,9 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 	int i;
 	struct r5dev *dev;
 	int discard_pending = 0;
+	struct stripe_head *head_sh = sh;
+	bool do_endio = false;
+	int wakeup_nr = 0;
 
 	for (i = disks; i--; )
 		if (sh->dev[i].written) {
@@ -3138,8 +3349,11 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 					clear_bit(R5_UPTODATE, &dev->flags);
 				if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
 					WARN_ON(test_bit(R5_UPTODATE, &dev->flags));
-					dev->page = dev->orig_page;
 				}
+				do_endio = true;
+
+returnbi:
+				dev->page = dev->orig_page;
 				wbi = dev->written;
 				dev->written = NULL;
 				while (wbi && wbi->bi_iter.bi_sector <
@@ -3156,6 +3370,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 						STRIPE_SECTORS,
 					 !test_bit(STRIPE_DEGRADED, &sh->state),
 						0);
+				if (head_sh->batch_head) {
+					sh = list_first_entry(&sh->batch_list,
+							      struct stripe_head,
+							      batch_list);
+					if (sh != head_sh) {
+						dev = &sh->dev[i];
+						goto returnbi;
+					}
+				}
+				sh = head_sh;
+				dev = &sh->dev[i];
 			} else if (test_bit(R5_Discard, &dev->flags))
 				discard_pending = 1;
 			WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
@@ -3177,8 +3402,17 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 		 * will be reinitialized
 		 */
 		spin_lock_irq(&conf->device_lock);
+unhash:
 		remove_hash(sh);
+		if (head_sh->batch_head) {
+			sh = list_first_entry(&sh->batch_list,
+					      struct stripe_head, batch_list);
+			if (sh != head_sh)
+					goto unhash;
+		}
 		spin_unlock_irq(&conf->device_lock);
+		sh = head_sh;
+
 		if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
 			set_bit(STRIPE_HANDLE, &sh->state);
 
@@ -3187,6 +3421,39 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 	if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
 		if (atomic_dec_and_test(&conf->pending_full_writes))
 			md_wakeup_thread(conf->mddev->thread);
+
+	if (!head_sh->batch_head || !do_endio)
+		return;
+	for (i = 0; i < head_sh->disks; i++) {
+		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+			wakeup_nr++;
+	}
+	while (!list_empty(&head_sh->batch_list)) {
+		int i;
+		sh = list_first_entry(&head_sh->batch_list,
+				      struct stripe_head, batch_list);
+		list_del_init(&sh->batch_list);
+
+		sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
+			(1 << STRIPE_PREREAD_ACTIVE)));
+		sh->check_state = head_sh->check_state;
+		sh->reconstruct_state = head_sh->reconstruct_state;
+		for (i = 0; i < sh->disks; i++) {
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				wakeup_nr++;
+			sh->dev[i].flags = head_sh->dev[i].flags;
+		}
+
+		spin_lock_irq(&sh->stripe_lock);
+		sh->batch_head = NULL;
+		spin_unlock_irq(&sh->stripe_lock);
+		release_stripe(sh);
+	}
+
+	spin_lock_irq(&head_sh->stripe_lock);
+	head_sh->batch_head = NULL;
+	spin_unlock_irq(&head_sh->stripe_lock);
+	wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,
@@ -3326,6 +3593,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
 {
 	struct r5dev *dev = NULL;
 
+	BUG_ON(sh->batch_head);
 	set_bit(STRIPE_HANDLE, &sh->state);
 
 	switch (sh->check_state) {
@@ -3416,6 +3684,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
 	int qd_idx = sh->qd_idx;
 	struct r5dev *dev;
 
+	BUG_ON(sh->batch_head);
 	set_bit(STRIPE_HANDLE, &sh->state);
 
 	BUG_ON(s->failed > 2);
@@ -3579,6 +3848,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
 	 * copy some of them into a target stripe for expand.
 	 */
 	struct dma_async_tx_descriptor *tx = NULL;
+	BUG_ON(sh->batch_head);
 	clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 	for (i = 0; i < sh->disks; i++)
 		if (i != sh->pd_idx && i != sh->qd_idx) {
@@ -3822,6 +4092,38 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 	rcu_read_unlock();
 }
 
+static int clear_batch_ready(struct stripe_head *sh)
+{
+	struct stripe_head *tmp;
+	if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
+		return 0;
+	spin_lock(&sh->stripe_lock);
+	if (!sh->batch_head) {
+		spin_unlock(&sh->stripe_lock);
+		return 0;
+	}
+
+	/*
+	 * this stripe could be added to a batch list before we check
+	 * BATCH_READY, skips it
+	 */
+	if (sh->batch_head != sh) {
+		spin_unlock(&sh->stripe_lock);
+		return 1;
+	}
+	spin_lock(&sh->batch_lock);
+	list_for_each_entry(tmp, &sh->batch_list, batch_list)
+		clear_bit(STRIPE_BATCH_READY, &tmp->state);
+	spin_unlock(&sh->batch_lock);
+	spin_unlock(&sh->stripe_lock);
+
+	/*
+	 * BATCH_READY is cleared, no new stripes can be added.
+	 * batch_list can be accessed without lock
+	 */
+	return 0;
+}
+
 static void handle_stripe(struct stripe_head *sh)
 {
 	struct stripe_head_state s;
@@ -3839,7 +4141,11 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
-	clear_bit(STRIPE_BATCH_READY, &sh->state);
+	if (clear_batch_ready(sh) ) {
+		clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
+		return;
+	}
+
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 		spin_lock(&sh->stripe_lock);
 		/* Cannot process 'sync' concurrently with 'discard' */
@@ -4824,7 +5130,8 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 			}
 			set_bit(STRIPE_HANDLE, &sh->state);
 			clear_bit(STRIPE_DELAYED, &sh->state);
-			if ((bi->bi_rw & REQ_SYNC) &&
+			if ((!sh->batch_head || sh == sh->batch_head) &&
+			    (bi->bi_rw & REQ_SYNC) &&
 			    !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
 				atomic_inc(&conf->preread_active_stripes);
 			release_stripe_plug(mddev, sh);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4cc1a48127c7..c8d0004dca8f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -219,6 +219,10 @@ struct stripe_head {
 	spinlock_t		stripe_lock;
 	int			cpu;
 	struct r5worker_group	*group;
+
+	struct stripe_head	*batch_head; /* protected by stripe lock */
+	spinlock_t		batch_lock; /* only header's lock is useful */
+	struct list_head	batch_list; /* protected by head's batch lock*/
 	/**
 	 * struct stripe_operations
 	 * @target - STRIPE_OP_COMPUTE_BLK target
-- 
cgit v1.2.1


From 72ac733015bbdc0356ba3e92c52137a265910a91 Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:03 +1100
Subject: raid5: handle io error of batch list

If an I/O error happens in any stripe of a batch list, the batch list is
split up, and normal processing then runs for each stripe in the list.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.h |  1 +
 2 files changed, 49 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 717189e74243..54f3cb312b42 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1070,6 +1070,9 @@ again:
 			pr_debug("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
+			if (sh->batch_head)
+				set_bit(STRIPE_BATCH_ERR,
+					&sh->batch_head->state);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
 
@@ -2380,6 +2383,9 @@ static void raid5_end_write_request(struct bio *bi, int error)
 	}
 	rdev_dec_pending(rdev, conf->mddev);
 
+	if (sh->batch_head && !uptodate)
+		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
+
 	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
 		clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -4124,6 +4130,46 @@ static int clear_batch_ready(struct stripe_head *sh)
 	return 0;
 }
 
+static void check_break_stripe_batch_list(struct stripe_head *sh)
+{
+	struct stripe_head *head_sh, *next;
+	int i;
+
+	if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+		return;
+
+	head_sh = sh;
+	do {
+		sh = list_first_entry(&sh->batch_list,
+				      struct stripe_head, batch_list);
+		BUG_ON(sh == head_sh);
+	} while (!test_bit(STRIPE_DEGRADED, &sh->state));
+
+	while (sh != head_sh) {
+		next = list_first_entry(&sh->batch_list,
+					struct stripe_head, batch_list);
+		list_del_init(&sh->batch_list);
+
+		sh->state = head_sh->state & ~((1 << STRIPE_ACTIVE) |
+					       (1 << STRIPE_PREREAD_ACTIVE) |
+					       (1 << STRIPE_DEGRADED));
+		sh->check_state = head_sh->check_state;
+		sh->reconstruct_state = head_sh->reconstruct_state;
+		for (i = 0; i < sh->disks; i++)
+			sh->dev[i].flags = head_sh->dev[i].flags &
+				(~((1 << R5_WriteError) | (1 << R5_Overlap)));
+
+		spin_lock_irq(&sh->stripe_lock);
+		sh->batch_head = NULL;
+		spin_unlock_irq(&sh->stripe_lock);
+
+		set_bit(STRIPE_HANDLE, &sh->state);
+		release_stripe(sh);
+
+		sh = next;
+	}
+}
+
 static void handle_stripe(struct stripe_head *sh)
 {
 	struct stripe_head_state s;
@@ -4146,6 +4192,8 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
+	check_break_stripe_batch_list(sh);
+
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
 		spin_lock(&sh->stripe_lock);
 		/* Cannot process 'sync' concurrently with 'discard' */
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index c8d0004dca8f..cf3562e99440 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -336,6 +336,7 @@ enum {
 	STRIPE_DISCARD,
 	STRIPE_ON_RELEASE_LIST,
 	STRIPE_BATCH_READY,
+	STRIPE_BATCH_ERR,
 };
 
 /*
-- 
cgit v1.2.1


From dabc4ec6ba72418ebca6bf1884f344bba40c8709 Mon Sep 17 00:00:00 2001
From: "shli@kernel.org" <shli@kernel.org>
Date: Mon, 15 Dec 2014 12:57:04 +1100
Subject: raid5: handle expansion/resync case with stripe batching

Expansion/resync can grab a stripe while the stripe is in a batch list.
Since all stripes in a batch list must be in the same state, we can't allow
some of them to run into expansion/resync. So we delay expansion/resync for
stripes in a batch list.

Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 24 ++++++++++++++++--------
 drivers/md/raid5.h |  5 +++++
 2 files changed, 21 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 54f3cb312b42..3ae097d50b51 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3440,8 +3440,10 @@ unhash:
 				      struct stripe_head, batch_list);
 		list_del_init(&sh->batch_list);
 
-		sh->state = head_sh->state & (~((1 << STRIPE_ACTIVE) |
-			(1 << STRIPE_PREREAD_ACTIVE)));
+		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+			      head_sh->state & ~((1 << STRIPE_ACTIVE) |
+						 (1 << STRIPE_PREREAD_ACTIVE) |
+						 STRIPE_EXPAND_SYNC_FLAG));
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
 		for (i = 0; i < sh->disks; i++) {
@@ -3453,6 +3455,8 @@ unhash:
 		spin_lock_irq(&sh->stripe_lock);
 		sh->batch_head = NULL;
 		spin_unlock_irq(&sh->stripe_lock);
+		if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+			set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
 	}
 
@@ -3460,6 +3464,8 @@ unhash:
 	head_sh->batch_head = NULL;
 	spin_unlock_irq(&head_sh->stripe_lock);
 	wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
+	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+		set_bit(STRIPE_HANDLE, &head_sh->state);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,
@@ -3927,8 +3933,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
 	memset(s, 0, sizeof(*s));
 
-	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
-	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
+	s->expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state) && !sh->batch_head;
+	s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
 	s->failed_num[0] = -1;
 	s->failed_num[1] = -1;
 
@@ -4150,9 +4156,11 @@ static void check_break_stripe_batch_list(struct stripe_head *sh)
 					struct stripe_head, batch_list);
 		list_del_init(&sh->batch_list);
 
-		sh->state = head_sh->state & ~((1 << STRIPE_ACTIVE) |
-					       (1 << STRIPE_PREREAD_ACTIVE) |
-					       (1 << STRIPE_DEGRADED));
+		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
+			      head_sh->state & ~((1 << STRIPE_ACTIVE) |
+						 (1 << STRIPE_PREREAD_ACTIVE) |
+						 (1 << STRIPE_DEGRADED) |
+						 STRIPE_EXPAND_SYNC_FLAG));
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
 		for (i = 0; i < sh->disks; i++)
@@ -4194,7 +4202,7 @@ static void handle_stripe(struct stripe_head *sh)
 
 	check_break_stripe_batch_list(sh);
 
-	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
+	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);
 		/* Cannot process 'sync' concurrently with 'discard' */
 		if (!test_bit(STRIPE_DISCARD, &sh->state) &&
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index cf3562e99440..ee65ed844d3f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -339,6 +339,11 @@ enum {
 	STRIPE_BATCH_ERR,
 };
 
+#define STRIPE_EXPAND_SYNC_FLAG \
+	((1 << STRIPE_EXPAND_SOURCE) |\
+	(1 << STRIPE_EXPAND_READY) |\
+	(1 << STRIPE_EXPANDING) |\
+	(1 << STRIPE_SYNC_REQUESTED))
 /*
  * Operation request flags
  */
-- 
cgit v1.2.1


From 584acdd49cd2472ca0f5a06adbe979db82d0b4af Mon Sep 17 00:00:00 2001
From: Markus Stockhausen <stockhausen@collogia.de>
Date: Mon, 15 Dec 2014 12:57:05 +1100
Subject: md/raid5: activate raid6 rmw feature

Glue it all together. The raid6 rmw path should work the same as the
already existing raid5 logic. So emulate the prexor handling/flags
and split functions as needed.

1) Enable xor_syndrome() in the async layer.

2) Split ops_run_prexor() into RAID4/5 and RAID6 logic. Xor the syndrome
at the start of an rmw run, as was already done for the single parity.

3) Take care of rmw run in ops_run_reconstruct6(). Again process only
the changed pages to get syndrome back into sync.

4) Enhance set_syndrome_sources() to fill NULL pages if we are in an rmw
run. The lower layers will calculate start & end pages from that and
call xor_syndrome() accordingly.

5) Adapt the several places where we ignored Q handling up to now.

Performance numbers for a single E5630 system with a mix of 10 7200k
desktop/server disks. 300 seconds random write with 8 threads onto a
3.2TB (10*400GB) RAID6 64K chunk without spare (group_thread_cnt=4)

bsize   rmw_level=1   rmw_level=0   rmw_level=1   rmw_level=0
        skip_copy=1   skip_copy=1   skip_copy=0   skip_copy=0
   4K      115 KB/s      141 KB/s      165 KB/s      140 KB/s
   8K      225 KB/s      275 KB/s      324 KB/s      274 KB/s
  16K      434 KB/s      536 KB/s      640 KB/s      534 KB/s
  32K      751 KB/s    1,051 KB/s    1,234 KB/s    1,045 KB/s
  64K    1,339 KB/s    1,958 KB/s    2,282 KB/s    1,962 KB/s
 128K    2,673 KB/s    3,862 KB/s    4,113 KB/s    3,898 KB/s
 256K    7,685 KB/s    7,539 KB/s    7,557 KB/s    7,638 KB/s
 512K   19,556 KB/s   19,558 KB/s   19,652 KB/s   19,688 KB/s

Signed-off-by: Markus Stockhausen <stockhausen@collogia.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 104 +++++++++++++++++++++++++++++++++++++++--------------
 drivers/md/raid5.h |  19 +++++++++-
 2 files changed, 96 insertions(+), 27 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3ae097d50b51..c82ce1fd8723 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1317,7 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
  * destination buffer is recorded in srcs[count] and the Q destination
  * is recorded in srcs[count+1]].
  */
-static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
+static int set_syndrome_sources(struct page **srcs,
+				struct stripe_head *sh,
+				int srctype)
 {
 	int disks = sh->disks;
 	int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
@@ -1332,8 +1334,15 @@ static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
 	i = d0_idx;
 	do {
 		int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
+		struct r5dev *dev = &sh->dev[i];
 
-		srcs[slot] = sh->dev[i].page;
+		if (i == sh->qd_idx || i == sh->pd_idx ||
+		    (srctype == SYNDROME_SRC_ALL) ||
+		    (srctype == SYNDROME_SRC_WANT_DRAIN &&
+		     test_bit(R5_Wantdrain, &dev->flags)) ||
+		    (srctype == SYNDROME_SRC_WRITTEN &&
+		     dev->written))
+			srcs[slot] = sh->dev[i].page;
 		i = raid6_next_disk(i, disks);
 	} while (i != d0_idx);
 
@@ -1373,7 +1382,7 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 	atomic_inc(&sh->count);
 
 	if (target == qd_idx) {
-		count = set_syndrome_sources(blocks, sh);
+		count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
 		blocks[count] = NULL; /* regenerating p is not necessary */
 		BUG_ON(blocks[count+1] != dest); /* q should already be set */
 		init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
@@ -1481,7 +1490,7 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 			tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
 				       &submit);
 
-			count = set_syndrome_sources(blocks, sh);
+			count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL);
 			init_async_submit(&submit, ASYNC_TX_FENCE, tx,
 					  ops_complete_compute, sh,
 					  to_addr_conv(sh, percpu, 0));
@@ -1515,8 +1524,8 @@ static void ops_complete_prexor(void *stripe_head_ref)
 }
 
 static struct dma_async_tx_descriptor *
-ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
-	       struct dma_async_tx_descriptor *tx)
+ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
+		struct dma_async_tx_descriptor *tx)
 {
 	int disks = sh->disks;
 	struct page **xor_srcs = to_addr_page(percpu, 0);
@@ -1544,6 +1553,26 @@ ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
 	return tx;
 }
 
+static struct dma_async_tx_descriptor *
+ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
+		struct dma_async_tx_descriptor *tx)
+{
+	struct page **blocks = to_addr_page(percpu, 0);
+	int count;
+	struct async_submit_ctl submit;
+
+	pr_debug("%s: stripe %llu\n", __func__,
+		(unsigned long long)sh->sector);
+
+	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
+
+	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
+			  ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0));
+	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
+
+	return tx;
+}
+
 static struct dma_async_tx_descriptor *
 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
 {
@@ -1746,6 +1775,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 	int count, i, j = 0;
 	struct stripe_head *head_sh = sh;
 	int last_stripe;
+	int synflags;
+	unsigned long txflags;
 
 	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
 
@@ -1765,14 +1796,23 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 
 again:
 	blocks = to_addr_page(percpu, j);
-	count = set_syndrome_sources(blocks, sh);
+
+	if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
+		synflags = SYNDROME_SRC_WRITTEN;
+		txflags = ASYNC_TX_ACK | ASYNC_TX_PQ_XOR_DST;
+	} else {
+		synflags = SYNDROME_SRC_ALL;
+		txflags = ASYNC_TX_ACK;
+	}
+
+	count = set_syndrome_sources(blocks, sh, synflags);
 	last_stripe = !head_sh->batch_head ||
 		list_first_entry(&sh->batch_list,
 				 struct stripe_head, batch_list) == head_sh;
 
 	if (last_stripe) {
 		atomic_inc(&head_sh->count);
-		init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
+		init_async_submit(&submit, txflags, tx, ops_complete_reconstruct,
 				  head_sh, to_addr_conv(sh, percpu, j));
 	} else
 		init_async_submit(&submit, 0, tx, NULL, NULL,
@@ -1843,7 +1883,7 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 		(unsigned long long)sh->sector, checkp);
 
 	BUG_ON(sh->batch_head);
-	count = set_syndrome_sources(srcs, sh);
+	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
 	if (!checkp)
 		srcs[count] = NULL;
 
@@ -1884,8 +1924,12 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 			async_tx_ack(tx);
 	}
 
-	if (test_bit(STRIPE_OP_PREXOR, &ops_request))
-		tx = ops_run_prexor(sh, percpu, tx);
+	if (test_bit(STRIPE_OP_PREXOR, &ops_request)) {
+		if (level < 6)
+			tx = ops_run_prexor5(sh, percpu, tx);
+		else
+			tx = ops_run_prexor6(sh, percpu, tx);
+	}
 
 	if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
 		tx = ops_run_biodrain(sh, tx);
@@ -2770,7 +2814,7 @@ static void
 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			 int rcw, int expand)
 {
-	int i, pd_idx = sh->pd_idx, disks = sh->disks;
+	int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx, disks = sh->disks;
 	struct r5conf *conf = sh->raid_conf;
 	int level = conf->level;
 
@@ -2806,13 +2850,15 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
 			if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
 				atomic_inc(&conf->pending_full_writes);
 	} else {
-		BUG_ON(level == 6);
 		BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
 			test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+		BUG_ON(level == 6 &&
+			(!(test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags) ||
+			   test_bit(R5_Wantcompute, &sh->dev[qd_idx].flags))));
 
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if (i == pd_idx)
+			if (i == pd_idx || i == qd_idx)
 				continue;
 
 			if (dev->towrite &&
@@ -3476,28 +3522,27 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	int rmw = 0, rcw = 0, i;
 	sector_t recovery_cp = conf->mddev->recovery_cp;
 
-	/* RAID6 requires 'rcw' in current implementation.
-	 * Otherwise, check whether resync is now happening or should start.
+	/* Check whether resync is now happening or should start.
 	 * If yes, then the array is dirty (after unclean shutdown or
 	 * initial creation), so parity in some stripes might be inconsistent.
 	 * In this case, we need to always do reconstruct-write, to ensure
 	 * that in case of drive failure or read-error correction, we
 	 * generate correct data from the parity.
 	 */
-	if (conf->max_degraded == 2 ||
+	if (conf->rmw_level == PARITY_DISABLE_RMW ||
 	    (recovery_cp < MaxSector && sh->sector >= recovery_cp &&
 	     s->failed == 0)) {
 		/* Calculate the real rcw later - for now make it
 		 * look like rcw is cheaper
 		 */
 		rcw = 1; rmw = 2;
-		pr_debug("force RCW max_degraded=%u, recovery_cp=%llu sh->sector=%llu\n",
-			 conf->max_degraded, (unsigned long long)recovery_cp,
+		pr_debug("force RCW rmw_level=%u, recovery_cp=%llu sh->sector=%llu\n",
+			 conf->rmw_level, (unsigned long long)recovery_cp,
 			 (unsigned long long)sh->sector);
 	} else for (i = disks; i--; ) {
 		/* would I have to read this buffer for read_modify_write */
 		struct r5dev *dev = &sh->dev[i];
-		if ((dev->towrite || i == sh->pd_idx) &&
+		if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
 		      test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3507,7 +3552,8 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 				rmw += 2*disks;  /* cannot read it */
 		}
 		/* Would I have to read this buffer for reconstruct_write */
-		if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
+		if (!test_bit(R5_OVERWRITE, &dev->flags) &&
+		    i != sh->pd_idx && i != sh->qd_idx &&
 		    !test_bit(R5_LOCKED, &dev->flags) &&
 		    !(test_bit(R5_UPTODATE, &dev->flags) ||
 		    test_bit(R5_Wantcompute, &dev->flags))) {
@@ -3520,7 +3566,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 	pr_debug("for sector %llu, rmw=%d rcw=%d\n",
 		(unsigned long long)sh->sector, rmw, rcw);
 	set_bit(STRIPE_HANDLE, &sh->state);
-	if (rmw < rcw && rmw > 0) {
+	if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_ENABLE_RMW)) && rmw > 0) {
 		/* prefer read-modify-write, but need to get some data */
 		if (conf->mddev->queue)
 			blk_add_trace_msg(conf->mddev->queue,
@@ -3528,7 +3574,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 					  (unsigned long long)sh->sector, rmw);
 		for (i = disks; i--; ) {
 			struct r5dev *dev = &sh->dev[i];
-			if ((dev->towrite || i == sh->pd_idx) &&
+			if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx) &&
 			    !test_bit(R5_LOCKED, &dev->flags) &&
 			    !(test_bit(R5_UPTODATE, &dev->flags) ||
 			    test_bit(R5_Wantcompute, &dev->flags)) &&
@@ -3547,7 +3593,7 @@ static void handle_stripe_dirtying(struct r5conf *conf,
 			}
 		}
 	}
-	if (rcw <= rmw && rcw > 0) {
+	if ((rcw < rmw || (rcw == rmw && conf->rmw_level != PARITY_ENABLE_RMW)) && rcw > 0) {
 		/* want reconstruct write, but need to get some data */
 		int qread =0;
 		rcw = 0;
@@ -6344,10 +6390,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	}
 
 	conf->level = mddev->new_level;
-	if (conf->level == 6)
+	if (conf->level == 6) {
 		conf->max_degraded = 2;
-	else
+		if (raid6_call.xor_syndrome)
+			conf->rmw_level = PARITY_ENABLE_RMW;
+		else
+			conf->rmw_level = PARITY_DISABLE_RMW;
+	} else {
 		conf->max_degraded = 1;
+		conf->rmw_level = PARITY_ENABLE_RMW;
+	}
 	conf->algorithm = mddev->new_layout;
 	conf->reshape_progress = mddev->reshape_position;
 	if (conf->reshape_progress != MaxSector) {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ee65ed844d3f..57fef9ba36fa 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -355,6 +355,23 @@ enum {
 	STRIPE_OP_RECONSTRUCT,
 	STRIPE_OP_CHECK,
 };
+
+/*
+ * RAID parity calculation preferences
+ */
+enum {
+	PARITY_DISABLE_RMW = 0,
+	PARITY_ENABLE_RMW,
+};
+
+/*
+ * Pages requested from set_syndrome_sources()
+ */
+enum {
+	SYNDROME_SRC_ALL,
+	SYNDROME_SRC_WANT_DRAIN,
+	SYNDROME_SRC_WRITTEN,
+};
 /*
  * Plugging:
  *
@@ -411,7 +428,7 @@ struct r5conf {
 	spinlock_t		hash_locks[NR_STRIPE_HASH_LOCKS];
 	struct mddev		*mddev;
 	int			chunk_sectors;
-	int			level, algorithm;
+	int			level, algorithm, rmw_level;
 	int			max_degraded;
 	int			raid_disks;
 	int			max_nr_stripes;
-- 
cgit v1.2.1


From d06f191f8ecaef4d524e765fdb455f96392fbd42 Mon Sep 17 00:00:00 2001
From: Markus Stockhausen <stockhausen@collogia.de>
Date: Mon, 15 Dec 2014 12:57:05 +1100
Subject: md/raid5: introduce configuration option rmw_level

Depending on the available coding we allow optimized rmw logic for write
operations. To support easier testing this patch allows manual control
of the rmw/rcw decision through the interface /sys/block/mdX/md/rmw_level.

The configuration can handle three levels of control.

rmw_level=0: Disable rmw for all RAID types. Hardware assisted P/Q
calculation has no implementation path yet to factor in/out chunks of
a syndrome. Enforcing this level can be beneficial for slow CPUs with
hardware syndrome support and fast SSDs.

rmw_level=1: Estimate rmw IOs and rcw IOs. Execute rmw only if we will
save IOs. This equals the "old" unpatched behaviour and will be the
default.

rmw_level=2: Execute rmw even if calculated IOs for rmw and rcw are
equal. We might have higher CPU consumption because of calculating the
parity twice but it can be beneficial otherwise. E.g. RAID4 with fast
dedicated parity disk/SSD. The option is implemented just to be
forward-looking and will ONLY work with this patch!

Signed-off-by: Markus Stockhausen <stockhausen@collogia.de>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.h |  1 +
 2 files changed, 45 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c82ce1fd8723..f78b1964543b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5879,6 +5879,49 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
 				raid5_show_stripe_cache_size,
 				raid5_store_stripe_cache_size);
 
+static ssize_t
+raid5_show_rmw_level(struct mddev  *mddev, char *page)
+{
+	struct r5conf *conf = mddev->private;
+	if (conf)
+		return sprintf(page, "%d\n", conf->rmw_level);
+	else
+		return 0;
+}
+
+static ssize_t
+raid5_store_rmw_level(struct mddev  *mddev, const char *page, size_t len)
+{
+	struct r5conf *conf = mddev->private;
+	unsigned long new;
+
+	if (!conf)
+		return -ENODEV;
+
+	if (len >= PAGE_SIZE)
+		return -EINVAL;
+
+	if (kstrtoul(page, 10, &new))
+		return -EINVAL;
+
+	if (new != PARITY_DISABLE_RMW && !raid6_call.xor_syndrome)
+		return -EINVAL;
+
+	if (new != PARITY_DISABLE_RMW &&
+	    new != PARITY_ENABLE_RMW &&
+	    new != PARITY_PREFER_RMW)
+		return -EINVAL;
+
+	conf->rmw_level = new;
+	return len;
+}
+
+static struct md_sysfs_entry
+raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR,
+			 raid5_show_rmw_level,
+			 raid5_store_rmw_level);
+
+
 static ssize_t
 raid5_show_preread_threshold(struct mddev *mddev, char *page)
 {
@@ -6065,6 +6108,7 @@ static struct attribute *raid5_attrs[] =  {
 	&raid5_preread_bypass_threshold.attr,
 	&raid5_group_thread_cnt.attr,
 	&raid5_skip_copy.attr,
+	&raid5_rmw_level.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 57fef9ba36fa..6614ac5ffc0e 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -362,6 +362,7 @@ enum {
 enum {
 	PARITY_DISABLE_RMW = 0,
 	PARITY_ENABLE_RMW,
+	PARITY_PREFER_RMW,
 };
 
 /*
-- 
cgit v1.2.1


From a9683a795bcca6d0e7fe4c4c00e071218f3f4428 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Feb 2015 12:02:51 +1100
Subject: md/raid5: pass gfp_t arg to grow_one_stripe()

This is needed for future improvement to stripe cache management.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f78b1964543b..ed8e34153c3d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -497,7 +497,7 @@ static void shrink_buffers(struct stripe_head *sh)
 	}
 }
 
-static int grow_buffers(struct stripe_head *sh)
+static int grow_buffers(struct stripe_head *sh, gfp_t gfp)
 {
 	int i;
 	int num = sh->raid_conf->pool_size;
@@ -505,7 +505,7 @@ static int grow_buffers(struct stripe_head *sh)
 	for (i = 0; i < num; i++) {
 		struct page *page;
 
-		if (!(page = alloc_page(GFP_KERNEL))) {
+		if (!(page = alloc_page(gfp))) {
 			return 1;
 		}
 		sh->dev[i].page = page;
@@ -1963,10 +1963,10 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
 
-static int grow_one_stripe(struct r5conf *conf, int hash)
+static int grow_one_stripe(struct r5conf *conf, int hash, gfp_t gfp)
 {
 	struct stripe_head *sh;
-	sh = kmem_cache_zalloc(conf->slab_cache, GFP_KERNEL);
+	sh = kmem_cache_zalloc(conf->slab_cache, gfp);
 	if (!sh)
 		return 0;
 
@@ -1974,7 +1974,7 @@ static int grow_one_stripe(struct r5conf *conf, int hash)
 
 	spin_lock_init(&sh->stripe_lock);
 
-	if (grow_buffers(sh)) {
+	if (grow_buffers(sh, gfp)) {
 		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
 		return 0;
@@ -2016,7 +2016,7 @@ static int grow_stripes(struct r5conf *conf, int num)
 	conf->pool_size = devs;
 	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
 	while (num--) {
-		if (!grow_one_stripe(conf, hash))
+		if (!grow_one_stripe(conf, hash, GFP_KERNEL))
 			return 1;
 		conf->max_nr_stripes++;
 		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
@@ -5841,7 +5841,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 		return err;
 	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
 	while (size > conf->max_nr_stripes) {
-		if (grow_one_stripe(conf, hash))
+		if (grow_one_stripe(conf, hash, GFP_KERNEL))
 			conf->max_nr_stripes++;
 		else break;
 		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-- 
cgit v1.2.1


From 486f0644c3482cbf64fe804499836e1f05abec14 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 25 Feb 2015 12:10:35 +1100
Subject: md/raid5: move max_nr_stripes management into grow_one_stripe and
 drop_one_stripe

Rather than adjusting max_nr_stripes whenever {grow,drop}_one_stripe()
succeeds, do it inside the functions.

Also choose the correct hash to handle next inside the functions.

This removes duplication and will help with future new uses of
{grow,drop}_one_stripe.

This also fixes a minor bug where the "md/raid:%md: allocate XXkB"
message always said "0kB".

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 57 +++++++++++++++++++++++-------------------------------
 1 file changed, 24 insertions(+), 33 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ed8e34153c3d..78ac7dc853c7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1963,7 +1963,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
 
-static int grow_one_stripe(struct r5conf *conf, int hash, gfp_t gfp)
+static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
 	struct stripe_head *sh;
 	sh = kmem_cache_zalloc(conf->slab_cache, gfp);
@@ -1979,7 +1979,8 @@ static int grow_one_stripe(struct r5conf *conf, int hash, gfp_t gfp)
 		kmem_cache_free(conf->slab_cache, sh);
 		return 0;
 	}
-	sh->hash_lock_index = hash;
+	sh->hash_lock_index =
+		conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
 	/* we just created an active stripe so... */
 	atomic_set(&sh->count, 1);
 	atomic_inc(&conf->active_stripes);
@@ -1989,6 +1990,7 @@ static int grow_one_stripe(struct r5conf *conf, int hash, gfp_t gfp)
 	INIT_LIST_HEAD(&sh->batch_list);
 	sh->batch_head = NULL;
 	release_stripe(sh);
+	conf->max_nr_stripes++;
 	return 1;
 }
 
@@ -1996,7 +1998,6 @@ static int grow_stripes(struct r5conf *conf, int num)
 {
 	struct kmem_cache *sc;
 	int devs = max(conf->raid_disks, conf->previous_raid_disks);
-	int hash;
 
 	if (conf->mddev->gendisk)
 		sprintf(conf->cache_name[0],
@@ -2014,13 +2015,10 @@ static int grow_stripes(struct r5conf *conf, int num)
 		return 1;
 	conf->slab_cache = sc;
 	conf->pool_size = devs;
-	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
-	while (num--) {
-		if (!grow_one_stripe(conf, hash, GFP_KERNEL))
+	while (num--)
+		if (!grow_one_stripe(conf, GFP_KERNEL))
 			return 1;
-		conf->max_nr_stripes++;
-		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-	}
+
 	return 0;
 }
 
@@ -2210,9 +2208,10 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	return err;
 }
 
-static int drop_one_stripe(struct r5conf *conf, int hash)
+static int drop_one_stripe(struct r5conf *conf)
 {
 	struct stripe_head *sh;
+	int hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
 
 	spin_lock_irq(conf->hash_locks + hash);
 	sh = get_free_stripe(conf, hash);
@@ -2223,15 +2222,15 @@ static int drop_one_stripe(struct r5conf *conf, int hash)
 	shrink_buffers(sh);
 	kmem_cache_free(conf->slab_cache, sh);
 	atomic_dec(&conf->active_stripes);
+	conf->max_nr_stripes--;
 	return 1;
 }
 
 static void shrink_stripes(struct r5conf *conf)
 {
-	int hash;
-	for (hash = 0; hash < NR_STRIPE_HASH_LOCKS; hash++)
-		while (drop_one_stripe(conf, hash))
-			;
+	while (conf->max_nr_stripes &&
+	       drop_one_stripe(conf))
+		;
 
 	if (conf->slab_cache)
 		kmem_cache_destroy(conf->slab_cache);
@@ -5822,30 +5821,22 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 {
 	struct r5conf *conf = mddev->private;
 	int err;
-	int hash;
 
 	if (size <= 16 || size > 32768)
 		return -EINVAL;
-	hash = (conf->max_nr_stripes - 1) % NR_STRIPE_HASH_LOCKS;
-	while (size < conf->max_nr_stripes) {
-		if (drop_one_stripe(conf, hash))
-			conf->max_nr_stripes--;
-		else
-			break;
-		hash--;
-		if (hash < 0)
-			hash = NR_STRIPE_HASH_LOCKS - 1;
-	}
+
+	while (size < conf->max_nr_stripes &&
+	       drop_one_stripe(conf))
+		;
+
 	err = md_allow_write(mddev);
 	if (err)
 		return err;
-	hash = conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
-	while (size > conf->max_nr_stripes) {
-		if (grow_one_stripe(conf, hash, GFP_KERNEL))
-			conf->max_nr_stripes++;
-		else break;
-		hash = (hash + 1) % NR_STRIPE_HASH_LOCKS;
-	}
+
+	while (size > conf->max_nr_stripes)
+		if (!grow_one_stripe(conf, GFP_KERNEL))
+			break;
+
 	return 0;
 }
 EXPORT_SYMBOL(raid5_set_cache_size);
@@ -6451,7 +6442,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 		conf->prev_algo = mddev->layout;
 	}
 
-	memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
+	memory = NR_STRIPES * (sizeof(struct stripe_head) +
 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
 	if (grow_stripes(conf, NR_STRIPES)) {
-- 
cgit v1.2.1


From 5423399a84ee1d92d29d763029ed40e4905cf50f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 26 Feb 2015 12:21:04 +1100
Subject: md/raid5: change ->inactive_blocked to a bit-flag.

This allows us to easily add more (atomic) flags.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 13 ++++++++-----
 drivers/md/raid5.h |  9 ++++++---
 2 files changed, 14 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 78ac7dc853c7..b7cd32e7f29e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -672,20 +672,23 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 				    *(conf->hash_locks + hash));
 		sh = __find_stripe(conf, sector, conf->generation - previous);
 		if (!sh) {
-			if (!conf->inactive_blocked)
+			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 				sh = get_free_stripe(conf, hash);
 			if (noblock && sh == NULL)
 				break;
 			if (!sh) {
-				conf->inactive_blocked = 1;
+				set_bit(R5_INACTIVE_BLOCKED,
+					&conf->cache_state);
 				wait_event_lock_irq(
 					conf->wait_for_stripe,
 					!list_empty(conf->inactive_list + hash) &&
 					(atomic_read(&conf->active_stripes)
 					 < (conf->max_nr_stripes * 3 / 4)
-					 || !conf->inactive_blocked),
+					 || !test_bit(R5_INACTIVE_BLOCKED,
+						      &conf->cache_state)),
 					*(conf->hash_locks + hash));
-				conf->inactive_blocked = 0;
+				clear_bit(R5_INACTIVE_BLOCKED,
+					  &conf->cache_state);
 			} else {
 				init_stripe(sh, sector, previous);
 				atomic_inc(&sh->count);
@@ -4602,7 +4605,7 @@ static int raid5_congested(struct mddev *mddev, int bits)
 	 * how busy the stripe_cache is
 	 */
 
-	if (conf->inactive_blocked)
+	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 		return 1;
 	if (conf->quiesce)
 		return 1;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 6614ac5ffc0e..ebe4e24bc14d 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -509,9 +509,11 @@ struct r5conf {
 	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_stripe;
 	wait_queue_head_t	wait_for_overlap;
-	int			inactive_blocked;	/* release of inactive stripes blocked,
-							 * waiting for 25% to be free
-							 */
+	unsigned long		cache_state;
+#define R5_INACTIVE_BLOCKED	1	/* release of inactive stripes blocked,
+					 * waiting for 25% to be free
+					 */
+
 	int			pool_size; /* number of disks in stripeheads in pool */
 	spinlock_t		device_lock;
 	struct disk_info	*disks;
@@ -526,6 +528,7 @@ struct r5conf {
 	int			worker_cnt_per_group;
 };
 
+
 /*
  * Our supported algorithms
  */
-- 
cgit v1.2.1


From edbe83ab4c27ea6669eb57adb5ed7eaec1118ceb Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 26 Feb 2015 12:47:56 +1100
Subject: md/raid5: allow the stripe_cache to grow and shrink.

The default setting of 256 stripe_heads is probably
much too small for many configurations.  So it is best to make it
auto-configure.

Shrinking the cache under memory pressure is easy.  The only
interesting part here is that we put a fairly high cost
('seeks') on shrinking the cache as the cost is greater than
just having to read more data, it reduces parallelism.

Growing the cache on demand needs to be done carefully.  If we allow
fast growth, that can upset memory balance as lots of dirty memory can
quickly turn into lots of memory queued in the stripe_cache.
It is important for the raid5 block device to appear congested to
allow write-throttling to work.

So we only add stripes slowly. We set a flag when an allocation
fails because all stripes are in use, allocate at a convenient
time when that flag is set, and don't allow it to be set again
until at least one stripe_head has been released for re-use.

This means that a spurt of requests will only cause one stripe_head
to be allocated, but a steady stream of requests will slowly
increase the cache size - until memory pressure puts it back again.

It could take hours to reach a steady state.

The value written to, and displayed in, stripe_cache_size is
used as a minimum.  The cache can grow above this and shrink back
down to it.  The actual size is not directly visible, though it can
be deduced to some extent by watching stripe_cache_active.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++------
 drivers/md/raid5.h | 11 ++++++++-
 2 files changed, 71 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b7cd32e7f29e..9716319cc477 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -672,8 +672,13 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 				    *(conf->hash_locks + hash));
 		sh = __find_stripe(conf, sector, conf->generation - previous);
 		if (!sh) {
-			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
+			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
 				sh = get_free_stripe(conf, hash);
+				if (!sh && llist_empty(&conf->released_stripes) &&
+				    !test_bit(R5_DID_ALLOC, &conf->cache_state))
+					set_bit(R5_ALLOC_MORE,
+						&conf->cache_state);
+			}
 			if (noblock && sh == NULL)
 				break;
 			if (!sh) {
@@ -5761,6 +5766,8 @@ static void raid5d(struct md_thread *thread)
 		int batch_size, released;
 
 		released = release_stripe_list(conf, conf->temp_inactive_list);
+		if (released)
+			clear_bit(R5_DID_ALLOC, &conf->cache_state);
 
 		if (
 		    !list_empty(&conf->bitmap_list)) {
@@ -5799,6 +5806,13 @@ static void raid5d(struct md_thread *thread)
 	pr_debug("%d stripes handled\n", handled);
 
 	spin_unlock_irq(&conf->device_lock);
+	if (test_and_clear_bit(R5_ALLOC_MORE, &conf->cache_state)) {
+		grow_one_stripe(conf, __GFP_NOWARN);
+		/* Set flag even if allocation failed.  This helps
+		 * slow down allocation requests when mem is short
+		 */
+		set_bit(R5_DID_ALLOC, &conf->cache_state);
+	}
 
 	async_tx_issue_pending_all();
 	blk_finish_plug(&plug);
@@ -5814,7 +5828,7 @@ raid5_show_stripe_cache_size(struct mddev *mddev, char *page)
 	spin_lock(&mddev->lock);
 	conf = mddev->private;
 	if (conf)
-		ret = sprintf(page, "%d\n", conf->max_nr_stripes);
+		ret = sprintf(page, "%d\n", conf->min_nr_stripes);
 	spin_unlock(&mddev->lock);
 	return ret;
 }
@@ -5828,10 +5842,12 @@ raid5_set_cache_size(struct mddev *mddev, int size)
 	if (size <= 16 || size > 32768)
 		return -EINVAL;
 
+	conf->min_nr_stripes = size;
 	while (size < conf->max_nr_stripes &&
 	       drop_one_stripe(conf))
 		;
 
+
 	err = md_allow_write(mddev);
 	if (err)
 		return err;
@@ -5947,7 +5963,7 @@ raid5_store_preread_threshold(struct mddev *mddev, const char *page, size_t len)
 	conf = mddev->private;
 	if (!conf)
 		err = -ENODEV;
-	else if (new > conf->max_nr_stripes)
+	else if (new > conf->min_nr_stripes)
 		err = -EINVAL;
 	else
 		conf->bypass_threshold = new;
@@ -6228,6 +6244,8 @@ static void raid5_free_percpu(struct r5conf *conf)
 
 static void free_conf(struct r5conf *conf)
 {
+	if (conf->shrinker.seeks)
+		unregister_shrinker(&conf->shrinker);
 	free_thread_groups(conf);
 	shrink_stripes(conf);
 	raid5_free_percpu(conf);
@@ -6295,6 +6313,30 @@ static int raid5_alloc_percpu(struct r5conf *conf)
 	return err;
 }
 
+static unsigned long raid5_cache_scan(struct shrinker *shrink,
+				      struct shrink_control *sc)
+{
+	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+	int ret = 0;
+	while (ret < sc->nr_to_scan) {
+		if (drop_one_stripe(conf) == 0)
+			return SHRINK_STOP;
+		ret++;
+	}
+	return ret;
+}
+
+static unsigned long raid5_cache_count(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
+
+	if (conf->max_nr_stripes < conf->min_nr_stripes)
+		/* unlikely, but not impossible */
+		return 0;
+	return conf->max_nr_stripes - conf->min_nr_stripes;
+}
+
 static struct r5conf *setup_conf(struct mddev *mddev)
 {
 	struct r5conf *conf;
@@ -6445,10 +6487,11 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 		conf->prev_algo = mddev->layout;
 	}
 
-	memory = NR_STRIPES * (sizeof(struct stripe_head) +
+	conf->min_nr_stripes = NR_STRIPES;
+	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
-	if (grow_stripes(conf, NR_STRIPES)) {
+	if (grow_stripes(conf, conf->min_nr_stripes)) {
 		printk(KERN_ERR
 		       "md/raid:%s: couldn't allocate %dkB for buffers\n",
 		       mdname(mddev), memory);
@@ -6456,6 +6499,17 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	} else
 		printk(KERN_INFO "md/raid:%s: allocated %dkB\n",
 		       mdname(mddev), memory);
+	/*
+	 * Losing a stripe head costs more than the time to refill it,
+	 * it reduces the queue depth and so can hurt throughput.
+	 * So set it rather large, scaled by number of devices.
+	 */
+	conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
+	conf->shrinker.scan_objects = raid5_cache_scan;
+	conf->shrinker.count_objects = raid5_cache_count;
+	conf->shrinker.batch = 128;
+	conf->shrinker.flags = 0;
+	register_shrinker(&conf->shrinker);
 
 	sprintf(pers_name, "raid%d", mddev->new_level);
 	conf->thread = md_register_thread(raid5d, mddev, pers_name);
@@ -7097,9 +7151,9 @@ static int check_stripe_cache(struct mddev *mddev)
 	 */
 	struct r5conf *conf = mddev->private;
 	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
-	    > conf->max_nr_stripes ||
+	    > conf->min_nr_stripes ||
 	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
-	    > conf->max_nr_stripes) {
+	    > conf->min_nr_stripes) {
 		printk(KERN_WARNING "md/raid:%s: reshape: not enough stripes.  Needed %lu\n",
 		       mdname(mddev),
 		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index ebe4e24bc14d..7dc0dd86074b 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -433,6 +433,7 @@ struct r5conf {
 	int			max_degraded;
 	int			raid_disks;
 	int			max_nr_stripes;
+	int			min_nr_stripes;
 
 	/* reshape_progress is the leading edge of a 'reshape'
 	 * It has value MaxSector when no reshape is happening
@@ -513,7 +514,15 @@ struct r5conf {
 #define R5_INACTIVE_BLOCKED	1	/* release of inactive stripes blocked,
 					 * waiting for 25% to be free
 					 */
-
+#define R5_ALLOC_MORE		2	/* It might help to allocate another
+					 * stripe.
+					 */
+#define R5_DID_ALLOC		4	/* A stripe was allocated, don't allocate
+					 * more until at least one has been
+					 * released.  This avoids flooding
+					 * the cache.
+					 */
+	struct shrinker		shrinker;
 	int			pool_size; /* number of disks in stripeheads in pool */
 	spinlock_t		device_lock;
 	struct disk_info	*disks;
-- 
cgit v1.2.1


From 9ffc8f7cb9647b13dfe4d1ad0d5e1427bb8b46d6 Mon Sep 17 00:00:00 2001
From: Eric Mei <eric.mei@seagate.com>
Date: Wed, 18 Mar 2015 23:39:11 -0600
Subject: md/raid5: don't do chunk aligned read on degraded array.

When the array is degraded, a read that lands on a failed drive results
in reading the rest of the data in the stripe. So a single sequential
read would result in the same data being read twice.

This patch avoids chunk-aligned reads on a degraded array. The downside
is that such reads go through the stripe cache, which means associated
CPU overhead and an extra memory copy.

Test Results:
The following tests were done on an enterprise storage node with Seagate
6T SAS drives and a Xeon E5-2648L CPU (10 cores, 1.9GHz), 10-disk MD
RAID6 8+2, chunk size 128 KiB.

I used FIO with direct I/O, various bs sizes and sufficient queue depth,
and tested sequential and 100% random reads against 3 array configs:
 1) optimal, as baseline;
 2) degraded;
 3) degraded with this patch.
Kernel version is 4.0-rc3.

I ran each individual test only once, so there might be some variation,
but we just focus on the overall trend.

Sequential Read:
  bs=(KiB)  optimal(MiB/s)  degraded(MiB/s)  degraded-with-patch (MiB/s)
   1024       1608            656              995
    512       1624            710              956
    256       1635            728              980
    128       1636            771              983
     64       1612           1119             1000
     32       1580           1420             1004
     16       1368            688              986
      8        768            647              953
      4        411            413              850

Random Read:
  bs=(KiB)  optimal(IOPS)  degraded(IOPS)  degraded-with-patch (IOPS)
   1024        163            160              156
    512        274            273              272
    256        426            428              424
    128        576            592              591
     64        726            724              726
     32        849            848              837
     16        900            970              971
      8        927            940              929
      4        948            940              955

Some notes:
  * In sequential + optimal, as the bs size gets smaller, the FIO thread
becomes CPU bound.
  * In sequential + degraded, there's a big increase when bs is 64K and
32K; I don't have an explanation.
  * In sequential + degraded-with-patch, the MD thread mostly becomes CPU
bound.

If you want to, we can discuss specific data points in this data. But in
general it seems that with this patch we have more predictable and, in
most cases, significantly better sequential read performance when the
array is degraded, and almost no noticeable impact on random reads.

Performance is a complicated thing: the patch works well for this
particular configuration, but may not be universal. For example, I
imagine testing on an all-SSD array may give very different results. But
I personally think that in most cases IO bandwidth is a scarcer resource
than CPU.


Signed-off-by: Eric Mei <eric.mei@seagate.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9716319cc477..77dfd720aaa0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4632,8 +4632,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev,
 	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
-	if ((bvm->bi_rw & 1) == WRITE)
-		return biovec->bv_len; /* always allow writes to be mergeable */
+	/*
+	 * always allow writes to be mergeable, read as well if array
+	 * is degraded as we'll go through stripe cache anyway.
+	 */
+	if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
+		return biovec->bv_len;
 
 	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
 		chunk_sectors = mddev->new_chunk_sectors;
@@ -5110,7 +5114,12 @@ static void make_request(struct mddev *mddev, struct bio * bi)
 
 	md_write_start(mddev, bi);
 
-	if (rw == READ &&
+	/*
+	 * If array is degraded, better not do chunk aligned read because
+	 * later we might have to read it again in order to reconstruct
+	 * data on failed drives.
+	 */
+	if (rw == READ && mddev->degraded == 0 &&
 	     mddev->reshape_position == MaxSector &&
 	     chunk_aligned_read(mddev,bi))
 		return;
-- 
cgit v1.2.1


From 6cd18e711dd8075da9d78cfc1239f912ff28968a Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Mon, 27 Apr 2015 14:12:22 +1000
Subject: block: destroy bdi before blockdev is unregistered.

Because of the peculiar way that md devices are created (automatically
when the device node is opened), a new device can be created and
registered immediately after the
	blk_unregister_region(disk_devt(disk), disk->minors);
call in del_gendisk().

Therefore it is important that all visible artifacts of the previous
device are removed before this call.  In particular, the 'bdi'.

Since:
commit c4db59d31e39ea067c32163ac961e9c80198fd37
Author: Christoph Hellwig <hch@lst.de>
    fs: don't reassign dirty inodes to default_backing_dev_info

moved the
   device_unregister(bdi->dev);
call from bdi_unregister() to bdi_destroy() it has been quite easy to
lose a race and have a new (e.g.) "md127" be created after the
blk_unregister_region() call and before bdi_destroy() is ultimately
called by the final 'put_disk', which must come after del_gendisk().

The new device finds that the bdi name is already registered in sysfs
and complains

> [ 9627.630029] WARNING: CPU: 18 PID: 3330 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x5a/0x70()
> [ 9627.630032] sysfs: cannot create duplicate filename '/devices/virtual/bdi/9:127'

We can fix this by moving the bdi_destroy() call out of
blk_release_queue() (which can happen very late when a refcount
reaches zero) and into blk_cleanup_queue() - which happens exactly when the md
device driver calls it.

Then it is only necessary for md to call blk_cleanup_queue() before
del_gendisk().  As loop.c devices are also created on demand by
opening the device node, we make the same change there.

Fixes: c4db59d31e39ea067c32163ac961e9c80198fd37
Reported-by: Azat Khuzhin <a3at.mail@gmail.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: stable@vger.kernel.org (v4.0)
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e6178787ce3d..e47d1dd046da 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4754,12 +4754,12 @@ static void md_free(struct kobject *ko)
 	if (mddev->sysfs_state)
 		sysfs_put(mddev->sysfs_state);
 
+	if (mddev->queue)
+		blk_cleanup_queue(mddev->queue);
 	if (mddev->gendisk) {
 		del_gendisk(mddev->gendisk);
 		put_disk(mddev->gendisk);
 	}
-	if (mddev->queue)
-		blk_cleanup_queue(mddev->queue);
 
 	kfree(mddev);
 }
-- 
cgit v1.2.1


From 3e6180f0c82b3790a9ec6d13d67aae359bf1ce84 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 30 Apr 2015 10:10:36 -0400
Subject: dm: only initialize the request_queue once

Commit bfebd1cdb4 ("dm: add full blk-mq support to request-based DM")
didn't properly account for the need to short-circuit re-initializing
DM's blk-mq request_queue if it was already initialized.

Otherwise, reloading a blk-mq request-based DM table (either manually
or via multipathd) resulted in errors, see:
 https://www.redhat.com/archives/dm-devel/2015-April/msg00132.html

Fix is to only initialize the request_queue on the initial table load
(when the mapped_device type is assigned).

This is better than having dm_init_request_based_blk_mq_queue() return
early if the queue was already initialized because it elevates the
constraint to a more meaningful location in DM core.  As such the
pre-existing early return in dm_init_request_based_queue() can now be
removed.

Fixes: bfebd1cdb4 ("dm: add full blk-mq support to request-based DM")
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-ioctl.c | 17 +++++++++--------
 drivers/md/dm.c       |  3 ---
 2 files changed, 9 insertions(+), 11 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index c8a18e4ee9dc..720ceeb7fa9b 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1298,21 +1298,22 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 		goto err_unlock_md_type;
 	}
 
-	if (dm_get_md_type(md) == DM_TYPE_NONE)
+	if (dm_get_md_type(md) == DM_TYPE_NONE) {
 		/* Initial table load: acquire type of table. */
 		dm_set_md_type(md, dm_table_get_type(t));
-	else if (dm_get_md_type(md) != dm_table_get_type(t)) {
+
+		/* setup md->queue to reflect md's type (may block) */
+		r = dm_setup_md_queue(md);
+		if (r) {
+			DMWARN("unable to set up device queue for new table.");
+			goto err_unlock_md_type;
+		}
+	} else if (dm_get_md_type(md) != dm_table_get_type(t)) {
 		DMWARN("can't change device type after initial table load.");
 		r = -EINVAL;
 		goto err_unlock_md_type;
 	}
 
-	/* setup md->queue to reflect md's type (may block) */
-	r = dm_setup_md_queue(md);
-	if (r) {
-		DMWARN("unable to set up device queue for new table.");
-		goto err_unlock_md_type;
-	}
 	dm_unlock_md_type(md);
 
 	/* stage inactive table */
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f8c7ca3e8947..923496ba72a0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2662,9 +2662,6 @@ static int dm_init_request_based_queue(struct mapped_device *md)
 {
 	struct request_queue *q = NULL;
 
-	if (md->queue->elevator)
-		return 0;
-
 	/* Fully initialize the queue */
 	q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
 	if (!q)
-- 
cgit v1.2.1


From aa6df8dd28c01d9a3d2cfcfe9dd0a4a334d1cd81 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 29 Apr 2015 10:48:09 -0400
Subject: dm: fix free_rq_clone() NULL pointer when requeueing unmapped request

Commit 022333427a ("dm: optimize dm_mq_queue_rq to _not_ use kthread if
using pure blk-mq") mistakenly removed free_rq_clone()'s clone->q check
before testing clone->q->mq_ops.  It was an oversight to discontinue
that check for 1 of the 2 use-cases for free_rq_clone():
1) free_rq_clone() called when an unmapped original request is requeued
2) free_rq_clone() called in the request-based IO completion path

The clone->q check made sense for case #1 but not for #2.  However, we
cannot just reinstate the check as it'd mask a serious bug in the IO
completion case #2 -- no in-flight request should have an uninitialized
request_queue (basic block layer refcounting _should_ ensure this).

The NULL pointer seen for case #1 is detailed here:
https://www.redhat.com/archives/dm-devel/2015-April/msg00160.html

Fix this free_rq_clone() NULL pointer by simply checking if the
mapped_device's type is DM_TYPE_MQ_REQUEST_BASED (clone's queue is
blk-mq) rather than checking clone->q->mq_ops.  This avoids the need to
dereference clone->q, but a WARN_ON_ONCE is added to let us know if an
uninitialized clone request is being completed.

Reported-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 923496ba72a0..a930b72314ac 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1082,18 +1082,26 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 	dm_put(md);
 }
 
-static void free_rq_clone(struct request *clone)
+static void free_rq_clone(struct request *clone, bool must_be_mapped)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 
+	WARN_ON_ONCE(must_be_mapped && !clone->q);
+
 	blk_rq_unprep_clone(clone);
 
-	if (clone->q->mq_ops)
+	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
+		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
 	else if (!md->queue->mq_ops)
 		/* request_fn queue stacked on request_fn queue(s) */
 		free_clone_request(md, clone);
+	/*
+	 * NOTE: for the blk-mq queue stacked on request_fn queue(s) case:
+	 * no need to call free_clone_request() because we leverage blk-mq by
+	 * allocating the clone at the end of the blk-mq pdu (see: clone_rq)
+	 */
 
 	if (!md->queue->mq_ops)
 		free_rq_tio(tio);
@@ -1124,7 +1132,7 @@ static void dm_end_request(struct request *clone, int error)
 			rq->sense_len = clone->sense_len;
 	}
 
-	free_rq_clone(clone);
+	free_rq_clone(clone, true);
 	if (!rq->q->mq_ops)
 		blk_end_request_all(rq, error);
 	else
@@ -1143,7 +1151,7 @@ static void dm_unprep_request(struct request *rq)
 	}
 
 	if (clone)
-		free_rq_clone(clone);
+		free_rq_clone(clone, false);
 }
 
 /*
-- 
cgit v1.2.1


From c0403ec0bb5a8c5b267fb7e16021bec0b17e4964 Mon Sep 17 00:00:00 2001
From: Rabin Vincent <rabin.vincent@axis.com>
Date: Tue, 5 May 2015 15:15:56 +0200
Subject: Revert "dm crypt: fix deadlock when async crypto algorithm returns
 -EBUSY"

This reverts Linux 4.1-rc1 commit 0618764cb25f6fa9fb31152995de42a8a0496475.

The problem which that commit attempts to fix actually lies in the
Freescale CAAM crypto driver not dm-crypt.

dm-crypt uses CRYPTO_TFM_REQ_MAY_BACKLOG.  This means that the crypto
driver should internally backlog requests which arrive when the queue is
full and process them later.  Until the crypto hw's queue becomes full,
the driver returns -EINPROGRESS.  When the crypto hw's queue is full,
the driver returns -EBUSY, and if CRYPTO_TFM_REQ_MAY_BACKLOG is set, is
expected to backlog the request and process it when the hardware has
queue space.  At the point when the driver takes the request from the
backlog and starts processing it, it calls the completion function with
a status of -EINPROGRESS.  The completion function is called (for a
second time, in the case of backlogged requests) with a status/err of 0
when a request is done.

Crypto drivers for hardware without hardware queueing use the
crypto_init_queue(), crypto_enqueue_request(), crypto_dequeue_request()
and crypto_get_backlog() helpers to implement this behaviour correctly,
while others implement this behaviour without these helpers (ccp, for
example).

dm-crypt (before the patch that needs reverting) uses this API
correctly.  It queues up as many requests as the hw queues will allow
(i.e. as long as it gets back -EINPROGRESS from the request function).
Then, when it sees at least one backlogged request (gets -EBUSY), it
waits till that backlogged request is handled (completion gets called
with -EINPROGRESS), and then continues.  The references to
af_alg_wait_for_completion() and af_alg_complete() in that commit's
commit message are irrelevant because those functions only handle one
request at a time, unlike dm-crypt.

The problem is that the Freescale CAAM driver, which that commit
describes as having been tested with, fails to implement the
backlogging behaviour correctly.  In caam_jr_enqueue(), if the hardware
queue is full, it simply returns -EBUSY without backlogging the request.
What the observed deadlock was is not described in the commit message
but it is obviously the wait_for_completion() in crypto_convert() where
dm-crypt would wait for the completion being called with -EINPROGRESS
in the case of backlogged requests.  This completion will never be
completed due to the bug in the CAAM driver.

Commit 0618764cb25 incorrectly made dm-crypt wait for every request,
even when the driver/hardware queues are not full, which means that
dm-crypt will never see -EBUSY.  This means that that commit will cause
a performance regression on all crypto drivers which implement the API
correctly.

Revert it.  Correct backlog handling should be implemented in the CAAM
driver instead.

Cc'ing stable purely because commit 0618764cb25 did.  If for some reason
a stable@ kernel did pick up commit 0618764cb25 it should get reverted.

Signed-off-by: Rabin Vincent <rabin.vincent@axis.com>
Reviewed-by: Horia Geanta <horia.geanta@freescale.com>
Cc: stable@vger.kernel.org
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 9eeea196328a..5503e43e5f28 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -925,10 +925,11 @@ static int crypt_convert(struct crypt_config *cc,
 
 		switch (r) {
 		/* async */
-		case -EINPROGRESS:
 		case -EBUSY:
 			wait_for_completion(&ctx->restart);
 			reinit_completion(&ctx->restart);
+			/* fall through*/
+		case -EINPROGRESS:
 			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
@@ -1345,8 +1346,10 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
 	struct crypt_config *cc = io->cc;
 
-	if (error == -EINPROGRESS)
+	if (error == -EINPROGRESS) {
+		complete(&ctx->restart);
 		return;
+	}
 
 	if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
 		error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
@@ -1357,15 +1360,12 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	crypt_free_req(cc, req_of_dmreq(cc, dmreq), io->base_bio);
 
 	if (!atomic_dec_and_test(&ctx->cc_pending))
-		goto done;
+		return;
 
 	if (bio_data_dir(io->base_bio) == READ)
 		kcryptd_crypt_read_done(io);
 	else
 		kcryptd_crypt_write_io_submit(io, 1);
-done:
-	if (!completion_done(&ctx->restart))
-		complete(&ctx->restart);
 }
 
 static void kcryptd_crypt(struct work_struct *work)
-- 
cgit v1.2.1


From c4cf5261f8bffd9de132b50660a69148e7575bd6 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 17 Apr 2015 16:15:18 -0600
Subject: bio: skip atomic inc/dec of ->bi_remaining for non-chains

Struct bio has an atomic ref count for chained bio's, and we use this
to know when to end IO on the bio. However, most bio's are not chained,
so we don't need to always introduce this atomic operation as part of
ending IO.

Add a helper to elevate the bi_remaining count, and flag the bio as
now actually needing the decrement at end_io time. Rename the field
to __bi_remaining to catch any current users of this doing the
incrementing manually.

For high IOPS workloads, this reduces the overhead of bio_endio()
substantially.

Tested-by: Robert Elliott <elliott@hp.com>
Acked-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/dm-cache-target.c | 2 +-
 drivers/md/dm-raid1.c        | 2 +-
 drivers/md/dm-snap.c         | 2 +-
 drivers/md/dm-thin.c         | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7755af351867..705eb7b99d69 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -91,7 +91,7 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 	 * Must bump bi_remaining to allow bio to complete with
 	 * restored bi_end_io.
 	 */
-	atomic_inc(&bio->bi_remaining);
+	bio_inc_remaining(bio);
 }
 
 /*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 089d62751f7f..d6a1c096b777 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1254,7 +1254,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 			dm_bio_restore(bd, bio);
 			bio_record->details.bi_bdev = NULL;
 
-			atomic_inc(&bio->bi_remaining);
+			bio_inc_remaining(bio);
 
 			queue_bio(ms, bio, rw);
 			return DM_ENDIO_INCOMPLETE;
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index f83a0f3fc365..8bfeae218531 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1478,7 +1478,7 @@ out:
 	if (full_bio) {
 		full_bio->bi_end_io = pe->full_bio_end_io;
 		full_bio->bi_private = pe->full_bio_private;
-		atomic_inc(&full_bio->bi_remaining);
+		bio_inc_remaining(full_bio);
 	}
 	increment_pending_exceptions_done_count();
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 921aafd12aee..342dbdad6131 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -795,7 +795,7 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
 	if (m->bio) {
 		m->bio->bi_end_io = m->saved_bi_end_io;
-		atomic_inc(&m->bio->bi_remaining);
+		bio_inc_remaining(m->bio);
 	}
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
@@ -812,7 +812,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	bio = m->bio;
 	if (bio) {
 		bio->bi_end_io = m->saved_bi_end_io;
-		atomic_inc(&bio->bi_remaining);
+		bio_inc_remaining(bio);
 	}
 
 	if (m->err) {
-- 
cgit v1.2.1


From dac56212e8127dbc0bff7be35c508bc280213309 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 17 Apr 2015 16:23:59 -0600
Subject: bio: skip atomic inc/dec of ->bi_cnt for most use cases

Struct bio has a reference count that controls when it can be freed.
Most use cases involve allocating the bio, which then returns with a
single reference to it, doing IO, and then dropping that single
reference. We can remove this atomic_dec_and_test() in the completion
path, if nobody else is holding a reference to the bio.

If someone does call bio_get() on the bio, then we flag the bio as
now having valid count and that we must properly honor the reference
count when it's being put.

Tested-by: Robert Elliott <elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/request.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index ab43faddb447..1616f668a4cb 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -619,7 +619,7 @@ static void do_bio_hook(struct search *s, struct bio *orig_bio)
 	bio->bi_end_io		= request_endio;
 	bio->bi_private		= &s->cl;
 
-	atomic_set(&bio->bi_cnt, 3);
+	bio_cnt_set(bio, 3);
 }
 
 static void search_free(struct closure *cl)
-- 
cgit v1.2.1


From b6538fe32966e63ef38897860ef220980d904974 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Fri, 8 May 2015 18:19:03 +1000
Subject: md-raid0: conditional mddev->queue access to suit dm-raid

This patch is a prerequisite for dm-raid "raid0" support to allow
dm-raid to access the MD RAID0 personality doing unconditional
accesses to mddev->queue, which is NULL in case of dm-raid stacked on
top of MD.

Most of the conditional mddev->queue accesses made it upstream, except
this missing one, which prevents md raid0 from setting disk stack limits
(that being done in dm core in case of md underneath dm).

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Tested-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 2cb59a641cd2..6a68ef5246d4 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -188,8 +188,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		}
 		dev[j] = rdev1;
 
-		disk_stack_limits(mddev->gendisk, rdev1->bdev,
-				  rdev1->data_offset << 9);
+		if (mddev->queue)
+			disk_stack_limits(mddev->gendisk, rdev1->bdev,
+					  rdev1->data_offset << 9);
 
 		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
 			conf->has_merge_bvec = 1;
-- 
cgit v1.2.1


From f18c1a35f62caccb527e8b0990c8801596e7c662 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:04 +1000
Subject: md/raid5: new alloc_stripe() to allocate and initialize a stripe.

The new batch_lock and batch_list fields are being initialized in
grow_one_stripe() but not in resize_stripes().  This causes a crash
on resize.

So separate the core initialization into a new function and call it
from both allocation sites.

Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: 59fc630b8b5f ("RAID5: batch adjacent full stripe write")
---
 drivers/md/raid5.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 77dfd720aaa0..91a1e8b26b52 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1971,17 +1971,30 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	put_cpu();
 }
 
+static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
+{
+	struct stripe_head *sh;
+
+	sh = kmem_cache_zalloc(sc, gfp);
+	if (sh) {
+		spin_lock_init(&sh->stripe_lock);
+		spin_lock_init(&sh->batch_lock);
+		INIT_LIST_HEAD(&sh->batch_list);
+		INIT_LIST_HEAD(&sh->lru);
+		atomic_set(&sh->count, 1);
+	}
+	return sh;
+}
 static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 {
 	struct stripe_head *sh;
-	sh = kmem_cache_zalloc(conf->slab_cache, gfp);
+
+	sh = alloc_stripe(conf->slab_cache, gfp);
 	if (!sh)
 		return 0;
 
 	sh->raid_conf = conf;
 
-	spin_lock_init(&sh->stripe_lock);
-
 	if (grow_buffers(sh, gfp)) {
 		shrink_buffers(sh);
 		kmem_cache_free(conf->slab_cache, sh);
@@ -1990,13 +2003,8 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
 	sh->hash_lock_index =
 		conf->max_nr_stripes % NR_STRIPE_HASH_LOCKS;
 	/* we just created an active stripe so... */
-	atomic_set(&sh->count, 1);
 	atomic_inc(&conf->active_stripes);
-	INIT_LIST_HEAD(&sh->lru);
 
-	spin_lock_init(&sh->batch_lock);
-	INIT_LIST_HEAD(&sh->batch_list);
-	sh->batch_head = NULL;
 	release_stripe(sh);
 	conf->max_nr_stripes++;
 	return 1;
@@ -2109,13 +2117,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 		return -ENOMEM;
 
 	for (i = conf->max_nr_stripes; i; i--) {
-		nsh = kmem_cache_zalloc(sc, GFP_KERNEL);
+		nsh = alloc_stripe(sc, GFP_KERNEL);
 		if (!nsh)
 			break;
 
 		nsh->raid_conf = conf;
-		spin_lock_init(&nsh->stripe_lock);
-
 		list_add(&nsh->lru, &newstripes);
 	}
 	if (i) {
@@ -2142,13 +2148,11 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 				    lock_device_hash_lock(conf, hash));
 		osh = get_free_stripe(conf, hash);
 		unlock_device_hash_lock(conf, hash);
-		atomic_set(&nsh->count, 1);
+
 		for(i=0; i<conf->pool_size; i++) {
 			nsh->dev[i].page = osh->dev[i].page;
 			nsh->dev[i].orig_page = osh->dev[i].page;
 		}
-		for( ; i<newsize; i++)
-			nsh->dev[i].page = NULL;
 		nsh->hash_lock_index = hash;
 		kmem_cache_free(conf->slab_cache, osh);
 		cnt++;
-- 
cgit v1.2.1


From b0c783b32318bef29d64086fa812e8c659cb5b37 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:32 +1000
Subject: md/raid5: more incorrect BUG_ON in handle_stripe_fill.

It is not incorrect to call handle_stripe_fill() when
a batch of full-stripe writes is active.
It is, however, a BUG if fetch_block() then decides
it needs to actually fetch anything.

So move the 'BUG_ON' to where it belongs.

Signed-off-by: NeilBrown  <neilb@suse.de>
Fixes: 59fc630b8b5f ("RAID5: batch adjacent full stripe write")
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 91a1e8b26b52..415cac6d89bd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3302,6 +3302,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s,
 		 */
 		BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
 		BUG_ON(test_bit(R5_Wantread, &dev->flags));
+		BUG_ON(sh->batch_head);
 		if ((s->uptodate == disks - 1) &&
 		    (s->failed && (disk_idx == s->failed_num[0] ||
 				   disk_idx == s->failed_num[1]))) {
@@ -3370,7 +3371,6 @@ static void handle_stripe_fill(struct stripe_head *sh,
 {
 	int i;
 
-	BUG_ON(sh->batch_head);
 	/* look for blocks to read/compute, skip this if a compute
 	 * is already in flight, or if the stripe contents are in the
 	 * midst of changing due to a write
-- 
cgit v1.2.1


From 10d82c5f0d167ef75a2d8d7d4eed9ee43d3369c9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:33 +1000
Subject: md/raid5: avoid reading parity blocks for full-stripe write to
 degraded array

When performing a reconstruct write, we need to read all blocks
that are not being over-written .. except the parity (P and Q) blocks.

The code currently reads these (as they are not being over-written!)
unnecessarily.

Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: ea664c8245f3 ("md/raid5: need_this_block: tidy/fix last condition.")
---
 drivers/md/raid5.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 415cac6d89bd..85dc0e67fb88 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3282,7 +3282,9 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s,
 		/* reconstruct-write isn't being forced */
 		return 0;
 	for (i = 0; i < s->failed; i++) {
-		if (!test_bit(R5_UPTODATE, &fdev[i]->flags) &&
+		if (s->failed_num[i] != sh->pd_idx &&
+		    s->failed_num[i] != sh->qd_idx &&
+		    !test_bit(R5_UPTODATE, &fdev[i]->flags) &&
 		    !test_bit(R5_OVERWRITE, &fdev[i]->flags))
 			return 1;
 	}
-- 
cgit v1.2.1


From 6e9eac2dcee5e19f125967dd2be3e36558c42fff Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:34 +1000
Subject: md/raid5: don't record new size if resize_stripes fails.

If any memory allocation in resize_stripes fails we will return
-ENOMEM, but in some cases we update conf->pool_size anyway.

This means that if we try again, the allocations will be assumed
to be larger than they are, and badness results.

So only update pool_size if there is no error.

This bug was introduced in 2.6.17 and the patch is suitable for
-stable.

Fixes: ad01c9e3752f ("[PATCH] md: Allow stripes to be expanded in preparation for expanding an array")
Cc: stable@vger.kernel.org (v2.6.17+)
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 85dc0e67fb88..9748e525e4c0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2216,7 +2216,8 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 
 	conf->slab_cache = sc;
 	conf->active_name = 1-conf->active_name;
-	conf->pool_size = newsize;
+	if (!err)
+		conf->pool_size = newsize;
 	return err;
 }
 
-- 
cgit v1.2.1


From 738a273806ee0568369c9bb19ef3b102f54beef4 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:39 +1000
Subject: md/raid5: fix allocation of 'scribble' array.

As the new 'scribble' array is sized based on chunk size,
we need to make sure the size matches the largest of 'old'
and 'new' chunk sizes when the array is undergoing reshape.

We also potentially need to resize it even when not resizing
the stripe cache, as chunk size can change without changing
number of devices.

So move the 'resize' code into a separate function, and
consider old and new sizes when allocating.

Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: 46d5b785621a ("raid5: use flex_array for scribble data")
---
 drivers/md/raid5.c | 65 ++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 22 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9748e525e4c0..3873eaa6fa2e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2068,6 +2068,35 @@ static struct flex_array *scribble_alloc(int num, int cnt, gfp_t flags)
 	return ret;
 }
 
+static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
+{
+	unsigned long cpu;
+	int err = 0;
+
+	mddev_suspend(conf->mddev);
+	get_online_cpus();
+	for_each_present_cpu(cpu) {
+		struct raid5_percpu *percpu;
+		struct flex_array *scribble;
+
+		percpu = per_cpu_ptr(conf->percpu, cpu);
+		scribble = scribble_alloc(new_disks,
+					  new_sectors / STRIPE_SECTORS,
+					  GFP_NOIO);
+
+		if (scribble) {
+			flex_array_free(percpu->scribble);
+			percpu->scribble = scribble;
+		} else {
+			err = -ENOMEM;
+			break;
+		}
+	}
+	put_online_cpus();
+	mddev_resume(conf->mddev);
+	return err;
+}
+
 static int resize_stripes(struct r5conf *conf, int newsize)
 {
 	/* Make all the stripes able to hold 'newsize' devices.
@@ -2096,7 +2125,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	struct stripe_head *osh, *nsh;
 	LIST_HEAD(newstripes);
 	struct disk_info *ndisks;
-	unsigned long cpu;
 	int err;
 	struct kmem_cache *sc;
 	int i;
@@ -2178,25 +2206,6 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	} else
 		err = -ENOMEM;
 
-	get_online_cpus();
-	for_each_present_cpu(cpu) {
-		struct raid5_percpu *percpu;
-		struct flex_array *scribble;
-
-		percpu = per_cpu_ptr(conf->percpu, cpu);
-		scribble = scribble_alloc(newsize, conf->chunk_sectors /
-			STRIPE_SECTORS, GFP_NOIO);
-
-		if (scribble) {
-			flex_array_free(percpu->scribble);
-			percpu->scribble = scribble;
-		} else {
-			err = -ENOMEM;
-			break;
-		}
-	}
-	put_online_cpus();
-
 	/* Step 4, return new stripes to service */
 	while(!list_empty(&newstripes)) {
 		nsh = list_entry(newstripes.next, struct stripe_head, lru);
@@ -6228,8 +6237,11 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
 		percpu->spare_page = alloc_page(GFP_KERNEL);
 	if (!percpu->scribble)
 		percpu->scribble = scribble_alloc(max(conf->raid_disks,
-			conf->previous_raid_disks), conf->chunk_sectors /
-			STRIPE_SECTORS, GFP_KERNEL);
+						      conf->previous_raid_disks),
+						  max(conf->chunk_sectors,
+						      conf->prev_chunk_sectors)
+						   / STRIPE_SECTORS,
+						  GFP_KERNEL);
 
 	if (!percpu->scribble || (conf->level == 6 && !percpu->spare_page)) {
 		free_scratch_buffer(conf, percpu);
@@ -7205,6 +7217,15 @@ static int check_reshape(struct mddev *mddev)
 	if (!check_stripe_cache(mddev))
 		return -ENOSPC;
 
+	if (mddev->new_chunk_sectors > mddev->chunk_sectors ||
+	    mddev->delta_disks > 0)
+		if (resize_chunks(conf,
+				  conf->previous_raid_disks
+				  + max(0, mddev->delta_disks),
+				  max(mddev->new_chunk_sectors,
+				      mddev->chunk_sectors)
+			    ) < 0)
+			return -ENOMEM;
 	return resize_stripes(conf, (conf->previous_raid_disks
 				     + mddev->delta_disks));
 }
-- 
cgit v1.2.1


From bb27051f9fd7643f05d8f0babce3337f0b9b3087 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 8 May 2015 18:19:40 +1000
Subject: md/raid5: fix handling of degraded stripes in batches.

There is no need for special handling of stripe-batches when the array
is degraded.

There may be if there is a failure in the batch, but STRIPE_DEGRADED
does not imply an error.

So don't set STRIPE_BATCH_ERR in ops_run_io just because the array is
degraded.
This actually causes a bug: the STRIPE_DEGRADED flag gets cleared in
check_break_stripe_batch_list() and so the bitmap bit gets cleared
when it shouldn't.

So in check_break_stripe_batch_list(), split the batch up completely -
again STRIPE_DEGRADED isn't meaningful.

Also don't set STRIPE_BATCH_ERR when there is a write error to a
replacement device.  This simply removes the replacement device and
requires no extra handling.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3873eaa6fa2e..1ba97fdc6df1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1078,9 +1078,6 @@ again:
 			pr_debug("skip op %ld on disc %d for sector %llu\n",
 				bi->bi_rw, i, (unsigned long long)sh->sector);
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
-			if (sh->batch_head)
-				set_bit(STRIPE_BATCH_ERR,
-					&sh->batch_head->state);
 			set_bit(STRIPE_HANDLE, &sh->state);
 		}
 
@@ -2448,7 +2445,7 @@ static void raid5_end_write_request(struct bio *bi, int error)
 	}
 	rdev_dec_pending(rdev, conf->mddev);
 
-	if (sh->batch_head && !uptodate)
+	if (sh->batch_head && !uptodate && !replacement)
 		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
 
 	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
@@ -4214,15 +4211,9 @@ static void check_break_stripe_batch_list(struct stripe_head *sh)
 		return;
 
 	head_sh = sh;
-	do {
-		sh = list_first_entry(&sh->batch_list,
-				      struct stripe_head, batch_list);
-		BUG_ON(sh == head_sh);
-	} while (!test_bit(STRIPE_DEGRADED, &sh->state));
 
-	while (sh != head_sh) {
-		next = list_first_entry(&sh->batch_list,
-					struct stripe_head, batch_list);
+	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
+
 		list_del_init(&sh->batch_list);
 
 		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
@@ -4242,8 +4233,6 @@ static void check_break_stripe_batch_list(struct stripe_head *sh)
 
 		set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
-
-		sh = next;
 	}
 }
 
-- 
cgit v1.2.1


From 487696957e3bd64ccffe62c0ac4ff7bf662785ab Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@kernel.org>
Date: Wed, 13 May 2015 09:30:08 -0700
Subject: raid5: fix broken async operation chain

ops_run_reconstruct6() doesn't correctly chain async operations. The tx returned
by async_gen_syndrome should be added as the dependent tx of next stripe.

The issue is introduced by commit 59fc630b8b5f9f21c8ce3ba153341c107dce1b0c
    RAID5: batch adjacent full stripe write

Reported-and-tested-by: Maxime Ripard <maxime.ripard@free-electrons.com>
Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1ba97fdc6df1..b9f2b9cc6060 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1822,7 +1822,7 @@ again:
 	} else
 		init_async_submit(&submit, 0, tx, NULL, NULL,
 				  to_addr_conv(sh, percpu, j));
-	async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
+	tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE,  &submit);
 	if (!last_stripe) {
 		j++;
 		sh = list_first_entry(&sh->batch_list, struct stripe_head,
-- 
cgit v1.2.1


From a81157768a00e8cf8a7b43b5ea5cac931262374f Mon Sep 17 00:00:00 2001
From: Eric Work <work.eric@gmail.com>
Date: Mon, 18 May 2015 23:26:23 -0700
Subject: md/raid0: fix restore to sector variable in raid0_make_request

The variable "sector" in "raid0_make_request()" was improperly updated
by a call to "sector_div()" which modifies its first argument in place.
Commit 47d68979cc968535cb87f3e5f2e6a3533ea48fbd restored this variable
after the call for later re-use.  Unfortunately the restore was done after
the referenced variable "bio" was advanced.  This led to the original
value and the restored value being different.  Here we move this line to
the proper place.

One observed side effect of this bug was that discarding a file through
unlinking would cause an unrelated file's contents to be discarded.

Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: 47d68979cc96 ("md/raid0: fix bug with chunksize not a power of 2.")
Cc: stable@vger.kernel.org (any that received above backport)
URL: https://bugzilla.kernel.org/show_bug.cgi?id=98501
---
 drivers/md/raid0.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 6a68ef5246d4..efb654eb5399 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -524,6 +524,9 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 			 ? (sector & (chunk_sects-1))
 			 : sector_div(sector, chunk_sects));
 
+		/* Restore due to sector_div */
+		sector = bio->bi_iter.bi_sector;
+
 		if (sectors < bio_sectors(bio)) {
 			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
 			bio_chain(split, bio);
@@ -531,7 +534,6 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 			split = bio;
 		}
 
-		sector = bio->bi_iter.bi_sector;
 		zone = find_zone(mddev->private, &sector);
 		tmp_dev = map_sector(mddev, zone, sector, &sector);
 		split->bi_bdev = tmp_dev->bdev;
-- 
cgit v1.2.1


From 8532e3439087de69bb1b71fd6be2baa6fc196a55 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 20 May 2015 15:05:09 +1000
Subject: md/bitmap: remove rcu annotation from pointer arithmetic.

Evaluating  "&mddev->disks" is simple pointer arithmetic, so
it does not need 'rcu' annotations - no dereferencing is happening.

Also enhance the comment to explain that 'rdev' in that case
is not actually a pointer to an rdev.

Reported-by: Patrick Marlier <patrick.marlier@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/bitmap.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2bc56e2a3526..135a0907e9de 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -177,11 +177,16 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 	 * nr_pending is 0 and In_sync is clear, the entries we return will
 	 * still be in the same position on the list when we re-enter
 	 * list_for_each_entry_continue_rcu.
+	 *
+	 * Note that if entered with 'rdev == NULL' to start at the
+	 * beginning, we temporarily assign 'rdev' to an address which
+	 * isn't really an rdev, but which can be used by
+	 * list_for_each_entry_continue_rcu() to find the first entry.
 	 */
 	rcu_read_lock();
 	if (rdev == NULL)
 		/* start at the beginning */
-		rdev = list_entry_rcu(&mddev->disks, struct md_rdev, same_set);
+		rdev = list_entry(&mddev->disks, struct md_rdev, same_set);
 	else {
 		/* release the previous rdev and start from there. */
 		rdev_dec_pending(rdev, mddev);
-- 
cgit v1.2.1


From 326e1dbb57368087a36607aaebe9795b8d5453e5 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 22 May 2015 09:14:03 -0400
Subject: block: remove management of bi_remaining when restoring original
 bi_end_io

Commit c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for
non-chains") regressed all existing callers that followed this pattern:
 1) saving a bio's original bi_end_io
 2) wiring up an intermediate bi_end_io
 3) restoring the original bi_end_io from intermediate bi_end_io
 4) calling bio_endio() to execute the restored original bi_end_io

The regression was due to BIO_CHAIN only ever getting set if
bio_inc_remaining() is called.  For the above pattern it isn't set until
step 3 above (step 2 would've needed to establish BIO_CHAIN).  As such
the first bio_endio(), in step 2 above, never decremented __bi_remaining
before calling the intermediate bi_end_io -- leaving __bi_remaining with
the value 1 instead of 0.  When bio_inc_remaining() occurred during step
3 it brought it to a value of 2.  When the second bio_endio() was
called, in step 4 above, it should've called the original bi_end_io but
it didn't because there was an extra reference that wasn't dropped (due
to atomic operations being optimized away since BIO_CHAIN wasn't set
upfront).

Fix this issue by removing the __bi_remaining management complexity for
all callers that use the above pattern -- bio_chain() is the only
interface that _needs_ to be concerned with __bi_remaining.  For the
above pattern callers just expect the bi_end_io they set to get called!
Remove bio_endio_nodec() and also remove all bio_inc_remaining() calls
that aren't associated with the bio_chain() interface.

Also, the bio_inc_remaining() interface has been moved local to bio.c.

Fixes: c4cf5261 ("bio: skip atomic inc/dec of ->bi_remaining for non-chains")
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/io.c       | 2 +-
 drivers/md/dm-cache-target.c | 6 ------
 drivers/md/dm-raid1.c        | 2 --
 drivers/md/dm-snap.c         | 1 -
 drivers/md/dm-thin.c         | 9 +++------
 drivers/md/dm-verity.c       | 2 +-
 6 files changed, 5 insertions(+), 17 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fa028fa82df4..cb64e64a4789 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -55,7 +55,7 @@ static void bch_bio_submit_split_done(struct closure *cl)
 
 	s->bio->bi_end_io = s->bi_end_io;
 	s->bio->bi_private = s->bi_private;
-	bio_endio_nodec(s->bio, 0);
+	bio_endio(s->bio, 0);
 
 	closure_debug_destroy(&s->cl);
 	mempool_free(s, s->p->bio_split_hook);
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 705eb7b99d69..41b2594a80c6 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -86,12 +86,6 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 {
 	bio->bi_end_io = h->bi_end_io;
 	bio->bi_private = h->bi_private;
-
-	/*
-	 * Must bump bi_remaining to allow bio to complete with
-	 * restored bi_end_io.
-	 */
-	bio_inc_remaining(bio);
 }
 
 /*----------------------------------------------------------------*/
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index d6a1c096b777..743fa9bbae9e 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1254,8 +1254,6 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, int error)
 			dm_bio_restore(bd, bio);
 			bio_record->details.bi_bdev = NULL;
 
-			bio_inc_remaining(bio);
-
 			queue_bio(ms, bio, rw);
 			return DM_ENDIO_INCOMPLETE;
 		}
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 8bfeae218531..7c82d3ccce87 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -1478,7 +1478,6 @@ out:
 	if (full_bio) {
 		full_bio->bi_end_io = pe->full_bio_end_io;
 		full_bio->bi_private = pe->full_bio_private;
-		bio_inc_remaining(full_bio);
 	}
 	increment_pending_exceptions_done_count();
 
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 342dbdad6131..e852602c0091 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -793,10 +793,9 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-	if (m->bio) {
+	if (m->bio)
 		m->bio->bi_end_io = m->saved_bi_end_io;
-		bio_inc_remaining(m->bio);
-	}
+
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
 	mempool_free(m, m->tc->pool->mapping_pool);
@@ -810,10 +809,8 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	int r;
 
 	bio = m->bio;
-	if (bio) {
+	if (bio)
 		bio->bi_end_io = m->saved_bi_end_io;
-		bio_inc_remaining(bio);
-	}
 
 	if (m->err) {
 		cell_error(pool, m->cell);
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
index 66616db33e6f..bb9c6a00e4b0 100644
--- a/drivers/md/dm-verity.c
+++ b/drivers/md/dm-verity.c
@@ -459,7 +459,7 @@ static void verity_finish_io(struct dm_verity_io *io, int error)
 	bio->bi_end_io = io->orig_bi_end_io;
 	bio->bi_private = io->orig_bi_private;
 
-	bio_endio_nodec(bio, error);
+	bio_endio(bio, error);
 }
 
 static void verity_work(struct work_struct *w)
-- 
cgit v1.2.1


From 5f1b670d0bef508a5554d92525f5f6d00d640b38 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 22 May 2015 09:14:04 -0400
Subject: block, dm: don't copy bios for request clones

Currently dm-multipath has to clone the bios for every request sent
to the lower devices, which wastes cpu cycles and ties down memory.

This patch instead adds a new REQ_CLONE flag that instructs req_bio_endio
to not complete bios attached to a request, which we set on clone
requests similar to bios in a flush sequence.  With this change I/O
errors on a path failure only get propagated to dm-multipath, which
can then either resubmit the I/O or complete the bios on the original
request.

I've done some basic testing of this on a Linux target with ALUA support,
and it survives path failures during I/O nicely.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/dm-table.c |  25 +++++---
 drivers/md/dm.c       | 171 ++++++++++++--------------------------------------
 drivers/md/dm.h       |   5 +-
 3 files changed, 59 insertions(+), 142 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index d9b00b8565c6..3662b2e49b8d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -940,21 +940,28 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 {
 	unsigned type = dm_table_get_type(t);
 	unsigned per_bio_data_size = 0;
-	struct dm_target *tgt;
 	unsigned i;
 
-	if (unlikely(type == DM_TYPE_NONE)) {
+	switch (type) {
+	case DM_TYPE_BIO_BASED:
+		for (i = 0; i < t->num_targets; i++) {
+			struct dm_target *tgt = t->targets + i;
+
+			per_bio_data_size = max(per_bio_data_size,
+						tgt->per_bio_data_size);
+		}
+		t->mempools = dm_alloc_bio_mempools(t->integrity_supported,
+						    per_bio_data_size);
+		break;
+	case DM_TYPE_REQUEST_BASED:
+	case DM_TYPE_MQ_REQUEST_BASED:
+		t->mempools = dm_alloc_rq_mempools(md, type);
+		break;
+	default:
 		DMWARN("no table type is set, can't allocate mempools");
 		return -EINVAL;
 	}
 
-	if (type == DM_TYPE_BIO_BASED)
-		for (i = 0; i < t->num_targets; i++) {
-			tgt = t->targets + i;
-			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
-		}
-
-	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a930b72314ac..38837f8ea327 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -990,57 +990,6 @@ static void clone_endio(struct bio *bio, int error)
 	dec_pending(io, error);
 }
 
-/*
- * Partial completion handling for request-based dm
- */
-static void end_clone_bio(struct bio *clone, int error)
-{
-	struct dm_rq_clone_bio_info *info =
-		container_of(clone, struct dm_rq_clone_bio_info, clone);
-	struct dm_rq_target_io *tio = info->tio;
-	struct bio *bio = info->orig;
-	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
-
-	bio_put(clone);
-
-	if (tio->error)
-		/*
-		 * An error has already been detected on the request.
-		 * Once error occurred, just let clone->end_io() handle
-		 * the remainder.
-		 */
-		return;
-	else if (error) {
-		/*
-		 * Don't notice the error to the upper layer yet.
-		 * The error handling decision is made by the target driver,
-		 * when the request is completed.
-		 */
-		tio->error = error;
-		return;
-	}
-
-	/*
-	 * I/O for the bio successfully completed.
-	 * Notice the data completion to the upper layer.
-	 */
-
-	/*
-	 * bios are processed from the head of the list.
-	 * So the completing bio should always be rq->bio.
-	 * If it's not, something wrong is happening.
-	 */
-	if (tio->orig->bio != bio)
-		DMERR("bio completion is going in the middle of the request");
-
-	/*
-	 * Update the original request.
-	 * Do not use blk_end_request() here, because it may complete
-	 * the original request before the clone, and break the ordering.
-	 */
-	blk_update_request(tio->orig, 0, nr_bytes);
-}
-
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 {
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
@@ -1089,8 +1038,6 @@ static void free_rq_clone(struct request *clone, bool must_be_mapped)
 
 	WARN_ON_ONCE(must_be_mapped && !clone->q);
 
-	blk_rq_unprep_clone(clone);
-
 	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
 		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
@@ -1821,39 +1768,13 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 		dm_complete_request(rq, r);
 }
 
-static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
-				 void *data)
+static void setup_clone(struct request *clone, struct request *rq,
+		        struct dm_rq_target_io *tio)
 {
-	struct dm_rq_target_io *tio = data;
-	struct dm_rq_clone_bio_info *info =
-		container_of(bio, struct dm_rq_clone_bio_info, clone);
-
-	info->orig = bio_orig;
-	info->tio = tio;
-	bio->bi_end_io = end_clone_bio;
-
-	return 0;
-}
-
-static int setup_clone(struct request *clone, struct request *rq,
-		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
-{
-	int r;
-
-	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
-			      dm_rq_bio_constructor, tio);
-	if (r)
-		return r;
-
-	clone->cmd = rq->cmd;
-	clone->cmd_len = rq->cmd_len;
-	clone->sense = rq->sense;
+	blk_rq_prep_clone(clone, rq);
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
-
 	tio->clone = clone;
-
-	return 0;
 }
 
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
@@ -1874,12 +1795,7 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 		clone = tio->clone;
 
 	blk_rq_init(NULL, clone);
-	if (setup_clone(clone, rq, tio, gfp_mask)) {
-		/* -ENOMEM */
-		if (alloc_clone)
-			free_clone_request(md, clone);
-		return NULL;
-	}
+	setup_clone(clone, rq, tio);
 
 	return clone;
 }
@@ -1973,11 +1889,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		}
 		if (IS_ERR(clone))
 			return DM_MAPIO_REQUEUE;
-		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
-			/* -ENOMEM */
-			ti->type->release_clone_rq(clone);
-			return DM_MAPIO_REQUEUE;
-		}
+		setup_clone(clone, rq, tio);
 	}
 
 	switch (r) {
@@ -2431,8 +2343,6 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		goto out;
 	}
 
-	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
-
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
@@ -3536,48 +3446,23 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
+					     unsigned per_bio_data_size)
 {
-	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	struct kmem_cache *cachep = NULL;
-	unsigned int pool_size = 0;
+	struct dm_md_mempools *pools;
+	unsigned int pool_size = dm_get_reserved_bio_based_ios();
 	unsigned int front_pad;
 
+	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
 		return NULL;
 
-	type = filter_md_type(type, md);
+	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
+		offsetof(struct dm_target_io, clone);
 
-	switch (type) {
-	case DM_TYPE_BIO_BASED:
-		cachep = _io_cache;
-		pool_size = dm_get_reserved_bio_based_ios();
-		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
-		break;
-	case DM_TYPE_REQUEST_BASED:
-		cachep = _rq_tio_cache;
-		pool_size = dm_get_reserved_rq_based_ios();
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-		/* fall through to setup remaining rq-based pools */
-	case DM_TYPE_MQ_REQUEST_BASED:
-		if (!pool_size)
-			pool_size = dm_get_reserved_rq_based_ios();
-		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
-		/* per_bio_data_size is not used. See __bind_mempools(). */
-		WARN_ON(per_bio_data_size != 0);
-		break;
-	default:
-		BUG();
-	}
-
-	if (cachep) {
-		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
-		if (!pools->io_pool)
-			goto out;
-	}
+	pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
+	if (!pools->io_pool)
+		goto out;
 
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3587,10 +3472,34 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
 		goto out;
 
 	return pools;
-
 out:
 	dm_free_md_mempools(pools);
+	return NULL;
+}
+
+struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
+					    unsigned type)
+{
+	unsigned int pool_size = dm_get_reserved_rq_based_ios();
+	struct dm_md_mempools *pools;
 
+	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	if (!pools)
+		return NULL;
+
+	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
+		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+		if (!pools->rq_pool)
+			goto out;
+	}
+
+	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
+	if (!pools->io_pool)
+		goto out;
+
+	return pools;
+out:
+	dm_free_md_mempools(pools);
 	return NULL;
 }
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 6123c2bf9150..e6e66d087b26 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -222,8 +222,9 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
-					    unsigned integrity, unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
+					     unsigned per_bio_data_size);
+struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, unsigned type);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
-- 
cgit v1.2.1


From 4ae9944d132b160d444fa3aa875307eb0fa3eeec Mon Sep 17 00:00:00 2001
From: Junichi Nomura <j-nomura@ce.jp.nec.com>
Date: Tue, 26 May 2015 08:25:54 +0000
Subject: dm: run queue on re-queue

Without kicking the queue, a requeued request may stay in the queue
forever if there is no other I/O activity on the device.

The original error was introduced in v2.6.39 by commit 7eaceaccab5f
("block: remove per-queue plugging"), which replaced conditional
plugging with a periodic runqueue.

Commit 9d1deb83d489 in v4.1-rc1 removed the periodic runqueue
and the problem started to manifest.

Fixes: 9d1deb83d489 ("dm: don't schedule delayed run of the queue if nothing to do")
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a930b72314ac..0bf79a0bad37 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1164,6 +1164,7 @@ static void old_requeue_request(struct request *rq)
 
 	spin_lock_irqsave(q->queue_lock, flags);
 	blk_requeue_request(q, rq);
+	blk_run_queue_async(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-- 
cgit v1.2.1


From 3a1407559a593d4360af12dd2df5296bf8eb0d28 Mon Sep 17 00:00:00 2001
From: Junichi Nomura <j-nomura@ce.jp.nec.com>
Date: Wed, 27 May 2015 04:22:07 +0000
Subject: dm: fix NULL pointer when clone_and_map_rq returns !DM_MAPIO_REMAPPED

When stacking request-based DM on blk_mq device, request cloning and
remapping are done in a single call to target's clone_and_map_rq().
The clone is allocated and valid only if clone_and_map_rq() returns
DM_MAPIO_REMAPPED.

The "IS_ERR(clone)" check in map_request() does not cover all the
!DM_MAPIO_REMAPPED cases that are possible (E.g. if underlying devices
are not ready or unavailable, clone_and_map_rq() may return
DM_MAPIO_REQUEUE without ever having established an ERR_PTR).  Fix this
by explicitly checking for a return that is not DM_MAPIO_REMAPPED in
map_request().

Without this fix, DM core may call setup_clone() for a NULL clone
and oops like this:

   BUG: unable to handle kernel NULL pointer dereference at 0000000000000068
   IP: [<ffffffff81227525>] blk_rq_prep_clone+0x7d/0x137
   ...
   CPU: 2 PID: 5793 Comm: kdmwork-253:3 Not tainted 4.0.0-nm #1
   ...
   Call Trace:
    [<ffffffffa01d1c09>] map_tio_request+0xa9/0x258 [dm_mod]
    [<ffffffff81071de9>] kthread_worker_fn+0xfd/0x150
    [<ffffffff81071cec>] ? kthread_parkme+0x24/0x24
    [<ffffffff81071cec>] ? kthread_parkme+0x24/0x24
    [<ffffffff81071fdd>] kthread+0xe6/0xee
    [<ffffffff81093a59>] ? put_lock_stats+0xe/0x20
    [<ffffffff81071ef7>] ? __init_kthread_worker+0x5b/0x5b
    [<ffffffff814c2d98>] ret_from_fork+0x58/0x90
    [<ffffffff81071ef7>] ? __init_kthread_worker+0x5b/0x5b

Fixes: e5863d9ad ("dm: allocate requests in target when stacking on blk-mq devices")
Reported-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 4.0+
---
 drivers/md/dm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 0bf79a0bad37..1c62ed8d09f4 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1972,8 +1972,8 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 			dm_kill_unmapped_request(rq, r);
 			return r;
 		}
-		if (IS_ERR(clone))
-			return DM_MAPIO_REQUEUE;
+		if (r != DM_MAPIO_REMAPPED)
+			return r;
 		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
 			/* -ENOMEM */
 			ti->type->release_clone_rq(clone);
-- 
cgit v1.2.1


From 4c6dd53dd3674c310d7379c6b3273daa9fd95c79 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 27 May 2015 15:23:56 -0400
Subject: dm mpath: fix leak of dm_mpath_io structure in blk-mq .queue_rq error
 path

Otherwise kmemleak reported:

unreferenced object 0xffff88009b14e2b0 (size 16):
  comm "fio", pid 4274, jiffies 4294978034 (age 1253.210s)
  hex dump (first 16 bytes):
    40 12 f3 99 01 88 ff ff 00 10 00 00 00 00 00 00  @...............
  backtrace:
    [<ffffffff81600029>] kmemleak_alloc+0x49/0xb0
    [<ffffffff811679a8>] kmem_cache_alloc+0xf8/0x160
    [<ffffffff8111c950>] mempool_alloc_slab+0x10/0x20
    [<ffffffff8111cb37>] mempool_alloc+0x57/0x150
    [<ffffffffa04d2b61>] __multipath_map.isra.17+0xe1/0x220 [dm_multipath]
    [<ffffffffa04d2cb5>] multipath_clone_and_map+0x15/0x20 [dm_multipath]
    [<ffffffffa02889b5>] map_request.isra.39+0xd5/0x220 [dm_mod]
    [<ffffffffa028b0e4>] dm_mq_queue_rq+0x134/0x240 [dm_mod]
    [<ffffffff812cccb5>] __blk_mq_run_hw_queue+0x1d5/0x380
    [<ffffffff812ccaa5>] blk_mq_run_hw_queue+0xc5/0x100
    [<ffffffff812ce350>] blk_sq_make_request+0x240/0x300
    [<ffffffff812c0f30>] generic_make_request+0xc0/0x110
    [<ffffffff812c0ff2>] submit_bio+0x72/0x150
    [<ffffffff811c07cb>] do_blockdev_direct_IO+0x1f3b/0x2da0
    [<ffffffff811c166e>] __blockdev_direct_IO+0x3e/0x40
    [<ffffffff8120aa1a>] ext4_direct_IO+0x1aa/0x390

Fixes: e5863d9ad ("dm: allocate requests in target when stacking on blk-mq devices")
Reported-by: Bart Van Assche <bart.vanassche@sandisk.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 4.0+
---
 drivers/md/dm-mpath.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 63953477a07c..eff7bdd7731d 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -429,9 +429,11 @@ static int __multipath_map(struct dm_target *ti, struct request *clone,
 		/* blk-mq request-based interface */
 		*__clone = blk_get_request(bdev_get_queue(bdev),
 					   rq_data_dir(rq), GFP_ATOMIC);
-		if (IS_ERR(*__clone))
+		if (IS_ERR(*__clone)) {
 			/* ENOMEM, requeue */
+			clear_mapinfo(m, map_context);
 			return r;
+		}
 		(*__clone)->bio = (*__clone)->biotail = NULL;
 		(*__clone)->rq_disk = bdev->bd_disk;
 		(*__clone)->cmd_flags |= REQ_FAILFAST_TRANSPORT;
-- 
cgit v1.2.1


From 45714fbed4556149d7f1730f5bae74f81d5e2cd5 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 27 May 2015 15:25:27 -0400
Subject: dm: requeue from blk-mq dm_mq_queue_rq() using BLK_MQ_RQ_QUEUE_BUSY

Use BLK_MQ_RQ_QUEUE_BUSY to requeue a blk-mq request directly from the
DM blk-mq device's .queue_rq.  This cleans up the previous convoluted
handling of request requeueing that would return BLK_MQ_RQ_QUEUE_OK
(even though it wasn't) and then run blk_mq_requeue_request() followed
by blk_mq_kick_requeue_list().

Also, document that DM blk-mq on top of old request_fn devices cannot
fail in clone_rq() since the clone request is preallocated as part of
the pdu.

Reported-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1c62ed8d09f4..1badfb250a18 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2754,13 +2754,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
 		/* clone request is allocated at the end of the pdu */
 		tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
-		if (!clone_rq(rq, md, tio, GFP_ATOMIC))
-			return BLK_MQ_RQ_QUEUE_BUSY;
+		(void) clone_rq(rq, md, tio, GFP_ATOMIC);
 		queue_kthread_work(&md->kworker, &tio->work);
 	} else {
 		/* Direct call is fine since .queue_rq allows allocations */
-		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
-			dm_requeue_unmapped_original_request(md, rq);
+		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+			/* Undo dm_start_request() before requeuing */
+			rq_completed(md, rq_data_dir(rq), false);
+			return BLK_MQ_RQ_QUEUE_BUSY;
+		}
 	}
 
 	return BLK_MQ_RQ_QUEUE_OK;
-- 
cgit v1.2.1


From 2b6b24574256c05be145936f1493aec74c6904e5 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 15:10:01 +1000
Subject: md/raid5: ensure whole batch is delayed for all required bitmap
 updates.

When we add a stripe to a batch, we need to be sure that
the head stripe will wait for the bitmap update required for the new
stripe.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b9f2b9cc6060..c55a68f37c72 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -837,6 +837,15 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
 		    < IO_THRESHOLD)
 			md_wakeup_thread(conf->mddev->thread);
 
+	if (test_and_clear_bit(STRIPE_BIT_DELAY, &sh->state)) {
+		int seq = sh->bm_seq;
+		if (test_bit(STRIPE_BIT_DELAY, &sh->batch_head->state) &&
+		    sh->batch_head->bm_seq > seq)
+			seq = sh->batch_head->bm_seq;
+		set_bit(STRIPE_BIT_DELAY, &sh->batch_head->state);
+		sh->batch_head->bm_seq = seq;
+	}
+
 	atomic_inc(&sh->count);
 unlock_out:
 	unlock_two_stripes(head, sh);
-- 
cgit v1.2.1


From d0852df543e5aa7db34c1ad26d053782bcbf48f1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 May 2015 08:43:45 +1000
Subject: md/raid5: close race between STRIPE_BIT_DELAY and batching.

When we add a write to a stripe we need to make sure the bitmap
bit is set.  While doing that the stripe is not locked so it could
be added to a batch after which further changes to STRIPE_BIT_DELAY
and ->bm_seq are ineffective.

So we need to hold off adding the stripe to a batch until
bitmap_startwrite has completed at least once, and we need to avoid
further changes to STRIPE_BIT_DELAY once the stripe has been added to a batch.

If a bitmap_startwrite() completes after the stripe was added to a
batch, it will not have set the bit, only incremented a counter, so no
extra delay of the stripe is needed.

Reported-by: Shaohua Li <shli@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 25 ++++++++++++++++++++++---
 drivers/md/raid5.h |  3 +++
 2 files changed, 25 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c55a68f37c72..42d0ea6c8597 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -749,6 +749,7 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
 static bool stripe_can_batch(struct stripe_head *sh)
 {
 	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
+		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
 		is_full_stripe_write(sh);
 }
 
@@ -2996,14 +2997,32 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx,
 	pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
 		(unsigned long long)(*bip)->bi_iter.bi_sector,
 		(unsigned long long)sh->sector, dd_idx);
-	spin_unlock_irq(&sh->stripe_lock);
 
 	if (conf->mddev->bitmap && firstwrite) {
+		/* Cannot hold spinlock over bitmap_startwrite,
+		 * but must ensure this isn't added to a batch until
+		 * we have added to the bitmap and set bm_seq.
+		 * So set STRIPE_BITMAP_PENDING to prevent
+		 * batching.
+		 * If multiple add_stripe_bio() calls race here they
+		 * much all set STRIPE_BITMAP_PENDING.  So only the first one
+		 * to complete "bitmap_startwrite" gets to set
+		 * STRIPE_BIT_DELAY.  This is important as once a stripe
+		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
+		 * any more.
+		 */
+		set_bit(STRIPE_BITMAP_PENDING, &sh->state);
+		spin_unlock_irq(&sh->stripe_lock);
 		bitmap_startwrite(conf->mddev->bitmap, sh->sector,
 				  STRIPE_SECTORS, 0);
-		sh->bm_seq = conf->seq_flush+1;
-		set_bit(STRIPE_BIT_DELAY, &sh->state);
+		spin_lock_irq(&sh->stripe_lock);
+		clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
+		if (!sh->batch_head) {
+			sh->bm_seq = conf->seq_flush+1;
+			set_bit(STRIPE_BIT_DELAY, &sh->state);
+		}
 	}
+	spin_unlock_irq(&sh->stripe_lock);
 
 	if (stripe_can_batch(sh))
 		stripe_add_to_batch_list(conf, sh);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 7dc0dd86074b..01cdb9f3a0c4 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -337,6 +337,9 @@ enum {
 	STRIPE_ON_RELEASE_LIST,
 	STRIPE_BATCH_READY,
 	STRIPE_BATCH_ERR,
+	STRIPE_BITMAP_PENDING,	/* Being added to bitmap, don't add
+				 * to batch yet.
+				 */
 };
 
 #define STRIPE_EXPAND_SYNC_FLAG \
-- 
cgit v1.2.1


From b15a9dbdbfe72848b7ed4cd3f97fe80daaf99c89 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 22 May 2015 15:20:04 +1000
Subject: md/raid5: Ensure a batch member is not handled prematurely.

If a stripe is a member of a batch, but not the head, it must
not be handled separately from the rest of the batch.

'clear_batch_ready()' handles this requirement to some
extent but not completely.  If a member is passed to handle_stripe()
a second time it returns '0' indicating the stripe can be handled,
which is wrong.
So add an extra test.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 42d0ea6c8597..e58736740bac 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4200,9 +4200,13 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
 
 static int clear_batch_ready(struct stripe_head *sh)
 {
+	/* Return '1' if this is a member of batch, or
+	 * '0' if it is a lone stripe or a head which can now be
+	 * handled.
+	 */
 	struct stripe_head *tmp;
 	if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state))
-		return 0;
+		return (sh->batch_head && sh->batch_head != sh);
 	spin_lock(&sh->stripe_lock);
 	if (!sh->batch_head) {
 		spin_unlock(&sh->stripe_lock);
-- 
cgit v1.2.1


From 4e3d62ff4976f26d22b8b91572a49136bb3a23f1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 11:50:16 +1000
Subject: md/raid5: remove condition test from check_break_stripe_batch_list.

handle_stripe_clean_event() contains a chunk of code very
similar to check_break_stripe_batch_list().
If we make the latter more like the former, we can end up
with just one copy of this code.

This first step removes the condition (and the 'check_' part
of the name).  This has the added advantage of making it clear
what check is being performed at the point where the function is
called.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e58736740bac..fc5c4039c394 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4234,16 +4234,11 @@ static int clear_batch_ready(struct stripe_head *sh)
 	return 0;
 }
 
-static void check_break_stripe_batch_list(struct stripe_head *sh)
+static void break_stripe_batch_list(struct stripe_head *head_sh)
 {
-	struct stripe_head *head_sh, *next;
+	struct stripe_head *sh, *next;
 	int i;
 
-	if (!test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
-		return;
-
-	head_sh = sh;
-
 	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
 
 		list_del_init(&sh->batch_list);
@@ -4290,7 +4285,8 @@ static void handle_stripe(struct stripe_head *sh)
 		return;
 	}
 
-	check_break_stripe_batch_list(sh);
+	if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
+		break_stripe_batch_list(sh);
 
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);
-- 
cgit v1.2.1


From fb642b92c267beeefd352af9bc461eac93a7552c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 12:00:47 +1000
Subject: md/raid5: duplicate some more handle_stripe_clean_event code in
 break_stripe_batch_list

break_stripe_batch_list() didn't clear head_sh->batch_head.
This was probably a bug.

Also clear all R5_Overlap flags and if any were cleared, wake up
'wait_for_overlap'.
This isn't always necessary but the worst effect is a little
extra checking for code that is waiting on wait_for_overlap.

Also, don't use wake_up_nr() because that does the wrong thing
if 'nr' is zero, and the number of flags cleared doesn't
strongly correlate with the number of threads to wake.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index fc5c4039c394..6de2e1edd492 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3557,7 +3557,8 @@ unhash:
 	spin_lock_irq(&head_sh->stripe_lock);
 	head_sh->batch_head = NULL;
 	spin_unlock_irq(&head_sh->stripe_lock);
-	wake_up_nr(&conf->wait_for_overlap, wakeup_nr);
+	if (wakeup_nr)
+		wake_up(&conf->wait_for_overlap);
 	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
 		set_bit(STRIPE_HANDLE, &head_sh->state);
 }
@@ -4238,6 +4239,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh)
 {
 	struct stripe_head *sh, *next;
 	int i;
+	int do_wakeup = 0;
 
 	list_for_each_entry_safe(sh, next, &head_sh->batch_list, batch_list) {
 
@@ -4250,10 +4252,12 @@ static void break_stripe_batch_list(struct stripe_head *head_sh)
 						 STRIPE_EXPAND_SYNC_FLAG));
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
-		for (i = 0; i < sh->disks; i++)
+		for (i = 0; i < sh->disks; i++) {
+			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
+				do_wakeup = 1;
 			sh->dev[i].flags = head_sh->dev[i].flags &
 				(~((1 << R5_WriteError) | (1 << R5_Overlap)));
-
+		}
 		spin_lock_irq(&sh->stripe_lock);
 		sh->batch_head = NULL;
 		spin_unlock_irq(&sh->stripe_lock);
@@ -4261,6 +4265,15 @@ static void break_stripe_batch_list(struct stripe_head *head_sh)
 		set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
 	}
+	spin_lock_irq(&head_sh->stripe_lock);
+	head_sh->batch_head = NULL;
+	spin_unlock_irq(&head_sh->stripe_lock);
+	for (i = 0; i < head_sh->disks; i++)
+		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
+			do_wakeup = 1;
+
+	if (do_wakeup)
+		wake_up(&head_sh->raid_conf->wait_for_overlap);
 }
 
 static void handle_stripe(struct stripe_head *sh)
-- 
cgit v1.2.1


From 3960ce796198254b7a1b420dc9a26d80928523bd Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 12:20:36 +1000
Subject: md/raid5: add handle_flags arg to break_stripe_batch_list.

When we break a stripe_batch_list we sometimes want to set
STRIPE_HANDLE on the individual stripes, and sometimes not.

So pass a 'handle_flags' arg.  If it is zero, always set STRIPE_HANDLE
(on non-head stripes).  If not zero, only set it if any of the given
flags are present.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6de2e1edd492..0b65eb51e562 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4235,7 +4235,8 @@ static int clear_batch_ready(struct stripe_head *sh)
 	return 0;
 }
 
-static void break_stripe_batch_list(struct stripe_head *head_sh)
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+				    unsigned long handle_flags)
 {
 	struct stripe_head *sh, *next;
 	int i;
@@ -4261,8 +4262,9 @@ static void break_stripe_batch_list(struct stripe_head *head_sh)
 		spin_lock_irq(&sh->stripe_lock);
 		sh->batch_head = NULL;
 		spin_unlock_irq(&sh->stripe_lock);
-
-		set_bit(STRIPE_HANDLE, &sh->state);
+		if (handle_flags == 0 ||
+		    sh->state & handle_flags)
+			set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
 	}
 	spin_lock_irq(&head_sh->stripe_lock);
@@ -4271,6 +4273,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh)
 	for (i = 0; i < head_sh->disks; i++)
 		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
 			do_wakeup = 1;
+	if (head_sh->state & handle_flags)
+		set_bit(STRIPE_HANDLE, &head_sh->state);
 
 	if (do_wakeup)
 		wake_up(&head_sh->raid_conf->wait_for_overlap);
@@ -4299,7 +4303,7 @@ static void handle_stripe(struct stripe_head *sh)
 	}
 
 	if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state))
-		break_stripe_batch_list(sh);
+		break_stripe_batch_list(sh, 0);
 
 	if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
 		spin_lock(&sh->stripe_lock);
-- 
cgit v1.2.1


From 1b956f7a8f9aa63ea9644ab8c3374cf381993363 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 12:40:26 +1000
Subject: md/raid5: be more selective about distributing flags across batch.

When a batch of stripes is broken up, we keep some of the flags
that were per-stripe, and copy other flags from the head to all
others.

This only happens while a stripe is being handled, so many of the
flags are irrelevant.

The "SYNC_FLAGS" (which I've renamed to make it clear there are
several) and STRIPE_DEGRADED are set per-stripe and so need to be
preserved.  STRIPE_INSYNC is the only flag that is set on the head
that needs to be propagated to all others.

For safety, add a WARN_ON if others are set, except:
 STRIPE_HANDLE - this is safe and per-stripe and we are going to set
      in several cases anyway
 STRIPE_INSYNC
 STRIPE_IO_STARTED - this is just a hint and doesn't hurt.
 STRIPE_ON_PLUG_LIST
 STRIPE_ON_RELEASE_LIST - It is a bit pointless for a batched
           stripe to be on one of these lists, but it can happen
           and can be safely ignored.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 55 +++++++++++++++++++++++++++++++++++++++++++-----------
 drivers/md/raid5.h |  2 +-
 2 files changed, 45 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0b65eb51e562..1141b7f62e6e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3534,10 +3534,27 @@ unhash:
 				      struct stripe_head, batch_list);
 		list_del_init(&sh->batch_list);
 
-		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
-			      head_sh->state & ~((1 << STRIPE_ACTIVE) |
-						 (1 << STRIPE_PREREAD_ACTIVE) |
-						 STRIPE_EXPAND_SYNC_FLAG));
+		WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+					  (1 << STRIPE_SYNCING) |
+					  (1 << STRIPE_REPLACED) |
+					  (1 << STRIPE_PREREAD_ACTIVE) |
+					  (1 << STRIPE_DELAYED) |
+					  (1 << STRIPE_BIT_DELAY) |
+					  (1 << STRIPE_FULL_WRITE) |
+					  (1 << STRIPE_BIOFILL_RUN) |
+					  (1 << STRIPE_COMPUTE_RUN)  |
+					  (1 << STRIPE_OPS_REQ_PENDING) |
+					  (1 << STRIPE_DISCARD) |
+					  (1 << STRIPE_BATCH_READY) |
+					  (1 << STRIPE_BATCH_ERR) |
+					  (1 << STRIPE_BITMAP_PENDING)));
+		WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+					      (1 << STRIPE_REPLACED)));
+
+		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+					    (1 << STRIPE_DEGRADED)),
+			      head_sh->state & (1 << STRIPE_INSYNC));
+
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
 		for (i = 0; i < sh->disks; i++) {
@@ -3549,7 +3566,7 @@ unhash:
 		spin_lock_irq(&sh->stripe_lock);
 		sh->batch_head = NULL;
 		spin_unlock_irq(&sh->stripe_lock);
-		if (sh->state & STRIPE_EXPAND_SYNC_FLAG)
+		if (sh->state & STRIPE_EXPAND_SYNC_FLAGS)
 			set_bit(STRIPE_HANDLE, &sh->state);
 		release_stripe(sh);
 	}
@@ -3559,7 +3576,7 @@ unhash:
 	spin_unlock_irq(&head_sh->stripe_lock);
 	if (wakeup_nr)
 		wake_up(&conf->wait_for_overlap);
-	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAG)
+	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAGS)
 		set_bit(STRIPE_HANDLE, &head_sh->state);
 }
 
@@ -4246,11 +4263,27 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 
 		list_del_init(&sh->batch_list);
 
-		set_mask_bits(&sh->state, ~STRIPE_EXPAND_SYNC_FLAG,
-			      head_sh->state & ~((1 << STRIPE_ACTIVE) |
-						 (1 << STRIPE_PREREAD_ACTIVE) |
-						 (1 << STRIPE_DEGRADED) |
-						 STRIPE_EXPAND_SYNC_FLAG));
+		WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
+					  (1 << STRIPE_SYNCING) |
+					  (1 << STRIPE_REPLACED) |
+					  (1 << STRIPE_PREREAD_ACTIVE) |
+					  (1 << STRIPE_DELAYED) |
+					  (1 << STRIPE_BIT_DELAY) |
+					  (1 << STRIPE_FULL_WRITE) |
+					  (1 << STRIPE_BIOFILL_RUN) |
+					  (1 << STRIPE_COMPUTE_RUN)  |
+					  (1 << STRIPE_OPS_REQ_PENDING) |
+					  (1 << STRIPE_DISCARD) |
+					  (1 << STRIPE_BATCH_READY) |
+					  (1 << STRIPE_BATCH_ERR) |
+					  (1 << STRIPE_BITMAP_PENDING)));
+		WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
+					      (1 << STRIPE_REPLACED)));
+
+		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
+					    (1 << STRIPE_DEGRADED)),
+			      head_sh->state & (1 << STRIPE_INSYNC));
+
 		sh->check_state = head_sh->check_state;
 		sh->reconstruct_state = head_sh->reconstruct_state;
 		for (i = 0; i < sh->disks; i++) {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 01cdb9f3a0c4..896d603ad0da 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -342,7 +342,7 @@ enum {
 				 */
 };
 
-#define STRIPE_EXPAND_SYNC_FLAG \
+#define STRIPE_EXPAND_SYNC_FLAGS \
 	((1 << STRIPE_EXPAND_SOURCE) |\
 	(1 << STRIPE_EXPAND_READY) |\
 	(1 << STRIPE_EXPANDING) |\
-- 
cgit v1.2.1


From 787b76fa37159050f6d26aebfa6210009baed93b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 21 May 2015 12:56:41 +1000
Subject: md/raid5: call break_stripe_batch_list from handle_stripe_clean_event

Now that the code in break_stripe_batch_list() is nearly identical
to the end of handle_stripe_clean_event, replace the latter
with a function call.

The only remaining difference of any interest is the masking that is
applied to dev[i].flags copied from head_sh.
R5_WriteError certainly isn't wanted as it is set per-stripe, not
per-batch.  R5_Overlap isn't wanted as it is explicitly handled.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 61 ++++--------------------------------------------------
 1 file changed, 4 insertions(+), 57 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1141b7f62e6e..3254504b1080 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3420,6 +3420,8 @@ static void handle_stripe_fill(struct stripe_head *sh,
 	set_bit(STRIPE_HANDLE, &sh->state);
 }
 
+static void break_stripe_batch_list(struct stripe_head *head_sh,
+				    unsigned long handle_flags);
 /* handle_stripe_clean_event
  * any written block on an uptodate or failed drive can be returned.
  * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
@@ -3433,7 +3435,6 @@ static void handle_stripe_clean_event(struct r5conf *conf,
 	int discard_pending = 0;
 	struct stripe_head *head_sh = sh;
 	bool do_endio = false;
-	int wakeup_nr = 0;
 
 	for (i = disks; i--; )
 		if (sh->dev[i].written) {
@@ -3522,62 +3523,8 @@ unhash:
 		if (atomic_dec_and_test(&conf->pending_full_writes))
 			md_wakeup_thread(conf->mddev->thread);
 
-	if (!head_sh->batch_head || !do_endio)
-		return;
-	for (i = 0; i < head_sh->disks; i++) {
-		if (test_and_clear_bit(R5_Overlap, &head_sh->dev[i].flags))
-			wakeup_nr++;
-	}
-	while (!list_empty(&head_sh->batch_list)) {
-		int i;
-		sh = list_first_entry(&head_sh->batch_list,
-				      struct stripe_head, batch_list);
-		list_del_init(&sh->batch_list);
-
-		WARN_ON_ONCE(sh->state & ((1 << STRIPE_ACTIVE) |
-					  (1 << STRIPE_SYNCING) |
-					  (1 << STRIPE_REPLACED) |
-					  (1 << STRIPE_PREREAD_ACTIVE) |
-					  (1 << STRIPE_DELAYED) |
-					  (1 << STRIPE_BIT_DELAY) |
-					  (1 << STRIPE_FULL_WRITE) |
-					  (1 << STRIPE_BIOFILL_RUN) |
-					  (1 << STRIPE_COMPUTE_RUN)  |
-					  (1 << STRIPE_OPS_REQ_PENDING) |
-					  (1 << STRIPE_DISCARD) |
-					  (1 << STRIPE_BATCH_READY) |
-					  (1 << STRIPE_BATCH_ERR) |
-					  (1 << STRIPE_BITMAP_PENDING)));
-		WARN_ON_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
-					      (1 << STRIPE_REPLACED)));
-
-		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
-					    (1 << STRIPE_DEGRADED)),
-			      head_sh->state & (1 << STRIPE_INSYNC));
-
-		sh->check_state = head_sh->check_state;
-		sh->reconstruct_state = head_sh->reconstruct_state;
-		for (i = 0; i < sh->disks; i++) {
-			if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-				wakeup_nr++;
-			sh->dev[i].flags = head_sh->dev[i].flags;
-		}
-
-		spin_lock_irq(&sh->stripe_lock);
-		sh->batch_head = NULL;
-		spin_unlock_irq(&sh->stripe_lock);
-		if (sh->state & STRIPE_EXPAND_SYNC_FLAGS)
-			set_bit(STRIPE_HANDLE, &sh->state);
-		release_stripe(sh);
-	}
-
-	spin_lock_irq(&head_sh->stripe_lock);
-	head_sh->batch_head = NULL;
-	spin_unlock_irq(&head_sh->stripe_lock);
-	if (wakeup_nr)
-		wake_up(&conf->wait_for_overlap);
-	if (head_sh->state & STRIPE_EXPAND_SYNC_FLAGS)
-		set_bit(STRIPE_HANDLE, &head_sh->state);
+	if (head_sh->batch_head && do_endio)
+		break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS);
 }
 
 static void handle_stripe_dirtying(struct r5conf *conf,
-- 
cgit v1.2.1


From 626f2092c85ac847bb80b3257eb6a565dec32278 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 22 May 2015 14:03:10 +1000
Subject: md/raid5: break stripe-batches when the array has failed.

Once the array has too many failed devices, we need to break
stripe-batches up so they can all be dealt with.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3254504b1080..553d54b87052 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4337,6 +4337,7 @@ static void handle_stripe(struct stripe_head *sh)
 	if (s.failed > conf->max_degraded) {
 		sh->check_state = 0;
 		sh->reconstruct_state = 0;
+		break_stripe_batch_list(sh, 0);
 		if (s.to_read+s.to_write+s.written)
 			handle_failed_stripe(conf, sh, &s, disks, &s.return_bi);
 		if (s.syncing + s.replacing)
-- 
cgit v1.2.1


From 56ccc1125bc141cf63927eda7febff4216dea2d3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 28 May 2015 17:53:29 +1000
Subject: md: fix race when unfreezing sync_action

A recent change removed the need for locking around writing
to "sync_action" (and various other places), but introduced a
subtle race.
When e.g. setting 'reshape' on a 'frozen' array, the 'frozen'
flag is cleared before 'reshape' is set, so the md thread can
get in and start trying recovery - which isn't wanted.

So instead of clearing MD_RECOVERY_FROZEN for any command
except 'frozen', only clear it when each specific command
is parsed.  This allows the handling of 'reshape' to clear
the bit while a lock is held.

Also remove some places where we set MD_RECOVERY_NEEDED,
as it is always set on non-error exit of the function.


Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: 6791875e2e53 ("md: make reconfig_mutex optional for writes to md sysfs files.")
---
 drivers/md/md.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d4f31e195e26..8f10f4ea70ea 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4211,12 +4211,12 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 	if (!mddev->pers || !mddev->pers->sync_request)
 		return -EINVAL;
 
-	if (cmd_match(page, "frozen"))
-		set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-	else
-		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 
 	if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
+		if (cmd_match(page, "frozen"))
+			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+		else
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		flush_workqueue(md_misc_wq);
 		if (mddev->sync_thread) {
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
@@ -4229,16 +4229,17 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
 		return -EBUSY;
 	else if (cmd_match(page, "resync"))
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 	else if (cmd_match(page, "recover")) {
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
-		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	} else if (cmd_match(page, "reshape")) {
 		int err;
 		if (mddev->pers->start_reshape == NULL)
 			return -EINVAL;
 		err = mddev_lock(mddev);
 		if (!err) {
+			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 			err = mddev->pers->start_reshape(mddev);
 			mddev_unlock(mddev);
 		}
@@ -4250,6 +4251,7 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 			set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 		else if (!cmd_match(page, "repair"))
 			return -EINVAL;
+		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
 		set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	}
-- 
cgit v1.2.1


From e5d8de32cc02a259e1a237ab57cba00f2930fa6a Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 28 May 2015 15:12:52 -0400
Subject: dm: fix false warning in free_rq_clone() for unmapped requests

When stacking a request-based dm device on a non blk-mq device, if the
device-mapper target cannot map the request (error target is used,
multipath target with all paths down, etc), the WARN_ON_ONCE() in
free_rq_clone() will trigger when it shouldn't.

The warning was added by commit aa6df8d ("dm: fix free_rq_clone() NULL
pointer when requeueing unmapped request").  But free_rq_clone() with
clone->q == NULL is valid usage for the case where
dm_kill_unmapped_request() initiates request cleanup.

Fix this false warning by just removing the WARN_ON -- it only generated
false positives and was never useful in catching the intended case
(completing clone request not being mapped e.g. clone->q being NULL).

Fixes: aa6df8d ("dm: fix free_rq_clone() NULL pointer when requeueing unmapped request")
Reported-by: Bart Van Assche <bart.vanassche@sandisk.com>
Reported-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1badfb250a18..e24069aaeb18 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1082,13 +1082,11 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 	dm_put(md);
 }
 
-static void free_rq_clone(struct request *clone, bool must_be_mapped)
+static void free_rq_clone(struct request *clone)
 {
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 
-	WARN_ON_ONCE(must_be_mapped && !clone->q);
-
 	blk_rq_unprep_clone(clone);
 
 	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
@@ -1132,7 +1130,7 @@ static void dm_end_request(struct request *clone, int error)
 			rq->sense_len = clone->sense_len;
 	}
 
-	free_rq_clone(clone, true);
+	free_rq_clone(clone);
 	if (!rq->q->mq_ops)
 		blk_end_request_all(rq, error);
 	else
@@ -1151,7 +1149,7 @@ static void dm_unprep_request(struct request *rq)
 	}
 
 	if (clone)
-		free_rq_clone(clone, false);
+		free_rq_clone(clone);
 }
 
 /*
-- 
cgit v1.2.1


From 15b94a690470038aa08247eedbebbe7e2218d5ee Mon Sep 17 00:00:00 2001
From: Junichi Nomura <j-nomura@ce.jp.nec.com>
Date: Fri, 29 May 2015 08:51:03 +0000
Subject: dm: fix reload failure of 0 path multipath mapping on blk-mq devices

dm-multipath accepts 0 path mapping.

  # echo '0 2097152 multipath 0 0 0 0' | dmsetup create newdev

Such a mapping can be used to release underlying devices while still
holding requests in its queue until working paths come back.

However, once the multipath device is created over blk-mq devices,
it rejects reloading of 0 path mapping:

  # echo '0 2097152 multipath 0 0 1 1 queue-length 0 1 1 /dev/sda 1' \
      | dmsetup create mpath1
  # echo '0 2097152 multipath 0 0 0 0' | dmsetup load mpath1
  device-mapper: reload ioctl on mpath1 failed: Invalid argument
  Command failed

With following kernel message:
  device-mapper: ioctl: can't change device type after initial table load.

DM tries to inherit the current table type using dm_table_set_type()
but it doesn't work as expected because of unnecessary check about
whether the target type is hybrid or not.

Hybrid type is for targets that work as either request-based or bio-based
and not required for blk-mq or non blk-mq checking.

Fixes: 65803c205983 ("dm table: train hybrid target type detection to select blk-mq if appropriate")
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index d9b00b8565c6..16ba55ad7089 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -820,6 +820,12 @@ void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
 }
 EXPORT_SYMBOL(dm_consume_args);
 
+static bool __table_type_request_based(unsigned table_type)
+{
+	return (table_type == DM_TYPE_REQUEST_BASED ||
+		table_type == DM_TYPE_MQ_REQUEST_BASED);
+}
+
 static int dm_table_set_type(struct dm_table *t)
 {
 	unsigned i;
@@ -852,8 +858,7 @@ static int dm_table_set_type(struct dm_table *t)
 		 * Determine the type from the live device.
 		 * Default to bio-based if device is new.
 		 */
-		if (live_md_type == DM_TYPE_REQUEST_BASED ||
-		    live_md_type == DM_TYPE_MQ_REQUEST_BASED)
+		if (__table_type_request_based(live_md_type))
 			request_based = 1;
 		else
 			bio_based = 1;
@@ -903,7 +908,7 @@ static int dm_table_set_type(struct dm_table *t)
 			}
 		t->type = DM_TYPE_MQ_REQUEST_BASED;
 
-	} else if (hybrid && list_empty(devices) && live_md_type != DM_TYPE_NONE) {
+	} else if (list_empty(devices) && __table_type_request_based(live_md_type)) {
 		/* inherit live MD type */
 		t->type = live_md_type;
 
@@ -925,10 +930,7 @@ struct target_type *dm_table_get_immutable_target_type(struct dm_table *t)
 
 bool dm_table_request_based(struct dm_table *t)
 {
-	unsigned table_type = dm_table_get_type(t);
-
-	return (table_type == DM_TYPE_REQUEST_BASED ||
-		table_type == DM_TYPE_MQ_REQUEST_BASED);
+	return __table_type_request_based(dm_table_get_type(t));
 }
 
 bool dm_table_mq_request_based(struct dm_table *t)
-- 
cgit v1.2.1


From 1c220c69ce0dcc0f234a9f263ad9c0864f971852 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 29 May 2015 14:52:51 +0100
Subject: dm: fix casting bug in dm_merge_bvec()

dm_merge_bvec() was originally added in f6fccb ("dm: introduce
merge_bvec_fn").  In that commit a value in sectors is converted to
bytes using << 9, and then assigned to an int.  This code made
assumptions about the value of BIO_MAX_SECTORS.

A later commit 148e51 ("dm: improve documentation and code clarity in
dm_merge_bvec") was meant to have no functional change but it removed
the use of BIO_MAX_SECTORS in favor of using queue_max_sectors().  At
this point the cast from sector_t to int resulted in a zero value.  The
fallout being dm_merge_bvec() would only allow a single page to be added
to a bio.

This interim fix is minimal for the benefit of stable@ because the more
comprehensive cleanup of passing a sector_t to all DM targets' merge
function will impact quite a few DM targets.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 3.19+
---
 drivers/md/dm.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e24069aaeb18..2caf492890d6 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1723,8 +1723,7 @@ static int dm_merge_bvec(struct request_queue *q,
 	struct mapped_device *md = q->queuedata;
 	struct dm_table *map = dm_get_live_table_fast(md);
 	struct dm_target *ti;
-	sector_t max_sectors;
-	int max_size = 0;
+	sector_t max_sectors, max_size = 0;
 
 	if (unlikely(!map))
 		goto out;
@@ -1739,8 +1738,16 @@ static int dm_merge_bvec(struct request_queue *q,
 	max_sectors = min(max_io_len(bvm->bi_sector, ti),
 			  (sector_t) queue_max_sectors(q));
 	max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
-	if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
-		max_size = 0;
+
+	/*
+	 * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t
+	 * to the targets' merge function since it holds sectors not bytes).
+	 * Just doing this as an interim fix for stable@ because the more
+	 * comprehensive cleanup of switching to sector_t will impact every
+	 * DM target that implements a ->merge hook.
+	 */
+	if (max_size > INT_MAX)
+		max_size = INT_MAX;
 
 	/*
 	 * merge_bvec_fn() returns number of bytes
@@ -1748,7 +1755,7 @@ static int dm_merge_bvec(struct request_queue *q,
 	 * max is precomputed maximal io size
 	 */
 	if (max_size && ti->type->merge)
-		max_size = ti->type->merge(ti, bvm, biovec, max_size);
+		max_size = ti->type->merge(ti, bvm, biovec, (int) max_size);
 	/*
 	 * If the target doesn't support merge method and some of the devices
 	 * provided their merge_bvec method (we know this by looking for the
-- 
cgit v1.2.1


From cbc4e3c1350beb47beab8f34ad9be3d34a20c705 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 27 Apr 2015 16:37:50 -0400
Subject: dm: do not allocate any mempools for blk-mq request-based DM

Do not allocate the io_pool mempool for blk-mq request-based DM
(DM_TYPE_MQ_REQUEST_BASED) in dm_alloc_rq_mempools().

Also refine __bind_mempools() to have more precise awareness of which
mempools each type of DM device uses -- avoids mempool churn when
reloading DM tables (particularly for DM_TYPE_REQUEST_BASED).

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c |  4 +--
 drivers/md/dm.c       | 69 ++++++++++++++++++++++++++++-----------------------
 2 files changed, 40 insertions(+), 33 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a5f94125ad01..85e1d39e9a38 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -964,8 +964,8 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 		return -EINVAL;
 	}
 
-	if (!t->mempools)
-		return -ENOMEM;
+	if (IS_ERR(t->mempools))
+		return PTR_ERR(t->mempools);
 
 	return 0;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4d6f089a0e9e..916f6015981c 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2323,39 +2323,52 @@ static void free_dev(struct mapped_device *md)
 	kfree(md);
 }
 
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+	if (type == DM_TYPE_BIO_BASED)
+		return type;
+
+	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
+}
+
 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
 	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-	if (md->bs) {
-		/* The md already has necessary mempools. */
-		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
+	switch (filter_md_type(dm_table_get_type(t), md)) {
+	case DM_TYPE_BIO_BASED:
+		if (md->bs && md->io_pool) {
 			/*
+			 * This bio-based md already has necessary mempools.
 			 * Reload bioset because front_pad may have changed
 			 * because a different table was loaded.
 			 */
 			bioset_free(md->bs);
 			md->bs = p->bs;
 			p->bs = NULL;
+			goto out;
 		}
-		/*
-		 * There's no need to reload with request-based dm
-		 * because the size of front_pad doesn't change.
-		 * Note for future: If you are to reload bioset,
-		 * prep-ed requests in the queue may refer
-		 * to bio from the old bioset, so you must walk
-		 * through the queue to unprep.
-		 */
-		goto out;
+		break;
+	case DM_TYPE_REQUEST_BASED:
+		if (md->rq_pool && md->io_pool)
+			/*
+			 * This request-based md already has necessary mempools.
+			 */
+			goto out;
+		break;
+	case DM_TYPE_MQ_REQUEST_BASED:
+		BUG_ON(p); /* No mempools needed */
+		return;
 	}
 
+	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
 	p->rq_pool = NULL;
 	md->bs = p->bs;
 	p->bs = NULL;
-
 out:
 	/* mempool bind completed, no longer need any mempools in the table */
 	dm_table_free_md_mempools(t);
@@ -2734,14 +2747,6 @@ out_tag_set:
 	return err;
 }
 
-static unsigned filter_md_type(unsigned type, struct mapped_device *md)
-{
-	if (type == DM_TYPE_BIO_BASED)
-		return type;
-
-	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
-}
-
 /*
  * Setup the DM device's queue based on md's type
  */
@@ -3463,7 +3468,7 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 
 	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
 		offsetof(struct dm_target_io, clone);
@@ -3482,24 +3487,26 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 	return pools;
 out:
 	dm_free_md_mempools(pools);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
 					    unsigned type)
 {
-	unsigned int pool_size = dm_get_reserved_rq_based_ios();
+	unsigned int pool_size;
 	struct dm_md_mempools *pools;
 
+	if (filter_md_type(type, md) == DM_TYPE_MQ_REQUEST_BASED)
+		return NULL; /* No mempools needed */
+
+	pool_size = dm_get_reserved_rq_based_ios();
 	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
-	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-	}
+	pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+	if (!pools->rq_pool)
+		goto out;
 
 	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
 	if (!pools->io_pool)
@@ -3508,7 +3515,7 @@ struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
 	return pools;
 out:
 	dm_free_md_mempools(pools);
-	return NULL;
+	return ERR_PTR(-ENOMEM);
 }
 
 void dm_free_md_mempools(struct dm_md_mempools *pools)
-- 
cgit v1.2.1


From 2d76fff18fd12284493456b01c998e540b140c23 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 29 Apr 2015 12:07:12 -0400
Subject: dm: cleanup methods that requeue requests

More often than not a request that is requeued _is_ mapped (meaning the
clone request is allocated and clone->q is initialized).  Rename
dm_requeue_unmapped_original_request() to avoid potential confusion due
to function name containing "unmapped".

Also, remove dm_requeue_unmapped_request() since callers can easily call
the dm_requeue_original_request() directly.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 916f6015981c..4b6cb1220182 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1113,8 +1113,8 @@ static void old_requeue_request(struct request *rq)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void dm_requeue_unmapped_original_request(struct mapped_device *md,
-						 struct request *rq)
+static void dm_requeue_original_request(struct mapped_device *md,
+					struct request *rq)
 {
 	int rw = rq_data_dir(rq);
 
@@ -1130,13 +1130,6 @@ static void dm_requeue_unmapped_original_request(struct mapped_device *md,
 	rq_completed(md, rw, false);
 }
 
-static void dm_requeue_unmapped_request(struct request *clone)
-{
-	struct dm_rq_target_io *tio = clone->end_io_data;
-
-	dm_requeue_unmapped_original_request(tio->md, tio->orig);
-}
-
 static void old_stop_queue(struct request_queue *q)
 {
 	unsigned long flags;
@@ -1200,7 +1193,7 @@ static void dm_done(struct request *clone, int error, bool mapped)
 		return;
 	else if (r == DM_ENDIO_REQUEUE)
 		/* The target wants to requeue the I/O */
-		dm_requeue_unmapped_request(clone);
+		dm_requeue_original_request(tio->md, tio->orig);
 	else {
 		DMWARN("unimplemented target endio return value: %d", r);
 		BUG();
@@ -1910,7 +1903,7 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		break;
 	case DM_MAPIO_REQUEUE:
 		/* The target wants to requeue the I/O */
-		dm_requeue_unmapped_request(clone);
+		dm_requeue_original_request(md, tio->orig);
 		break;
 	default:
 		if (r > 0) {
@@ -1933,7 +1926,7 @@ static void map_tio_request(struct kthread_work *work)
 	struct mapped_device *md = tio->md;
 
 	if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
-		dm_requeue_unmapped_original_request(md, rq);
+		dm_requeue_original_request(md, rq);
 }
 
 static void dm_start_request(struct mapped_device *md, struct request *orig)
-- 
cgit v1.2.1


From 0f20972f7bf6922df49ef7ce7a6df802347d2c52 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 28 Apr 2015 11:50:29 -0400
Subject: dm: factor out a common cleanup_mapped_device()

Introduce a single common method for cleaning up a DM device's
mapped_device.  No functional change, just eliminates duplication of
delicate mapped_device cleanup code.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 78 +++++++++++++++++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 35 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 4b6cb1220182..767bce906588 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2166,6 +2166,40 @@ static void dm_init_old_md_queue(struct mapped_device *md)
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 }
 
+static void cleanup_mapped_device(struct mapped_device *md)
+{
+	cleanup_srcu_struct(&md->io_barrier);
+
+	if (md->wq)
+		destroy_workqueue(md->wq);
+	if (md->kworker_task)
+		kthread_stop(md->kworker_task);
+	if (md->io_pool)
+		mempool_destroy(md->io_pool);
+	if (md->rq_pool)
+		mempool_destroy(md->rq_pool);
+	if (md->bs)
+		bioset_free(md->bs);
+
+	if (md->disk) {
+		spin_lock(&_minor_lock);
+		md->disk->private_data = NULL;
+		spin_unlock(&_minor_lock);
+		if (blk_get_integrity(md->disk))
+			blk_integrity_unregister(md->disk);
+		del_gendisk(md->disk);
+		put_disk(md->disk);
+	}
+
+	if (md->queue)
+		blk_cleanup_queue(md->queue);
+
+	if (md->bdev) {
+		bdput(md->bdev);
+		md->bdev = NULL;
+	}
+}
+
 /*
  * Allocate and initialise a blank device with a given minor.
  */
@@ -2211,13 +2245,13 @@ static struct mapped_device *alloc_dev(int minor)
 
 	md->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!md->queue)
-		goto bad_queue;
+		goto bad;
 
 	dm_init_md_queue(md);
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
-		goto bad_disk;
+		goto bad;
 
 	atomic_set(&md->pending[0], 0);
 	atomic_set(&md->pending[1], 0);
@@ -2238,11 +2272,11 @@ static struct mapped_device *alloc_dev(int minor)
 
 	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
 	if (!md->wq)
-		goto bad_thread;
+		goto bad;
 
 	md->bdev = bdget_disk(md->disk, 0);
 	if (!md->bdev)
-		goto bad_bdev;
+		goto bad;
 
 	bio_init(&md->flush_bio);
 	md->flush_bio.bi_bdev = md->bdev;
@@ -2259,15 +2293,8 @@ static struct mapped_device *alloc_dev(int minor)
 
 	return md;
 
-bad_bdev:
-	destroy_workqueue(md->wq);
-bad_thread:
-	del_gendisk(md->disk);
-	put_disk(md->disk);
-bad_disk:
-	blk_cleanup_queue(md->queue);
-bad_queue:
-	cleanup_srcu_struct(&md->io_barrier);
+bad:
+	cleanup_mapped_device(md);
 bad_io_barrier:
 	free_minor(minor);
 bad_minor:
@@ -2284,32 +2311,13 @@ static void free_dev(struct mapped_device *md)
 	int minor = MINOR(disk_devt(md->disk));
 
 	unlock_fs(md);
-	destroy_workqueue(md->wq);
 
-	if (md->kworker_task)
-		kthread_stop(md->kworker_task);
-	if (md->io_pool)
-		mempool_destroy(md->io_pool);
-	if (md->rq_pool)
-		mempool_destroy(md->rq_pool);
-	if (md->bs)
-		bioset_free(md->bs);
+	cleanup_mapped_device(md);
+	if (md->use_blk_mq)
+		blk_mq_free_tag_set(&md->tag_set);
 
-	cleanup_srcu_struct(&md->io_barrier);
 	free_table_devices(&md->table_devices);
 	dm_stats_cleanup(&md->stats);
-
-	spin_lock(&_minor_lock);
-	md->disk->private_data = NULL;
-	spin_unlock(&_minor_lock);
-	if (blk_get_integrity(md->disk))
-		blk_integrity_unregister(md->disk);
-	del_gendisk(md->disk);
-	put_disk(md->disk);
-	blk_cleanup_queue(md->queue);
-	if (md->use_blk_mq)
-		blk_mq_free_tag_set(&md->tag_set);
-	bdput(md->bdev);
 	free_minor(minor);
 
 	module_put(THIS_MODULE);
-- 
cgit v1.2.1


From 8b908f8e94540296de95682640281a95ee5d320c Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 13 May 2015 17:53:13 -0400
Subject: dm thin: cleanup overwrite's endio restore to be centralized

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e852602c0091..c0fbb6c306b2 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -705,6 +705,8 @@ static void overwrite_endio(struct bio *bio, int err)
 	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 	struct dm_thin_new_mapping *m = h->overwrite_mapping;
 
+	bio->bi_end_io = m->saved_bi_end_io;
+
 	m->err = err;
 	complete_mapping_preparation(m);
 }
@@ -793,9 +795,6 @@ static void inc_remap_and_issue_cell(struct thin_c *tc,
 
 static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 {
-	if (m->bio)
-		m->bio->bi_end_io = m->saved_bi_end_io;
-
 	cell_error(m->tc->pool, m->cell);
 	list_del(&m->list);
 	mempool_free(m, m->tc->pool->mapping_pool);
@@ -805,13 +804,9 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
 	struct pool *pool = tc->pool;
-	struct bio *bio;
+	struct bio *bio = m->bio;
 	int r;
 
-	bio = m->bio;
-	if (bio)
-		bio->bi_end_io = m->saved_bi_end_io;
-
 	if (m->err) {
 		cell_error(pool, m->cell);
 		goto out;
-- 
cgit v1.2.1


From f8ae75253e4174089ffe3046715c679183f1919f Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 14 May 2015 11:28:37 -0400
Subject: dm thin: cleanup schedule_zero() to read more logically

The overwrite has only ever been about optimizing away the need to zero a
block if the entire block was being overwritten.  As such it is only
relevant when zeroing is enabled.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Joe Thornber <ejt@redhat.com>
---
 drivers/md/dm-thin.c | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c0fbb6c306b2..c552df7b3420 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -1086,16 +1086,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 	 * zeroing pre-existing data, we can issue the bio immediately.
 	 * Otherwise we use kcopyd to zero the data first.
 	 */
-	if (!pool->pf.zero_new_blocks)
+	if (pool->pf.zero_new_blocks) {
+		if (io_overwrites_block(pool, bio))
+			remap_and_issue_overwrite(tc, bio, data_block, m);
+		else
+			ll_zero(tc, m, data_block * pool->sectors_per_block,
+				(data_block + 1) * pool->sectors_per_block);
+	} else
 		process_prepared_mapping(m);
-
-	else if (io_overwrites_block(pool, bio))
-		remap_and_issue_overwrite(tc, bio, data_block, m);
-
-	else
-		ll_zero(tc, m,
-			data_block * pool->sectors_per_block,
-			(data_block + 1) * pool->sectors_per_block);
 }
 
 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
-- 
cgit v1.2.1


From 49f154c7327139c275c9392b5fd69cad82f106bc Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 23 Apr 2015 15:06:27 -0400
Subject: dm thin metadata: remove in-core 'read_only' flag

Leverage the block manager's read_only flag instead of duplicating it;
access with new dm_bm_is_read_only() method.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c                 | 6 +-----
 drivers/md/persistent-data/dm-block-manager.c | 6 ++++++
 drivers/md/persistent-data/dm-block-manager.h | 1 +
 3 files changed, 8 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 79f694120ddf..cb6dd055053d 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -184,7 +184,6 @@ struct dm_pool_metadata {
 	uint64_t trans_id;
 	unsigned long flags;
 	sector_t data_block_size;
-	bool read_only:1;
 
 	/*
 	 * Set if a transaction has to be aborted but the attempt to roll back
@@ -836,7 +835,6 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
 	init_rwsem(&pmd->root_lock);
 	pmd->time = 0;
 	INIT_LIST_HEAD(&pmd->thin_devices);
-	pmd->read_only = false;
 	pmd->fail_io = false;
 	pmd->bdev = bdev;
 	pmd->data_block_size = data_block_size;
@@ -880,7 +878,7 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
 		return -EBUSY;
 	}
 
-	if (!pmd->read_only && !pmd->fail_io) {
+	if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
 		r = __commit_transaction(pmd);
 		if (r < 0)
 			DMWARN("%s: __commit_transaction() failed, error = %d",
@@ -1739,7 +1737,6 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
 void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 {
 	down_write(&pmd->root_lock);
-	pmd->read_only = true;
 	dm_bm_set_read_only(pmd->bm);
 	up_write(&pmd->root_lock);
 }
@@ -1747,7 +1744,6 @@ void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
 void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
 {
 	down_write(&pmd->root_lock);
-	pmd->read_only = false;
 	dm_bm_set_read_write(pmd->bm);
 	up_write(&pmd->root_lock);
 }
diff --git a/drivers/md/persistent-data/dm-block-manager.c b/drivers/md/persistent-data/dm-block-manager.c
index 087411c95ffc..4d6c9b689eaa 100644
--- a/drivers/md/persistent-data/dm-block-manager.c
+++ b/drivers/md/persistent-data/dm-block-manager.c
@@ -609,6 +609,12 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b)
 	dm_bufio_prefetch(bm->bufio, b, 1);
 }
 
+bool dm_bm_is_read_only(struct dm_block_manager *bm)
+{
+	return bm->read_only;
+}
+EXPORT_SYMBOL_GPL(dm_bm_is_read_only);
+
 void dm_bm_set_read_only(struct dm_block_manager *bm)
 {
 	bm->read_only = true;
diff --git a/drivers/md/persistent-data/dm-block-manager.h b/drivers/md/persistent-data/dm-block-manager.h
index 1b95dfc17786..84330f59886d 100644
--- a/drivers/md/persistent-data/dm-block-manager.h
+++ b/drivers/md/persistent-data/dm-block-manager.h
@@ -123,6 +123,7 @@ void dm_bm_prefetch(struct dm_block_manager *bm, dm_block_t b);
  * Additionally you should not use dm_bm_unlock_move, however no error will
  * be returned if you do.
  */
+bool dm_bm_is_read_only(struct dm_block_manager *bm);
 void dm_bm_set_read_only(struct dm_block_manager *bm);
 void dm_bm_set_read_write(struct dm_block_manager *bm);
 
-- 
cgit v1.2.1


From 0f4106b32f36165a4f40b6aad0372e02ff14cf34 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 29 Apr 2015 14:03:07 +0200
Subject: dm raid: fixup documentation for discard support

Remove comment above parse_raid_params() that claims
"devices_handle_discard_safely" is a table line argument when it
actually is a module parameter.

Also, backfill dm-raid target version 1.6.0 documentation.

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 88e4c7f24986..423e42e9a1ad 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -477,8 +477,6 @@ too_many:
  *                                      will form the "stripe"
  *    [[no]sync]			Force or prevent recovery of the
  *                                      entire array
- *    [devices_handle_discard_safely]	Allow discards on RAID4/5/6; useful if RAID
- *					member device(s) properly support TRIM/UNMAP
  *    [rebuild <idx>]			Rebuild the drive indicated by the index
  *    [daemon_sleep <ms>]		Time between bitmap daemon work to
  *                                      clear bits
-- 
cgit v1.2.1


From c76d53f43ec4f9b9f200f031d303f21bdf6927d0 Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 29 Apr 2015 14:03:00 +0200
Subject: dm raid: a few cleanups

- ensure maximum device limit in superblock
- rename DMPF_* (print flags) to CTR_FLAG_* (constructor flags)
  and their respective struct raid_set member
- use strcasecmp() in raid10_format_to_md_layout() as in the constructor

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 91 ++++++++++++++++++++++++++--------------------------
 1 file changed, 46 insertions(+), 45 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 423e42e9a1ad..af49ddebaa62 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -17,6 +17,7 @@
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "raid"
+#define	MAX_RAID_DEVICES	253 /* raid4/5/6 limit */
 
 static bool devices_handle_discard_safely = false;
 
@@ -45,25 +46,25 @@ struct raid_dev {
 };
 
 /*
- * Flags for rs->print_flags field.
+ * Flags for rs->ctr_flags field.
  */
-#define DMPF_SYNC              0x1
-#define DMPF_NOSYNC            0x2
-#define DMPF_REBUILD           0x4
-#define DMPF_DAEMON_SLEEP      0x8
-#define DMPF_MIN_RECOVERY_RATE 0x10
-#define DMPF_MAX_RECOVERY_RATE 0x20
-#define DMPF_MAX_WRITE_BEHIND  0x40
-#define DMPF_STRIPE_CACHE      0x80
-#define DMPF_REGION_SIZE       0x100
-#define DMPF_RAID10_COPIES     0x200
-#define DMPF_RAID10_FORMAT     0x400
+#define CTR_FLAG_SYNC              0x1
+#define CTR_FLAG_NOSYNC            0x2
+#define CTR_FLAG_REBUILD           0x4
+#define CTR_FLAG_DAEMON_SLEEP      0x8
+#define CTR_FLAG_MIN_RECOVERY_RATE 0x10
+#define CTR_FLAG_MAX_RECOVERY_RATE 0x20
+#define CTR_FLAG_MAX_WRITE_BEHIND  0x40
+#define CTR_FLAG_STRIPE_CACHE      0x80
+#define CTR_FLAG_REGION_SIZE       0x100
+#define CTR_FLAG_RAID10_COPIES     0x200
+#define CTR_FLAG_RAID10_FORMAT     0x400
 
 struct raid_set {
 	struct dm_target *ti;
 
 	uint32_t bitmap_loaded;
-	uint32_t print_flags;
+	uint32_t ctr_flags;
 
 	struct mddev md;
 	struct raid_type *raid_type;
@@ -119,15 +120,15 @@ static int raid10_format_to_md_layout(char *format, unsigned copies)
 {
 	unsigned n = 1, f = 1;
 
-	if (!strcmp("near", format))
+	if (!strcasecmp("near", format))
 		n = copies;
 	else
 		f = copies;
 
-	if (!strcmp("offset", format))
+	if (!strcasecmp("offset", format))
 		return 0x30000 | (f << 8) | n;
 
-	if (!strcmp("far", format))
+	if (!strcasecmp("far", format))
 		return 0x20000 | (f << 8) | n;
 
 	return (f << 8) | n;
@@ -553,12 +554,12 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 	for (i = 0; i < num_raid_params; i++) {
 		if (!strcasecmp(argv[i], "nosync")) {
 			rs->md.recovery_cp = MaxSector;
-			rs->print_flags |= DMPF_NOSYNC;
+			rs->ctr_flags |= CTR_FLAG_NOSYNC;
 			continue;
 		}
 		if (!strcasecmp(argv[i], "sync")) {
 			rs->md.recovery_cp = 0;
-			rs->print_flags |= DMPF_SYNC;
+			rs->ctr_flags |= CTR_FLAG_SYNC;
 			continue;
 		}
 
@@ -583,7 +584,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				return -EINVAL;
 			}
 			raid10_format = argv[i];
-			rs->print_flags |= DMPF_RAID10_FORMAT;
+			rs->ctr_flags |= CTR_FLAG_RAID10_FORMAT;
 			continue;
 		}
 
@@ -600,7 +601,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			}
 			clear_bit(In_sync, &rs->dev[value].rdev.flags);
 			rs->dev[value].rdev.recovery_offset = 0;
-			rs->print_flags |= DMPF_REBUILD;
+			rs->ctr_flags |= CTR_FLAG_REBUILD;
 		} else if (!strcasecmp(key, "write_mostly")) {
 			if (rs->raid_type->level != 1) {
 				rs->ti->error = "write_mostly option is only valid for RAID1";
@@ -616,7 +617,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "max_write_behind option is only valid for RAID1";
 				return -EINVAL;
 			}
-			rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
+			rs->ctr_flags |= CTR_FLAG_MAX_WRITE_BEHIND;
 
 			/*
 			 * In device-mapper, we specify things in sectors, but
@@ -629,14 +630,14 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 			}
 			rs->md.bitmap_info.max_write_behind = value;
 		} else if (!strcasecmp(key, "daemon_sleep")) {
-			rs->print_flags |= DMPF_DAEMON_SLEEP;
+			rs->ctr_flags |= CTR_FLAG_DAEMON_SLEEP;
 			if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
 				rs->ti->error = "daemon sleep period out of range";
 				return -EINVAL;
 			}
 			rs->md.bitmap_info.daemon_sleep = value;
 		} else if (!strcasecmp(key, "stripe_cache")) {
-			rs->print_flags |= DMPF_STRIPE_CACHE;
+			rs->ctr_flags |= CTR_FLAG_STRIPE_CACHE;
 
 			/*
 			 * In device-mapper, we specify things in sectors, but
@@ -654,21 +655,21 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				return -EINVAL;
 			}
 		} else if (!strcasecmp(key, "min_recovery_rate")) {
-			rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
+			rs->ctr_flags |= CTR_FLAG_MIN_RECOVERY_RATE;
 			if (value > INT_MAX) {
 				rs->ti->error = "min_recovery_rate out of range";
 				return -EINVAL;
 			}
 			rs->md.sync_speed_min = (int)value;
 		} else if (!strcasecmp(key, "max_recovery_rate")) {
-			rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
+			rs->ctr_flags |= CTR_FLAG_MAX_RECOVERY_RATE;
 			if (value > INT_MAX) {
 				rs->ti->error = "max_recovery_rate out of range";
 				return -EINVAL;
 			}
 			rs->md.sync_speed_max = (int)value;
 		} else if (!strcasecmp(key, "region_size")) {
-			rs->print_flags |= DMPF_REGION_SIZE;
+			rs->ctr_flags |= CTR_FLAG_REGION_SIZE;
 			region_size = value;
 		} else if (!strcasecmp(key, "raid10_copies") &&
 			   (rs->raid_type->level == 10)) {
@@ -676,7 +677,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 				rs->ti->error = "Bad value for 'raid10_copies'";
 				return -EINVAL;
 			}
-			rs->print_flags |= DMPF_RAID10_COPIES;
+			rs->ctr_flags |= CTR_FLAG_RAID10_COPIES;
 			raid10_copies = value;
 		} else {
 			DMERR("Unable to parse RAID parameter: %s", key);
@@ -945,7 +946,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 		return -EINVAL;
 	}
 
-	if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
+	if (!(rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC)))
 		mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
 
 	/*
@@ -1071,7 +1072,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	freshest = NULL;
 	rdev_for_each_safe(rdev, tmp, mddev) {
 		/*
-		 * Skipping super_load due to DMPF_SYNC will cause
+		 * Skipping super_load due to CTR_FLAG_SYNC will cause
 		 * the array to undergo initialization again as
 		 * though it were new.  This is the intended effect
 		 * of the "sync" directive.
@@ -1080,7 +1081,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		 * that the "sync" directive is disallowed during the
 		 * reshape.
 		 */
-		if (rs->print_flags & DMPF_SYNC)
+		if (rs->ctr_flags & CTR_FLAG_SYNC)
 			continue;
 
 		if (!rdev->meta_bdev)
@@ -1241,7 +1242,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	}
 
 	if ((kstrtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
-	    (num_raid_devs >= INT_MAX)) {
+	    (num_raid_devs > MAX_RAID_DEVICES)) {
 		ti->error = "Cannot understand number of raid devices";
 		return -EINVAL;
 	}
@@ -1444,7 +1445,7 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_TABLE:
 		/* The string you would use to construct this array */
 		for (i = 0; i < rs->md.raid_disks; i++) {
-			if ((rs->print_flags & DMPF_REBUILD) &&
+			if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
 			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				raid_param_cnt += 2; /* for rebuilds */
@@ -1453,33 +1454,33 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 				raid_param_cnt += 2;
 		}
 
-		raid_param_cnt += (hweight32(rs->print_flags & ~DMPF_REBUILD) * 2);
-		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
+		raid_param_cnt += (hweight32(rs->ctr_flags & ~CTR_FLAG_REBUILD) * 2);
+		if (rs->ctr_flags & (CTR_FLAG_SYNC | CTR_FLAG_NOSYNC))
 			raid_param_cnt--;
 
 		DMEMIT("%s %u %u", rs->raid_type->name,
 		       raid_param_cnt, rs->md.chunk_sectors);
 
-		if ((rs->print_flags & DMPF_SYNC) &&
+		if ((rs->ctr_flags & CTR_FLAG_SYNC) &&
 		    (rs->md.recovery_cp == MaxSector))
 			DMEMIT(" sync");
-		if (rs->print_flags & DMPF_NOSYNC)
+		if (rs->ctr_flags & CTR_FLAG_NOSYNC)
 			DMEMIT(" nosync");
 
 		for (i = 0; i < rs->md.raid_disks; i++)
-			if ((rs->print_flags & DMPF_REBUILD) &&
+			if ((rs->ctr_flags & CTR_FLAG_REBUILD) &&
 			    rs->dev[i].data_dev &&
 			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
 				DMEMIT(" rebuild %u", i);
 
-		if (rs->print_flags & DMPF_DAEMON_SLEEP)
+		if (rs->ctr_flags & CTR_FLAG_DAEMON_SLEEP)
 			DMEMIT(" daemon_sleep %lu",
 			       rs->md.bitmap_info.daemon_sleep);
 
-		if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
+		if (rs->ctr_flags & CTR_FLAG_MIN_RECOVERY_RATE)
 			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);
 
-		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
+		if (rs->ctr_flags & CTR_FLAG_MAX_RECOVERY_RATE)
 			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);
 
 		for (i = 0; i < rs->md.raid_disks; i++)
@@ -1487,11 +1488,11 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
 				DMEMIT(" write_mostly %u", i);
 
-		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
+		if (rs->ctr_flags & CTR_FLAG_MAX_WRITE_BEHIND)
 			DMEMIT(" max_write_behind %lu",
 			       rs->md.bitmap_info.max_write_behind);
 
-		if (rs->print_flags & DMPF_STRIPE_CACHE) {
+		if (rs->ctr_flags & CTR_FLAG_STRIPE_CACHE) {
 			struct r5conf *conf = rs->md.private;
 
 			/* convert from kiB to sectors */
@@ -1499,15 +1500,15 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 			       conf ? conf->max_nr_stripes * 2 : 0);
 		}
 
-		if (rs->print_flags & DMPF_REGION_SIZE)
+		if (rs->ctr_flags & CTR_FLAG_REGION_SIZE)
 			DMEMIT(" region_size %lu",
 			       rs->md.bitmap_info.chunksize >> 9);
 
-		if (rs->print_flags & DMPF_RAID10_COPIES)
+		if (rs->ctr_flags & CTR_FLAG_RAID10_COPIES)
 			DMEMIT(" raid10_copies %u",
 			       raid10_md_layout_to_copies(rs->md.layout));
 
-		if (rs->print_flags & DMPF_RAID10_FORMAT)
+		if (rs->ctr_flags & CTR_FLAG_RAID10_FORMAT)
 			DMEMIT(" raid10_format %s",
 			       raid10_md_layout_to_format(rs->md.layout));
 
-- 
cgit v1.2.1


From 0cf4503174c12025ac7ea61048cb7c1d4d1ed85c Mon Sep 17 00:00:00 2001
From: Heinz Mauelshagen <heinzm@redhat.com>
Date: Wed, 29 Apr 2015 14:03:04 +0200
Subject: dm raid: add support for the MD RAID0 personality

Add dm-raid access to the MD RAID0 personality to enable single zone
striping.

The following changes enable that access:
- add type definition to raid_types array
- make bitmap creation conditional in super_validate(), because
  bitmaps are not allowed in raid0
- set rdev->sectors to the data image size in super_validate()
  to allow the raid0 personality to calculate the MD array
  size properly
- use mddev_(un)lock() functions instead of direct mutex_(un)lock()
  (wrapped in here because it's a trivial change)
- enhance raid_status() to always report full sync for raid0
  so that userspace checks for 100% sync will succeed and allow
  for resize (and takeover/reshape once added in future patches)
- enhance raid_resume() to not load bitmap in case of raid0
- add merge function to avoid data corruption (seen with readahead)
  that resulted from bio payloads that grew too large.  This problem
  did not occur with the other raid levels because it either did not
  apply without striping (raid1) or was avoided via stripe caching.
- raise version to 1.7.0 because of the raid0 API change

Signed-off-by: Heinz Mauelshagen <heinzm@redhat.com>
Reviewed-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid.c | 132 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 84 insertions(+), 48 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index af49ddebaa62..2daa67793511 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -1,6 +1,6 @@
 /*
  * Copyright (C) 2010-2011 Neil Brown
- * Copyright (C) 2010-2014 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2010-2015 Red Hat, Inc. All rights reserved.
  *
  * This file is released under the GPL.
  */
@@ -82,6 +82,7 @@ static struct raid_type {
 	const unsigned level;		/* RAID level. */
 	const unsigned algorithm;	/* RAID algorithm. */
 } raid_types[] = {
+	{"raid0",    "RAID0 (striping)",                0, 2, 0, 0 /* NONE */},
 	{"raid1",    "RAID1 (mirroring)",               0, 2, 1, 0 /* NONE */},
 	{"raid10",   "RAID10 (striped mirrors)",        0, 2, 10, UINT_MAX /* Varies */},
 	{"raid4",    "RAID4 (dedicated parity disk)",	1, 2, 5, ALGORITHM_PARITY_0},
@@ -719,7 +720,7 @@ static int parse_raid_params(struct raid_set *rs, char **argv,
 		rs->md.layout = raid10_format_to_md_layout(raid10_format,
 							   raid10_copies);
 		rs->md.new_layout = rs->md.layout;
-	} else if ((rs->raid_type->level > 1) &&
+	} else if ((!rs->raid_type->level || rs->raid_type->level > 1) &&
 		   sector_div(sectors_per_dev,
 			      (rs->md.raid_disks - rs->raid_type->parity_devs))) {
 		rs->ti->error = "Target length not divisible by number of data devices";
@@ -1025,8 +1026,9 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
 	return 0;
 }
 
-static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
+static int super_validate(struct raid_set *rs, struct md_rdev *rdev)
 {
+	struct mddev *mddev = &rs->md;
 	struct dm_raid_superblock *sb = page_address(rdev->sb_page);
 
 	/*
@@ -1036,8 +1038,10 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
 	if (!mddev->events && super_init_validation(mddev, rdev))
 		return -EINVAL;
 
-	mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
-	rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
+	/* Enable bitmap creation for RAID levels != 0 */
+	mddev->bitmap_info.offset = (rs->raid_type->level) ? to_sector(4096) : 0;
+	rdev->mddev->bitmap_info.default_offset = mddev->bitmap_info.offset;
+
 	if (!test_bit(FirstUse, &rdev->flags)) {
 		rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
 		if (rdev->recovery_offset != MaxSector)
@@ -1081,6 +1085,8 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 		 * that the "sync" directive is disallowed during the
 		 * reshape.
 		 */
+		rdev->sectors = to_sector(i_size_read(rdev->bdev->bd_inode));
+
 		if (rs->ctr_flags & CTR_FLAG_SYNC)
 			continue;
 
@@ -1139,11 +1145,11 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
 	 * validation for the remaining devices.
 	 */
 	ti->error = "Unable to assemble array: Invalid superblocks";
-	if (super_validate(mddev, freshest))
+	if (super_validate(rs, freshest))
 		return -EINVAL;
 
 	rdev_for_each(rdev, mddev)
-		if ((rdev != freshest) && super_validate(mddev, rdev))
+		if ((rdev != freshest) && super_validate(rs, rdev))
 			return -EINVAL;
 
 	return 0;
@@ -1281,10 +1287,11 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	 */
 	configure_discard_support(ti, rs);
 
-	mutex_lock(&rs->md.reconfig_mutex);
+	/* Has to be held on running the array */
+	mddev_lock_nointr(&rs->md);
 	ret = md_run(&rs->md);
 	rs->md.in_sync = 0; /* Assume already marked dirty */
-	mutex_unlock(&rs->md.reconfig_mutex);
+	mddev_unlock(&rs->md);
 
 	if (ret) {
 		ti->error = "Fail to run raid array";
@@ -1367,34 +1374,40 @@ static void raid_status(struct dm_target *ti, status_type_t type,
 	case STATUSTYPE_INFO:
 		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);
 
-		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
-			sync = rs->md.curr_resync_completed;
-		else
-			sync = rs->md.recovery_cp;
-
-		if (sync >= rs->md.resync_max_sectors) {
-			/*
-			 * Sync complete.
-			 */
+		if (rs->raid_type->level) {
+			if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
+				sync = rs->md.curr_resync_completed;
+			else
+				sync = rs->md.recovery_cp;
+
+			if (sync >= rs->md.resync_max_sectors) {
+				/*
+				 * Sync complete.
+				 */
+				array_in_sync = 1;
+				sync = rs->md.resync_max_sectors;
+			} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
+				/*
+				 * If "check" or "repair" is occurring, the array has
+				 * undergone an initial sync and the health characters
+				 * should not be 'a' anymore.
+				 */
+				array_in_sync = 1;
+			} else {
+				/*
+				 * The array may be doing an initial sync, or it may
+				 * be rebuilding individual components.  If all the
+				 * devices are In_sync, then it is the array that is
+				 * being initialized.
+				 */
+				for (i = 0; i < rs->md.raid_disks; i++)
+					if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
+						array_in_sync = 1;
+			}
+		} else {
+			/* RAID0 */
 			array_in_sync = 1;
 			sync = rs->md.resync_max_sectors;
-		} else if (test_bit(MD_RECOVERY_REQUESTED, &rs->md.recovery)) {
-			/*
-			 * If "check" or "repair" is occurring, the array has
-			 * undergone and initial sync and the health characters
-			 * should not be 'a' anymore.
-			 */
-			array_in_sync = 1;
-		} else {
-			/*
-			 * The array may be doing an initial sync, or it may
-			 * be rebuilding individual components.  If all the
-			 * devices are In_sync, then it is the array that is
-			 * being initialized.
-			 */
-			for (i = 0; i < rs->md.raid_disks; i++)
-				if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
-					array_in_sync = 1;
 		}
 
 		/*
@@ -1683,26 +1696,48 @@ static void raid_resume(struct dm_target *ti)
 {
 	struct raid_set *rs = ti->private;
 
-	set_bit(MD_CHANGE_DEVS, &rs->md.flags);
-	if (!rs->bitmap_loaded) {
-		bitmap_load(&rs->md);
-		rs->bitmap_loaded = 1;
-	} else {
-		/*
-		 * A secondary resume while the device is active.
-		 * Take this opportunity to check whether any failed
-		 * devices are reachable again.
-		 */
-		attempt_restore_of_faulty_devices(rs);
+	if (rs->raid_type->level) {
+		set_bit(MD_CHANGE_DEVS, &rs->md.flags);
+
+		if (!rs->bitmap_loaded) {
+			bitmap_load(&rs->md);
+			rs->bitmap_loaded = 1;
+		} else {
+			/*
+			 * A secondary resume while the device is active.
+			 * Take this opportunity to check whether any failed
+			 * devices are reachable again.
+			 */
+			attempt_restore_of_faulty_devices(rs);
+		}
+
+		clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 	}
 
-	clear_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
 	mddev_resume(&rs->md);
 }
 
+static int raid_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+		      struct bio_vec *biovec, int max_size)
+{
+	struct raid_set *rs = ti->private;
+	struct md_personality *pers = rs->md.pers;
+
+	if (pers && pers->mergeable_bvec)
+		return min(max_size, pers->mergeable_bvec(&rs->md, bvm, biovec));
+
+	/*
+	 * In case we can't request the personality because
+	 * the raid set is not running yet
+	 *
+	 * -> return safe minimum
+	 */
+	return rs->md.chunk_sectors;
+}
+
 static struct target_type raid_target = {
 	.name = "raid",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module = THIS_MODULE,
 	.ctr = raid_ctr,
 	.dtr = raid_dtr,
@@ -1714,6 +1749,7 @@ static struct target_type raid_target = {
 	.presuspend = raid_presuspend,
 	.postsuspend = raid_postsuspend,
 	.resume = raid_resume,
+	.merge = raid_merge,
 };
 
 static int __init dm_raid_init(void)
-- 
cgit v1.2.1


From e223e1de4f8a586662c4917f6f673126574960dd Mon Sep 17 00:00:00 2001
From: Luis Henriques <luis.henriques@canonical.com>
Date: Mon, 27 Apr 2015 21:29:36 +0100
Subject: dm stripe: drop useless exit point from dm_stripe_init()

Signed-off-by: Luis Henriques <luis.henriques@canonical.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-stripe.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index f8b37d4c05d8..a672a1502c14 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -451,10 +451,8 @@ int __init dm_stripe_init(void)
 	int r;
 
 	r = dm_register_target(&stripe_target);
-	if (r < 0) {
+	if (r < 0)
 		DMWARN("target registration failed");
-		return r;
-	}
 
 	return r;
 }
-- 
cgit v1.2.1


From f4ad317aedf836e7bb67108a119bfec7f3d97ef1 Mon Sep 17 00:00:00 2001
From: Geert Uytterhoeven <geert@linux-m68k.org>
Date: Sun, 19 Apr 2015 00:07:30 +0200
Subject: dm log writes: use ULL suffix for 64-bit constants
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On 32-bit:
drivers/md/dm-log-writes.c: In function ‘log_super’:
drivers/md/dm-log-writes.c:323: warning: integer constant is too large for ‘long’ type

Add a ULL suffix to WRITE_LOG_MAGIC to fix this.
Also add a ULL suffix to WRITE_LOG_VERSION as it's stored in a __le64
field.

Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-log-writes.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 93e08446a87d..ad1b049ae2ab 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -55,8 +55,8 @@
 #define LOG_DISCARD_FLAG (1 << 2)
 #define LOG_MARK_FLAG (1 << 3)
 
-#define WRITE_LOG_VERSION 1
-#define WRITE_LOG_MAGIC 0x6a736677736872
+#define WRITE_LOG_VERSION 1ULL
+#define WRITE_LOG_MAGIC 0x6a736677736872ULL
 
 /*
  * The disk format for this is braindead simple.
-- 
cgit v1.2.1


From ed63287dd670f8e9d2412a913de7fdc50a689831 Mon Sep 17 00:00:00 2001
From: Lidong Zhong <lzhong@suse.com>
Date: Wed, 13 May 2015 14:04:10 +0800
Subject: dm raid1: keep issuing IO after leg failure

Currently if there is a leg failure, the bio will be put into the hold
list until userspace does a remove/replace on the leg.  Doing so in a
cluster config (clvmd) is problematic because there may be a temporary
path failure that results in cluster raid1 remove/replace.  Such
recovery takes a long time due to a full resync.

Update dm-raid1 to optionally ignore these failures so bios continue
being issued without interruption.  To enable this feature userspace
must pass "keep_log" when creating the dm-raid1 device.

Signed-off-by: Lidong Zhong <lzhong@suse.com>
Tested-by: Liuhua Wang <lwang@suse.com>
Acked-by: Heinz Mauelshagen <heinzm@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-raid1.c | 75 +++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 58 insertions(+), 17 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 743fa9bbae9e..d83696bf403b 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -23,8 +23,10 @@
 
 #define MAX_RECOVERY 1	/* Maximum number of regions recovered in parallel. */
 
-#define DM_RAID1_HANDLE_ERRORS 0x01
+#define DM_RAID1_HANDLE_ERRORS	0x01
+#define DM_RAID1_KEEP_LOG	0x02
 #define errors_handled(p)	((p)->features & DM_RAID1_HANDLE_ERRORS)
+#define keep_log(p)		((p)->features & DM_RAID1_KEEP_LOG)
 
 static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
 
@@ -229,7 +231,7 @@ static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
 	if (m != get_default_mirror(ms))
 		goto out;
 
-	if (!ms->in_sync) {
+	if (!ms->in_sync && !keep_log(ms)) {
 		/*
 		 * Better to issue requests to same failing device
 		 * than to risk returning corrupt data.
@@ -370,6 +372,17 @@ static int recover(struct mirror_set *ms, struct dm_region *reg)
 	return r;
 }
 
+static void reset_ms_flags(struct mirror_set *ms)
+{
+	unsigned int m;
+
+	ms->leg_failure = 0;
+	for (m = 0; m < ms->nr_mirrors; m++) {
+		atomic_set(&(ms->mirror[m].error_count), 0);
+		ms->mirror[m].error_type = 0;
+	}
+}
+
 static void do_recovery(struct mirror_set *ms)
 {
 	struct dm_region *reg;
@@ -398,6 +411,7 @@ static void do_recovery(struct mirror_set *ms)
 		/* the sync is complete */
 		dm_table_event(ms->ti->table);
 		ms->in_sync = 1;
+		reset_ms_flags(ms);
 	}
 }
 
@@ -759,7 +773,7 @@ static void do_writes(struct mirror_set *ms, struct bio_list *writes)
 		dm_rh_delay(ms->rh, bio);
 
 	while ((bio = bio_list_pop(&nosync))) {
-		if (unlikely(ms->leg_failure) && errors_handled(ms)) {
+		if (unlikely(ms->leg_failure) && errors_handled(ms) && !keep_log(ms)) {
 			spin_lock_irq(&ms->lock);
 			bio_list_add(&ms->failures, bio);
 			spin_unlock_irq(&ms->lock);
@@ -803,15 +817,21 @@ static void do_failures(struct mirror_set *ms, struct bio_list *failures)
 
 		/*
 		 * If all the legs are dead, fail the I/O.
-		 * If we have been told to handle errors, hold the bio
-		 * and wait for userspace to deal with the problem.
+		 * If the device has failed and keep_log is enabled,
+		 * fail the I/O.
+		 *
+		 * If we have been told to handle errors, and keep_log
+		 * isn't enabled, hold the bio and wait for userspace to
+		 * deal with the problem.
+		 *
 		 * Otherwise pretend that the I/O succeeded. (This would
 		 * be wrong if the failed leg returned after reboot and
 		 * got replicated back to the good legs.)
 		 */
-		if (!get_valid_mirror(ms))
+
+		if (unlikely(!get_valid_mirror(ms) || (keep_log(ms) && ms->log_failure)))
 			bio_endio(bio, -EIO);
-		else if (errors_handled(ms))
+		else if (errors_handled(ms) && !keep_log(ms))
 			hold_bio(ms, bio);
 		else
 			bio_endio(bio, 0);
@@ -987,6 +1007,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
 	unsigned num_features;
 	struct dm_target *ti = ms->ti;
 	char dummy;
+	int i;
 
 	*args_used = 0;
 
@@ -1007,15 +1028,25 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
 		return -EINVAL;
 	}
 
-	if (!strcmp("handle_errors", argv[0]))
-		ms->features |= DM_RAID1_HANDLE_ERRORS;
-	else {
-		ti->error = "Unrecognised feature requested";
+	for (i = 0; i < num_features; i++) {
+		if (!strcmp("handle_errors", argv[0]))
+			ms->features |= DM_RAID1_HANDLE_ERRORS;
+		else if (!strcmp("keep_log", argv[0]))
+			ms->features |= DM_RAID1_KEEP_LOG;
+		else {
+			ti->error = "Unrecognised feature requested";
+			return -EINVAL;
+		}
+
+		argc--;
+		argv++;
+		(*args_used)++;
+	}
+	if (!errors_handled(ms) && keep_log(ms)) {
+		ti->error = "keep_log feature requires the handle_errors feature";
 		return -EINVAL;
 	}
 
-	(*args_used)++;
-
 	return 0;
 }
 
@@ -1029,7 +1060,7 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
  * log_type is "core" or "disk"
  * #log_params is between 1 and 3
  *
- * If present, features must be "handle_errors".
+ * If present, supported features are "handle_errors" and "keep_log".
  */
 static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
@@ -1363,6 +1394,7 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
 			  unsigned status_flags, char *result, unsigned maxlen)
 {
 	unsigned int m, sz = 0;
+	int num_feature_args = 0;
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
 	struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
 	char buffer[ms->nr_mirrors + 1];
@@ -1392,8 +1424,17 @@ static void mirror_status(struct dm_target *ti, status_type_t type,
 			DMEMIT(" %s %llu", ms->mirror[m].dev->name,
 			       (unsigned long long)ms->mirror[m].offset);
 
-		if (ms->features & DM_RAID1_HANDLE_ERRORS)
-			DMEMIT(" 1 handle_errors");
+		num_feature_args += !!errors_handled(ms);
+		num_feature_args += !!keep_log(ms);
+		if (num_feature_args) {
+			DMEMIT(" %d", num_feature_args);
+			if (errors_handled(ms))
+				DMEMIT(" handle_errors");
+			if (keep_log(ms))
+				DMEMIT(" keep_log");
+		}
+
+		break;
 	}
 }
 
@@ -1413,7 +1454,7 @@ static int mirror_iterate_devices(struct dm_target *ti,
 
 static struct target_type mirror_target = {
 	.name	 = "mirror",
-	.version = {1, 13, 2},
+	.version = {1, 14, 0},
 	.module	 = THIS_MODULE,
 	.ctr	 = mirror_ctr,
 	.dtr	 = mirror_dtr,
-- 
cgit v1.2.1


From 54cea3f6681ad9360814e2926d1f723bbd0f74ed Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Fri, 15 May 2015 17:00:25 +0200
Subject: dm crypt: add comments to better describe crypto processing logic

A crypto driver can process requests synchronously or asynchronously
and can use an internal driver queue to backlog requests.
Add some comments to clarify internal logic and completion return codes.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-crypt.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 5503e43e5f28..0f48fed44a17 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1,7 +1,7 @@
 /*
  * Copyright (C) 2003 Jana Saout <jana@saout.de>
  * Copyright (C) 2004 Clemens Fruhwirth <clemens@endorphin.org>
- * Copyright (C) 2006-2009 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2006-2015 Red Hat, Inc. All rights reserved.
  * Copyright (C) 2013 Milan Broz <gmazyland@gmail.com>
  *
  * This file is released under the GPL.
@@ -891,6 +891,11 @@ static void crypt_alloc_req(struct crypt_config *cc,
 		ctx->req = mempool_alloc(cc->req_pool, GFP_NOIO);
 
 	ablkcipher_request_set_tfm(ctx->req, cc->tfms[key_index]);
+
+	/*
+	 * Use REQ_MAY_BACKLOG so a cipher driver internally backlogs
+	 * requests if driver request queue is full.
+	 */
 	ablkcipher_request_set_callback(ctx->req,
 	    CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
 	    kcryptd_async_done, dmreq_of_req(cc, ctx->req));
@@ -924,24 +929,32 @@ static int crypt_convert(struct crypt_config *cc,
 		r = crypt_convert_block(cc, ctx, ctx->req);
 
 		switch (r) {
-		/* async */
+		/*
+		 * The request was queued by a crypto driver
+		 * but the driver request queue is full, let's wait.
+		 */
 		case -EBUSY:
 			wait_for_completion(&ctx->restart);
 			reinit_completion(&ctx->restart);
-			/* fall through*/
+			/* fall through */
+		/*
+		 * The request is queued and processed asynchronously,
+		 * completion function kcryptd_async_done() will be called.
+		 */
 		case -EINPROGRESS:
 			ctx->req = NULL;
 			ctx->cc_sector++;
 			continue;
-
-		/* sync */
+		/*
+		 * The request was already processed (synchronously).
+		 */
 		case 0:
 			atomic_dec(&ctx->cc_pending);
 			ctx->cc_sector++;
 			cond_resched();
 			continue;
 
-		/* error */
+		/* There was an error while processing the request. */
 		default:
 			atomic_dec(&ctx->cc_pending);
 			return r;
@@ -1346,6 +1359,11 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
 	struct dm_crypt_io *io = container_of(ctx, struct dm_crypt_io, ctx);
 	struct crypt_config *cc = io->cc;
 
+	/*
+	 * A request from crypto driver backlog is going to be processed now,
+	 * finish the completion and continue in crypt_convert().
+	 * (Callback will be called for the second time for this request.)
+	 */
 	if (error == -EINPROGRESS) {
 		complete(&ctx->restart);
 		return;
-- 
cgit v1.2.1


From fb4100ae7f312c3d614b37621c2b17b3b7cf65f8 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 20 May 2015 10:30:32 +0100
Subject: dm cache: fix race when issuing a POLICY_REPLACE operation

There is a race between a policy deciding to replace a cache entry,
the core target writing back any dirty data from this block, and other
IO threads doing IO to the same block.

This sort of problem is avoided most of the time by the core target
grabbing a bio prison cell before making the request to the policy.
But for a demotion the core target doesn't know which block will be
demoted, so can't do this in advance.

Fix this demotion race by introducing a callback to the policy interface
that allows the policy to grab the cell on behalf of the core target.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/dm-cache-policy-cleaner.c  |  3 +-
 drivers/md/dm-cache-policy-internal.h |  5 +--
 drivers/md/dm-cache-policy-mq.c       | 41 +++++++++++++++++--------
 drivers/md/dm-cache-policy.h          | 15 ++++++++-
 drivers/md/dm-cache-target.c          | 58 +++++++++++++++++++++++------------
 5 files changed, 85 insertions(+), 37 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index b04d1f904d07..004e463c9423 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -171,7 +171,8 @@ static void remove_cache_hash_entry(struct wb_cache_entry *e)
 /* Public interface (see dm-cache-policy.h */
 static int wb_map(struct dm_cache_policy *pe, dm_oblock_t oblock,
 		  bool can_block, bool can_migrate, bool discarded_oblock,
-		  struct bio *bio, struct policy_result *result)
+		  struct bio *bio, struct policy_locker *locker,
+		  struct policy_result *result)
 {
 	struct policy *p = to_policy(pe);
 	struct wb_cache_entry *e;
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 2256a1f24f73..c198e6defb9c 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -16,9 +16,10 @@
  */
 static inline int policy_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 			     bool can_block, bool can_migrate, bool discarded_oblock,
-			     struct bio *bio, struct policy_result *result)
+			     struct bio *bio, struct policy_locker *locker,
+			     struct policy_result *result)
 {
-	return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, result);
+	return p->map(p, oblock, can_block, can_migrate, discarded_oblock, bio, locker, result);
 }
 
 static inline int policy_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 3ddd1162334d..515d44bf24d3 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -693,9 +693,10 @@ static void requeue(struct mq_policy *mq, struct entry *e)
  * - set the hit count to a hard coded value other than 1, eg, is it better
  *   if it goes in at level 2?
  */
-static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
+static int demote_cblock(struct mq_policy *mq,
+			 struct policy_locker *locker, dm_oblock_t *oblock)
 {
-	struct entry *demoted = pop(mq, &mq->cache_clean);
+	struct entry *demoted = peek(&mq->cache_clean);
 
 	if (!demoted)
 		/*
@@ -707,6 +708,13 @@ static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
 		 */
 		return -ENOSPC;
 
+	if (locker->fn(locker, demoted->oblock))
+		/*
+		 * We couldn't lock the demoted block.
+		 */
+		return -EBUSY;
+
+	del(mq, demoted);
 	*oblock = demoted->oblock;
 	free_entry(&mq->cache_pool, demoted);
 
@@ -795,6 +803,7 @@ static int cache_entry_found(struct mq_policy *mq,
  * finding which cache block to use.
  */
 static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
+			      struct policy_locker *locker,
 			      struct policy_result *result)
 {
 	int r;
@@ -803,11 +812,12 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 	/* Ensure there's a free cblock in the cache */
 	if (epool_empty(&mq->cache_pool)) {
 		result->op = POLICY_REPLACE;
-		r = demote_cblock(mq, &result->old_oblock);
+		r = demote_cblock(mq, locker, &result->old_oblock);
 		if (r) {
 			result->op = POLICY_MISS;
 			return 0;
 		}
+
 	} else
 		result->op = POLICY_NEW;
 
@@ -829,7 +839,8 @@ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 
 static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 				 bool can_migrate, bool discarded_oblock,
-				 int data_dir, struct policy_result *result)
+				 int data_dir, struct policy_locker *locker,
+				 struct policy_result *result)
 {
 	int r = 0;
 
@@ -842,7 +853,7 @@ static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 
 	else {
 		requeue(mq, e);
-		r = pre_cache_to_cache(mq, e, result);
+		r = pre_cache_to_cache(mq, e, locker, result);
 	}
 
 	return r;
@@ -872,6 +883,7 @@ static void insert_in_pre_cache(struct mq_policy *mq,
 }
 
 static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
+			    struct policy_locker *locker,
 			    struct policy_result *result)
 {
 	int r;
@@ -879,7 +891,7 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 
 	if (epool_empty(&mq->cache_pool)) {
 		result->op = POLICY_REPLACE;
-		r = demote_cblock(mq, &result->old_oblock);
+		r = demote_cblock(mq, locker, &result->old_oblock);
 		if (unlikely(r)) {
 			result->op = POLICY_MISS;
 			insert_in_pre_cache(mq, oblock);
@@ -907,11 +919,12 @@ static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 
 static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
 			  bool can_migrate, bool discarded_oblock,
-			  int data_dir, struct policy_result *result)
+			  int data_dir, struct policy_locker *locker,
+			  struct policy_result *result)
 {
 	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
 		if (can_migrate)
-			insert_in_cache(mq, oblock, result);
+			insert_in_cache(mq, oblock, locker, result);
 		else
 			return -EWOULDBLOCK;
 	} else {
@@ -928,7 +941,8 @@ static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
  */
 static int map(struct mq_policy *mq, dm_oblock_t oblock,
 	       bool can_migrate, bool discarded_oblock,
-	       int data_dir, struct policy_result *result)
+	       int data_dir, struct policy_locker *locker,
+	       struct policy_result *result)
 {
 	int r = 0;
 	struct entry *e = hash_lookup(mq, oblock);
@@ -942,11 +956,11 @@ static int map(struct mq_policy *mq, dm_oblock_t oblock,
 
 	else if (e)
 		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
-					  data_dir, result);
+					  data_dir, locker, result);
 
 	else
 		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
-				   data_dir, result);
+				   data_dir, locker, result);
 
 	if (r == -EWOULDBLOCK)
 		result->op = POLICY_MISS;
@@ -1012,7 +1026,8 @@ static void copy_tick(struct mq_policy *mq)
 
 static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 		  bool can_block, bool can_migrate, bool discarded_oblock,
-		  struct bio *bio, struct policy_result *result)
+		  struct bio *bio, struct policy_locker *locker,
+		  struct policy_result *result)
 {
 	int r;
 	struct mq_policy *mq = to_mq_policy(p);
@@ -1028,7 +1043,7 @@ static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 
 	iot_examine_bio(&mq->tracker, bio);
 	r = map(mq, oblock, can_migrate, discarded_oblock,
-		bio_data_dir(bio), result);
+		bio_data_dir(bio), locker, result);
 
 	mutex_unlock(&mq->lock);
 
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index f50fe360c546..5524e21e4836 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -69,6 +69,18 @@ enum policy_operation {
 	POLICY_REPLACE
 };
 
+/*
+ * When issuing a POLICY_REPLACE the policy needs to make a callback to
+ * lock the block being demoted.  This doesn't need to occur during a
+ * writeback operation since the block remains in the cache.
+ */
+struct policy_locker;
+typedef int (*policy_lock_fn)(struct policy_locker *l, dm_oblock_t oblock);
+
+struct policy_locker {
+	policy_lock_fn fn;
+};
+
 /*
  * This is the instruction passed back to the core target.
  */
@@ -122,7 +134,8 @@ struct dm_cache_policy {
 	 */
 	int (*map)(struct dm_cache_policy *p, dm_oblock_t oblock,
 		   bool can_block, bool can_migrate, bool discarded_oblock,
-		   struct bio *bio, struct policy_result *result);
+		   struct bio *bio, struct policy_locker *locker,
+		   struct policy_result *result);
 
 	/*
 	 * Sometimes we want to see if a block is in the cache, without
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 41b2594a80c6..d5982480630b 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -1439,16 +1439,43 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
 		   &cache->stats.read_miss : &cache->stats.write_miss);
 }
 
+/*----------------------------------------------------------------*/
+
+struct old_oblock_lock {
+	struct policy_locker locker;
+	struct cache *cache;
+	struct prealloc *structs;
+	struct dm_bio_prison_cell *cell;
+};
+
+static int null_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+	/* This should never be called */
+	BUG();
+	return 0;
+}
+
+static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
+{
+	struct old_oblock_lock *l = container_of(locker, struct old_oblock_lock, locker);
+	struct dm_bio_prison_cell *cell_prealloc = prealloc_get_cell(l->structs);
+
+	return bio_detain(l->cache, b, NULL, cell_prealloc,
+			  (cell_free_fn) prealloc_put_cell,
+			  l->structs, &l->cell);
+}
+
 static void process_bio(struct cache *cache, struct prealloc *structs,
 			struct bio *bio)
 {
 	int r;
 	bool release_cell = true;
 	dm_oblock_t block = get_bio_block(cache, bio);
-	struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
+	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
 	struct policy_result lookup_result;
 	bool passthrough = passthrough_mode(&cache->features);
 	bool discarded_block, can_migrate;
+	struct old_oblock_lock ool;
 
 	/*
 	 * Check to see if that block is currently migrating.
@@ -1463,8 +1490,12 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 	discarded_block = is_discarded_oblock(cache, block);
 	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
 
+	ool.locker.fn = cell_locker;
+	ool.cache = cache;
+	ool.structs = structs;
+	ool.cell = NULL;
 	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
-		       bio, &lookup_result);
+		       bio, &ool.locker, &lookup_result);
 
 	if (r == -EWOULDBLOCK)
 		/* migration has been denied */
@@ -1521,27 +1552,11 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 		break;
 
 	case POLICY_REPLACE:
-		cell_prealloc = prealloc_get_cell(structs);
-		r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
-			       (cell_free_fn) prealloc_put_cell,
-			       structs, &old_ocell);
-		if (r > 0) {
-			/*
-			 * We have to be careful to avoid lock inversion of
-			 * the cells.  So we back off, and wait for the
-			 * old_ocell to become free.
-			 */
-			policy_force_mapping(cache->policy, block,
-					     lookup_result.old_oblock);
-			atomic_inc(&cache->stats.cache_cell_clash);
-			break;
-		}
 		atomic_inc(&cache->stats.demotion);
 		atomic_inc(&cache->stats.promotion);
-
 		demote_then_promote(cache, structs, lookup_result.old_oblock,
 				    block, lookup_result.cblock,
-				    old_ocell, new_ocell);
+				    ool.cell, new_ocell);
 		release_cell = false;
 		break;
 
@@ -2589,6 +2604,9 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	bool discarded_block;
 	struct policy_result lookup_result;
 	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
+	struct old_oblock_lock ool;
+
+	ool.locker.fn = null_locker;
 
 	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
 		/*
@@ -2627,7 +2645,7 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	discarded_block = is_discarded_oblock(cache, block);
 
 	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
-		       bio, &lookup_result);
+		       bio, &ool.locker, &lookup_result);
 	if (r == -EWOULDBLOCK) {
 		cell_defer(cache, *cell, true);
 		return DM_MAPIO_SUBMITTED;
-- 
cgit v1.2.1


From 77289d32073c4eac57fcca2abe6caefc6f3dc7d6 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 13:45:30 +0100
Subject: dm cache: add io_tracker

A little class that keeps track of the volume of io that is in flight,
and the length of time that a device has been idle for.

FIXME: rather than jiffies, it may be best to use ktime_t (to support faster
devices).

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 73 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index d5982480630b..6f9bdd1bf7c4 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -25,6 +25,79 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
 
 /*----------------------------------------------------------------*/
 
+#define IOT_RESOLUTION 4
+
+struct io_tracker {
+	spinlock_t lock;
+
+	/*
+	 * Sectors of in-flight IO.
+	 */
+	sector_t in_flight;
+
+	/*
+	 * The time, in jiffies, when this device became idle (if it is
+	 * indeed idle).
+	 */
+	unsigned long idle_time;
+	unsigned long last_update_time;
+};
+
+static void iot_init(struct io_tracker *iot)
+{
+	spin_lock_init(&iot->lock);
+	iot->in_flight = 0ul;
+	iot->idle_time = 0ul;
+	iot->last_update_time = jiffies;
+}
+
+static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
+{
+	if (iot->in_flight)
+		return false;
+
+	return time_after(jiffies, iot->idle_time + jifs);
+}
+
+static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
+{
+	bool r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iot->lock, flags);
+	r = __iot_idle_for(iot, jifs);
+	spin_unlock_irqrestore(&iot->lock, flags);
+
+	return r;
+}
+
+static void iot_io_begin(struct io_tracker *iot, sector_t len)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&iot->lock, flags);
+	iot->in_flight += len;
+	spin_unlock_irqrestore(&iot->lock, flags);
+}
+
+static void __iot_io_end(struct io_tracker *iot, sector_t len)
+{
+	iot->in_flight -= len;
+	if (!iot->in_flight)
+		iot->idle_time = jiffies;
+}
+
+static void iot_io_end(struct io_tracker *iot, sector_t len)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&iot->lock, flags);
+	__iot_io_end(iot, len);
+	spin_unlock_irqrestore(&iot->lock, flags);
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * Glossary:
  *
-- 
cgit v1.2.1


From 066dbaa386c751164c39ab025e5e8803b4a4d691 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:18:01 +0100
Subject: dm cache: track IO to the origin device using io_tracker

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 56 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 49 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 6f9bdd1bf7c4..940c7b2b5ab4 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -355,6 +355,8 @@ struct cache {
 	 */
 	spinlock_t invalidation_lock;
 	struct list_head invalidation_requests;
+
+	struct io_tracker origin_tracker;
 };
 
 struct per_bio_data {
@@ -362,6 +364,7 @@ struct per_bio_data {
 	unsigned req_nr:2;
 	struct dm_deferred_entry *all_io_entry;
 	struct dm_hook_info hook_info;
+	sector_t len;
 
 	/*
 	 * writethrough fields.  These MUST remain at the end of this
@@ -768,6 +771,7 @@ static struct per_bio_data *init_per_bio_data(struct bio *bio, size_t data_size)
 	pb->tick = false;
 	pb->req_nr = dm_bio_get_target_bio_nr(bio);
 	pb->all_io_entry = NULL;
+	pb->len = 0;
 
 	return pb;
 }
@@ -865,12 +869,43 @@ static void inc_ds(struct cache *cache, struct bio *bio,
 	pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
 }
 
+static bool accountable_bio(struct cache *cache, struct bio *bio)
+{
+	return ((bio->bi_bdev == cache->origin_dev->bdev) &&
+		!(bio->bi_rw & REQ_DISCARD));
+}
+
+static void accounted_begin(struct cache *cache, struct bio *bio)
+{
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	if (accountable_bio(cache, bio)) {
+		pb->len = bio_sectors(bio);
+		iot_io_begin(&cache->origin_tracker, pb->len);
+	}
+}
+
+static void accounted_complete(struct cache *cache, struct bio *bio)
+{
+	size_t pb_data_size = get_per_bio_data_size(cache);
+	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
+
+	iot_io_end(&cache->origin_tracker, pb->len);
+}
+
+static void accounted_request(struct cache *cache, struct bio *bio)
+{
+	accounted_begin(cache, bio);
+	generic_make_request(bio);
+}
+
 static void issue(struct cache *cache, struct bio *bio)
 {
 	unsigned long flags;
 
 	if (!bio_triggers_commit(cache, bio)) {
-		generic_make_request(bio);
+		accounted_request(cache, bio);
 		return;
 	}
 
@@ -1166,7 +1201,7 @@ static void issue_overwrite(struct dm_cache_migration *mg, struct bio *bio)
 	 * No need to inc_ds() here, since the cell will be held for the
 	 * duration of the io.
 	 */
-	generic_make_request(bio);
+	accounted_request(mg->cache, bio);
 }
 
 static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
@@ -1722,7 +1757,7 @@ static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
 	 * These bios have already been through inc_ds()
 	 */
 	while ((bio = bio_list_pop(&bios)))
-		submit_bios ? generic_make_request(bio) : bio_io_error(bio);
+		submit_bios ? accounted_request(cache, bio) : bio_io_error(bio);
 }
 
 static void process_deferred_writethrough_bios(struct cache *cache)
@@ -1742,7 +1777,7 @@ static void process_deferred_writethrough_bios(struct cache *cache)
 	 * These bios have already been through inc_ds()
 	 */
 	while ((bio = bio_list_pop(&bios)))
-		generic_make_request(bio);
+		accounted_request(cache, bio);
 }
 
 static void writeback_some_dirty_blocks(struct cache *cache)
@@ -2602,6 +2637,8 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	spin_lock_init(&cache->invalidation_lock);
 	INIT_LIST_HEAD(&cache->invalidation_requests);
 
+	iot_init(&cache->origin_tracker);
+
 	*result = cache;
 	return 0;
 
@@ -2791,9 +2828,13 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	struct cache *cache = ti->private;
 
 	r = __cache_map(cache, bio, &cell);
-	if (r == DM_MAPIO_REMAPPED && cell) {
-		inc_ds(cache, bio, cell);
-		cell_defer(cache, cell, false);
+	if (r == DM_MAPIO_REMAPPED) {
+		accounted_begin(cache, bio);
+
+		if (cell) {
+			inc_ds(cache, bio, cell);
+			cell_defer(cache, cell, false);
+		}
 	}
 
 	return r;
@@ -2815,6 +2856,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 	}
 
 	check_for_quiesced_migrations(cache, pb);
+	accounted_complete(cache, bio);
 
 	return 0;
 }
-- 
cgit v1.2.1


From 20f6814b94fff4a98b123f1c2b691e936be27aaf Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:20:09 +0100
Subject: dm cache: pass a new 'critical' flag to the policies when requesting
 writeback work

We only allow non-critical writeback if the origin is idle.  It is up
to the policy to decide which writeback work is critical.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-cleaner.c  | 3 ++-
 drivers/md/dm-cache-policy-internal.h | 5 +++--
 drivers/md/dm-cache-policy-mq.c       | 2 +-
 drivers/md/dm-cache-policy.h          | 7 +++++--
 drivers/md/dm-cache-target.c          | 3 ++-
 5 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-cleaner.c b/drivers/md/dm-cache-policy-cleaner.c
index 004e463c9423..240c9f0e85e7 100644
--- a/drivers/md/dm-cache-policy-cleaner.c
+++ b/drivers/md/dm-cache-policy-cleaner.c
@@ -359,7 +359,8 @@ static struct wb_cache_entry *get_next_dirty_entry(struct policy *p)
 
 static int wb_writeback_work(struct dm_cache_policy *pe,
 			     dm_oblock_t *oblock,
-			     dm_cblock_t *cblock)
+			     dm_cblock_t *cblock,
+			     bool critical_only)
 {
 	int r = -ENOENT;
 	struct policy *p = to_policy(pe);
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index c198e6defb9c..776c685167e6 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -55,9 +55,10 @@ static inline int policy_walk_mappings(struct dm_cache_policy *p,
 
 static inline int policy_writeback_work(struct dm_cache_policy *p,
 					dm_oblock_t *oblock,
-					dm_cblock_t *cblock)
+					dm_cblock_t *cblock,
+					bool critical_only)
 {
-	return p->writeback_work ? p->writeback_work(p, oblock, cblock) : -ENOENT;
+	return p->writeback_work ? p->writeback_work(p, oblock, cblock, critical_only) : -ENOENT;
 }
 
 static inline void policy_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 515d44bf24d3..7cbae125879c 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1236,7 +1236,7 @@ static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
 }
 
 static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
-			     dm_cblock_t *cblock)
+			     dm_cblock_t *cblock, bool critical_only)
 {
 	int r;
 	struct mq_policy *mq = to_mq_policy(p);
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 5524e21e4836..6106ca3aa350 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -178,7 +178,9 @@ struct dm_cache_policy {
 	int (*remove_cblock)(struct dm_cache_policy *p, dm_cblock_t cblock);
 
 	/*
-	 * Provide a dirty block to be written back by the core target.
+	 * Provide a dirty block to be written back by the core target.  If
+	 * critical_only is set then the policy should only provide work if
+	 * it urgently needs it.
 	 *
 	 * Returns:
 	 *
@@ -186,7 +188,8 @@ struct dm_cache_policy {
 	 *
 	 * -ENODATA: no dirty blocks available
 	 */
-	int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock);
+	int (*writeback_work)(struct dm_cache_policy *p, dm_oblock_t *oblock, dm_cblock_t *cblock,
+			      bool critical_only);
 
 	/*
 	 * How full is the cache?
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 940c7b2b5ab4..5a9cd2c5a359 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -1787,6 +1787,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
 	dm_cblock_t cblock;
 	struct prealloc structs;
 	struct dm_bio_prison_cell *old_ocell;
+	bool busy = !iot_idle_for(&cache->origin_tracker, HZ);
 
 	memset(&structs, 0, sizeof(structs));
 
@@ -1794,7 +1795,7 @@ static void writeback_some_dirty_blocks(struct cache *cache)
 		if (prealloc_data_structs(cache, &structs))
 			break;
 
-		r = policy_writeback_work(cache->policy, &oblock, &cblock);
+		r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
 		if (r)
 			break;
 
-- 
cgit v1.2.1


From 451b9e0071b2833744db7f518115bc085bc7b23c Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:22:02 +0100
Subject: dm cache: pull out some bitset utility functions for reuse

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-internal.h | 28 ++++++++++++++++++++++++++++
 drivers/md/dm-cache-target.c          | 24 ------------------------
 2 files changed, 28 insertions(+), 24 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 776c685167e6..9dc05a52369e 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -7,6 +7,7 @@
 #ifndef DM_CACHE_POLICY_INTERNAL_H
 #define DM_CACHE_POLICY_INTERNAL_H
 
+#include <linux/vmalloc.h>
 #include "dm-cache-policy.h"
 
 /*----------------------------------------------------------------*/
@@ -106,6 +107,33 @@ static inline int policy_set_config_value(struct dm_cache_policy *p,
 
 /*----------------------------------------------------------------*/
 
+/*
+ * Some utility functions commonly used by policies and the core target.
+ */
+static inline size_t bitset_size_in_bytes(unsigned nr_entries)
+{
+	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
+}
+
+static inline unsigned long *alloc_bitset(unsigned nr_entries)
+{
+	size_t s = bitset_size_in_bytes(nr_entries);
+	return vzalloc(s);
+}
+
+static inline void clear_bitset(void *bitset, unsigned nr_entries)
+{
+	size_t s = bitset_size_in_bytes(nr_entries);
+	memset(bitset, 0, s);
+}
+
+static inline void free_bitset(unsigned long *bits)
+{
+	vfree(bits);
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * Creates a new cache policy given a policy name, a cache size, an origin size and the block size.
  */
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 5a9cd2c5a359..5d3b20b91ba3 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -111,30 +111,6 @@ static void iot_io_end(struct io_tracker *iot, sector_t len)
 
 /*----------------------------------------------------------------*/
 
-static size_t bitset_size_in_bytes(unsigned nr_entries)
-{
-	return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
-}
-
-static unsigned long *alloc_bitset(unsigned nr_entries)
-{
-	size_t s = bitset_size_in_bytes(nr_entries);
-	return vzalloc(s);
-}
-
-static void clear_bitset(void *bitset, unsigned nr_entries)
-{
-	size_t s = bitset_size_in_bytes(nr_entries);
-	memset(bitset, 0, s);
-}
-
-static void free_bitset(unsigned long *bits)
-{
-	vfree(bits);
-}
-
-/*----------------------------------------------------------------*/
-
 /*
  * There are a couple of places where we let a bio run, but want to do some
  * work before calling its endio function.  We do this by temporarily
-- 
cgit v1.2.1


From 3cdf93f9d85979b22b6abfd4ab19350860e4dfac Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:23:35 +0100
Subject: dm bio prison: add dm_cell_promote_or_release()

Rather than always releasing the prisoners in a cell, the client may
want to promote one of them to be the new holder.  There is a race here
though between releasing an empty cell, and other threads adding new
inmates.  So this function makes the decision with its lock held.

This function can have two outcomes:
i)  An inmate is promoted to be the holder of the cell (return value of 0).
ii) The cell has no inmate for promotion and is released (return value of 1).

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-bio-prison.c | 26 ++++++++++++++++++++++++++
 drivers/md/dm-bio-prison.h | 13 +++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-bio-prison.c b/drivers/md/dm-bio-prison.c
index be065300e93c..cd6d1d21e057 100644
--- a/drivers/md/dm-bio-prison.c
+++ b/drivers/md/dm-bio-prison.c
@@ -255,6 +255,32 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
 }
 EXPORT_SYMBOL_GPL(dm_cell_visit_release);
 
+static int __promote_or_release(struct dm_bio_prison *prison,
+				struct dm_bio_prison_cell *cell)
+{
+	if (bio_list_empty(&cell->bios)) {
+		rb_erase(&cell->node, &prison->cells);
+		return 1;
+	}
+
+	cell->holder = bio_list_pop(&cell->bios);
+	return 0;
+}
+
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell)
+{
+	int r;
+	unsigned long flags;
+
+	spin_lock_irqsave(&prison->lock, flags);
+	r = __promote_or_release(prison, cell);
+	spin_unlock_irqrestore(&prison->lock, flags);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(dm_cell_promote_or_release);
+
 /*----------------------------------------------------------------*/
 
 #define DEFERRED_SET_SIZE 64
diff --git a/drivers/md/dm-bio-prison.h b/drivers/md/dm-bio-prison.h
index 74cf01144b1f..54352f009bfd 100644
--- a/drivers/md/dm-bio-prison.h
+++ b/drivers/md/dm-bio-prison.h
@@ -101,6 +101,19 @@ void dm_cell_visit_release(struct dm_bio_prison *prison,
 			   void (*visit_fn)(void *, struct dm_bio_prison_cell *),
 			   void *context, struct dm_bio_prison_cell *cell);
 
+/*
+ * Rather than always releasing the prisoners in a cell, the client may
+ * want to promote one of them to be the new holder.  There is a race here
+ * though between releasing an empty cell, and other threads adding new
+ * inmates.  So this function makes the decision with the prison lock held.
+ *
+ * This function can have two outcomes:
+ * i) An inmate is promoted to be the holder of the cell (return value of 0).
+ * ii) The cell has no inmate for promotion and is released (return value of 1).
+ */
+int dm_cell_promote_or_release(struct dm_bio_prison *prison,
+			       struct dm_bio_prison_cell *cell);
+
 /*----------------------------------------------------------------*/
 
 /*
-- 
cgit v1.2.1


From 651f5fa2a3959ff5db60c09a84efd66309fe4035 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:26:08 +0100
Subject: dm cache: defer whole cells

Currently individual bios are deferred to the worker thread if they
cannot be processed immediately (e.g., a block is in the process of
being moved to the fast device).

This patch passes whole cells across to the worker.  This saves
reacquiring the cell, and also collects bios destined for the same block
together, which allows them to be mapped with a single policy lookup.
This reduces the overhead of using dm-cache.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 325 ++++++++++++++++++++++++++++++++++---------
 1 file changed, 262 insertions(+), 63 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 5d3b20b91ba3..d2d91c164420 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -257,6 +257,7 @@ struct cache {
 	int sectors_per_block_shift;
 
 	spinlock_t lock;
+	struct list_head deferred_cells;
 	struct bio_list deferred_bios;
 	struct bio_list deferred_flush_bios;
 	struct bio_list deferred_writethrough_bios;
@@ -969,26 +970,63 @@ static void dec_io_migrations(struct cache *cache)
 	atomic_dec(&cache->nr_io_migrations);
 }
 
-static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
-			 bool holder)
+static void __cell_release(struct cache *cache, struct dm_bio_prison_cell *cell,
+			   bool holder, struct bio_list *bios)
 {
 	(holder ? dm_cell_release : dm_cell_release_no_holder)
-		(cache->prison, cell, &cache->deferred_bios);
+		(cache->prison, cell, bios);
 	free_prison_cell(cache, cell);
 }
 
-static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
-		       bool holder)
+static bool discard_or_flush(struct bio *bio)
+{
+	return bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD);
+}
+
+static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+	if (discard_or_flush(cell->holder))
+		/*
+		 * We have to handle these bios
+		 * individually.
+		 */
+		__cell_release(cache, cell, true, &cache->deferred_bios);
+
+	else
+		list_add_tail(&cell->user_list, &cache->deferred_cells);
+}
+
+static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell, bool holder)
 {
 	unsigned long flags;
 
+	if (!holder && dm_cell_promote_or_release(cache->prison, cell)) {
+		/*
+		 * There was no prisoner to promote to holder, the
+		 * cell has been released.
+		 */
+		free_prison_cell(cache, cell);
+		return;
+	}
+
 	spin_lock_irqsave(&cache->lock, flags);
-	__cell_defer(cache, cell, holder);
+	__cell_defer(cache, cell);
 	spin_unlock_irqrestore(&cache->lock, flags);
 
 	wake_worker(cache);
 }
 
+static void cell_error_with_code(struct cache *cache, struct dm_bio_prison_cell *cell, int err)
+{
+	dm_cell_error(cache->prison, cell, err);
+	dm_bio_prison_free_cell(cache->prison, cell);
+}
+
+static void cell_requeue(struct cache *cache, struct dm_bio_prison_cell *cell)
+{
+	cell_error_with_code(cache, cell, DM_ENDIO_REQUEUE);
+}
+
 static void free_io_migration(struct dm_cache_migration *mg)
 {
 	dec_io_migrations(mg->cache);
@@ -1525,6 +1563,107 @@ static void inc_miss_counter(struct cache *cache, struct bio *bio)
 
 /*----------------------------------------------------------------*/
 
+struct inc_detail {
+	struct cache *cache;
+	struct bio_list bios_for_issue;
+	struct bio_list unhandled_bios;
+	bool any_writes;
+};
+
+static void inc_fn(void *context, struct dm_bio_prison_cell *cell)
+{
+	struct bio *bio;
+	struct inc_detail *detail = context;
+	struct cache *cache = detail->cache;
+
+	inc_ds(cache, cell->holder, cell);
+	if (bio_data_dir(cell->holder) == WRITE)
+		detail->any_writes = true;
+
+	while ((bio = bio_list_pop(&cell->bios))) {
+		if (discard_or_flush(bio)) {
+			bio_list_add(&detail->unhandled_bios, bio);
+			continue;
+		}
+
+		if (bio_data_dir(bio) == WRITE)
+			detail->any_writes = true;
+
+		bio_list_add(&detail->bios_for_issue, bio);
+		inc_ds(cache, bio, cell);
+	}
+}
+
+// FIXME: refactor these two
+static void remap_cell_to_origin_clear_discard(struct cache *cache,
+					       struct dm_bio_prison_cell *cell,
+					       dm_oblock_t oblock, bool issue_holder)
+{
+	struct bio *bio;
+	unsigned long flags;
+	struct inc_detail detail;
+
+	detail.cache = cache;
+	bio_list_init(&detail.bios_for_issue);
+	bio_list_init(&detail.unhandled_bios);
+	detail.any_writes = false;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	remap_to_origin(cache, cell->holder);
+	if (issue_holder)
+		issue(cache, cell->holder);
+	else
+		accounted_begin(cache, cell->holder);
+
+	if (detail.any_writes)
+		clear_discard(cache, oblock_to_dblock(cache, oblock));
+
+	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+		remap_to_origin(cache, bio);
+		issue(cache, bio);
+	}
+}
+
+static void remap_cell_to_cache_dirty(struct cache *cache, struct dm_bio_prison_cell *cell,
+				      dm_oblock_t oblock, dm_cblock_t cblock, bool issue_holder)
+{
+	struct bio *bio;
+	unsigned long flags;
+	struct inc_detail detail;
+
+	detail.cache = cache;
+	bio_list_init(&detail.bios_for_issue);
+	bio_list_init(&detail.unhandled_bios);
+	detail.any_writes = false;
+
+	spin_lock_irqsave(&cache->lock, flags);
+	dm_cell_visit_release(cache->prison, inc_fn, &detail, cell);
+	bio_list_merge(&cache->deferred_bios, &detail.unhandled_bios);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	remap_to_cache(cache, cell->holder, cblock);
+	if (issue_holder)
+		issue(cache, cell->holder);
+	else
+		accounted_begin(cache, cell->holder);
+
+	if (detail.any_writes) {
+		set_dirty(cache, oblock, cblock);
+		clear_discard(cache, oblock_to_dblock(cache, oblock));
+	}
+
+	while ((bio = bio_list_pop(&detail.bios_for_issue))) {
+		remap_to_cache(cache, bio, cblock);
+		issue(cache, bio);
+	}
+}
+
+/*----------------------------------------------------------------*/
+
 struct old_oblock_lock {
 	struct policy_locker locker;
 	struct cache *cache;
@@ -1549,28 +1688,18 @@ static int cell_locker(struct policy_locker *locker, dm_oblock_t b)
 			  l->structs, &l->cell);
 }
 
-static void process_bio(struct cache *cache, struct prealloc *structs,
-			struct bio *bio)
+static void process_cell(struct cache *cache, struct prealloc *structs,
+			 struct dm_bio_prison_cell *new_ocell)
 {
 	int r;
 	bool release_cell = true;
+	struct bio *bio = new_ocell->holder;
 	dm_oblock_t block = get_bio_block(cache, bio);
-	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
 	struct policy_result lookup_result;
 	bool passthrough = passthrough_mode(&cache->features);
 	bool discarded_block, can_migrate;
 	struct old_oblock_lock ool;
 
-	/*
-	 * Check to see if that block is currently migrating.
-	 */
-	cell_prealloc = prealloc_get_cell(structs);
-	r = bio_detain(cache, block, bio, cell_prealloc,
-		       (cell_free_fn) prealloc_put_cell,
-		       structs, &new_ocell);
-	if (r > 0)
-		return;
-
 	discarded_block = is_discarded_oblock(cache, block);
 	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
 
@@ -1615,9 +1744,9 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
 				inc_and_issue(cache, bio, new_ocell);
 
-			} else  {
-				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
-				inc_and_issue(cache, bio, new_ocell);
+			} else {
+				remap_cell_to_cache_dirty(cache, new_ocell, block, lookup_result.cblock, true);
+				release_cell = false;
 			}
 		}
 
@@ -1625,8 +1754,8 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 
 	case POLICY_MISS:
 		inc_miss_counter(cache, bio);
-		remap_to_origin_clear_discard(cache, bio, block);
-		inc_and_issue(cache, bio, new_ocell);
+		remap_cell_to_origin_clear_discard(cache, new_ocell, block, true);
+		release_cell = false;
 		break;
 
 	case POLICY_NEW:
@@ -1654,10 +1783,30 @@ static void process_bio(struct cache *cache, struct prealloc *structs,
 		cell_defer(cache, new_ocell, false);
 }
 
+static void process_bio(struct cache *cache, struct prealloc *structs,
+			struct bio *bio)
+{
+	int r;
+	dm_oblock_t block = get_bio_block(cache, bio);
+	struct dm_bio_prison_cell *cell_prealloc, *new_ocell;
+
+	/*
+	 * Check to see if that block is currently migrating.
+	 */
+	cell_prealloc = prealloc_get_cell(structs);
+	r = bio_detain(cache, block, bio, cell_prealloc,
+		       (cell_free_fn) prealloc_put_cell,
+		       structs, &new_ocell);
+	if (r > 0)
+		return;
+
+	process_cell(cache, structs, new_ocell);
+}
+
 static int need_commit_due_to_time(struct cache *cache)
 {
-	return !time_in_range(jiffies, cache->last_commit_jiffies,
-			      cache->last_commit_jiffies + COMMIT_PERIOD);
+	return jiffies < cache->last_commit_jiffies ||
+	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
 }
 
 static int commit_if_needed(struct cache *cache)
@@ -1716,6 +1865,40 @@ static void process_deferred_bios(struct cache *cache)
 	prealloc_free_structs(cache, &structs);
 }
 
+static void process_deferred_cells(struct cache *cache)
+{
+	unsigned long flags;
+	struct dm_bio_prison_cell *cell, *tmp;
+	struct list_head cells;
+	struct prealloc structs;
+
+	memset(&structs, 0, sizeof(structs));
+
+	INIT_LIST_HEAD(&cells);
+
+	spin_lock_irqsave(&cache->lock, flags);
+	list_splice_init(&cache->deferred_cells, &cells);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	list_for_each_entry_safe(cell, tmp, &cells, user_list) {
+		/*
+		 * If we've got no free migration structs, and processing
+		 * this cell might require one, we pause until there are some
+		 * prepared mappings to process.
+		 */
+		if (prealloc_data_structs(cache, &structs)) {
+			spin_lock_irqsave(&cache->lock, flags);
+			list_splice(&cells, &cache->deferred_cells);
+			spin_unlock_irqrestore(&cache->lock, flags);
+			break;
+		}
+
+		process_cell(cache, &structs, cell);
+	}
+
+	prealloc_free_structs(cache, &structs);
+}
+
 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
 {
 	unsigned long flags;
@@ -1883,7 +2066,22 @@ static void stop_worker(struct cache *cache)
 	flush_workqueue(cache->wq);
 }
 
-static void requeue_deferred_io(struct cache *cache)
+static void requeue_deferred_cells(struct cache *cache)
+{
+	unsigned long flags;
+	struct list_head cells;
+	struct dm_bio_prison_cell *cell, *tmp;
+
+	INIT_LIST_HEAD(&cells);
+	spin_lock_irqsave(&cache->lock, flags);
+	list_splice_init(&cache->deferred_cells, &cells);
+	spin_unlock_irqrestore(&cache->lock, flags);
+
+	list_for_each_entry_safe(cell, tmp, &cells, user_list)
+		cell_requeue(cache, cell);
+}
+
+static void requeue_deferred_bios(struct cache *cache)
 {
 	struct bio *bio;
 	struct bio_list bios;
@@ -1904,6 +2102,7 @@ static int more_work(struct cache *cache)
 			!list_empty(&cache->need_commit_migrations);
 	else
 		return !bio_list_empty(&cache->deferred_bios) ||
+			!list_empty(&cache->deferred_cells) ||
 			!bio_list_empty(&cache->deferred_flush_bios) ||
 			!bio_list_empty(&cache->deferred_writethrough_bios) ||
 			!list_empty(&cache->quiesced_migrations) ||
@@ -1921,6 +2120,7 @@ static void do_worker(struct work_struct *ws)
 			writeback_some_dirty_blocks(cache);
 			process_deferred_writethrough_bios(cache);
 			process_deferred_bios(cache);
+			process_deferred_cells(cache);
 			process_invalidation_requests(cache);
 		}
 
@@ -1935,6 +2135,7 @@ static void do_worker(struct work_struct *ws)
 			 * FIXME: rollback metadata or just go into a
 			 * failure mode and error everything
 			 */
+
 		} else {
 			process_deferred_flush_bios(cache, true);
 			process_migrations(cache, &cache->need_commit_migrations,
@@ -2525,6 +2726,7 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 	}
 
 	spin_lock_init(&cache->lock);
+	INIT_LIST_HEAD(&cache->deferred_cells);
 	bio_list_init(&cache->deferred_bios);
 	bio_list_init(&cache->deferred_flush_bios);
 	bio_list_init(&cache->deferred_writethrough_bios);
@@ -2682,9 +2884,14 @@ out:
 	return r;
 }
 
-static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_prison_cell **cell)
+/*----------------------------------------------------------------*/
+
+static int cache_map(struct dm_target *ti, struct bio *bio)
 {
+	struct cache *cache = ti->private;
+
 	int r;
+	struct dm_bio_prison_cell *cell = NULL;
 	dm_oblock_t block = get_bio_block(cache, bio);
 	size_t pb_data_size = get_per_bio_data_size(cache);
 	bool can_migrate = false;
@@ -2702,10 +2909,11 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 		 * Just remap to the origin and carry on.
 		 */
 		remap_to_origin(cache, bio);
+		accounted_begin(cache, bio);
 		return DM_MAPIO_REMAPPED;
 	}
 
-	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
+	if (discard_or_flush(bio)) {
 		defer_bio(cache, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -2713,15 +2921,15 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	/*
 	 * Check to see if that block is currently migrating.
 	 */
-	*cell = alloc_prison_cell(cache);
-	if (!*cell) {
+	cell = alloc_prison_cell(cache);
+	if (!cell) {
 		defer_bio(cache, bio);
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	r = bio_detain(cache, block, bio, *cell,
+	r = bio_detain(cache, block, bio, cell,
 		       (cell_free_fn) free_prison_cell,
-		       cache, cell);
+		       cache, &cell);
 	if (r) {
 		if (r < 0)
 			defer_bio(cache, bio);
@@ -2734,12 +2942,12 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
 		       bio, &ool.locker, &lookup_result);
 	if (r == -EWOULDBLOCK) {
-		cell_defer(cache, *cell, true);
+		cell_defer(cache, cell, true);
 		return DM_MAPIO_SUBMITTED;
 
 	} else if (r) {
 		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
-		cell_defer(cache, *cell, false);
+		cell_defer(cache, cell, false);
 		bio_io_error(bio);
 		return DM_MAPIO_SUBMITTED;
 	}
@@ -2753,21 +2961,30 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 				 * We need to invalidate this block, so
 				 * defer for the worker thread.
 				 */
-				cell_defer(cache, *cell, true);
+				cell_defer(cache, cell, true);
 				r = DM_MAPIO_SUBMITTED;
 
 			} else {
 				inc_miss_counter(cache, bio);
 				remap_to_origin_clear_discard(cache, bio, block);
+				accounted_begin(cache, bio);
+				inc_ds(cache, bio, cell);
+				// FIXME: we want to remap hits or misses straight
+				// away rather than passing over to the worker.
+				cell_defer(cache, cell, false);
 			}
 
 		} else {
 			inc_hit_counter(cache, bio);
 			if (bio_data_dir(bio) == WRITE && writethrough_mode(&cache->features) &&
-			    !is_dirty(cache, lookup_result.cblock))
+			    !is_dirty(cache, lookup_result.cblock)) {
 				remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
-			else
-				remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
+				accounted_begin(cache, bio);
+				inc_ds(cache, bio, cell);
+				cell_defer(cache, cell, false);
+
+			} else
+				remap_cell_to_cache_dirty(cache, cell, block, lookup_result.cblock, false);
 		}
 		break;
 
@@ -2779,18 +2996,18 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 			 * longer needed because the block has been demoted.
 			 */
 			bio_endio(bio, 0);
-			cell_defer(cache, *cell, false);
+			// FIXME: remap everything as a miss
+			cell_defer(cache, cell, false);
 			r = DM_MAPIO_SUBMITTED;
 
 		} else
-			remap_to_origin_clear_discard(cache, bio, block);
-
+			remap_cell_to_origin_clear_discard(cache, cell, block, false);
 		break;
 
 	default:
 		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
 			    (unsigned) lookup_result.op);
-		cell_defer(cache, *cell, false);
+		cell_defer(cache, cell, false);
 		bio_io_error(bio);
 		r = DM_MAPIO_SUBMITTED;
 	}
@@ -2798,25 +3015,6 @@ static int __cache_map(struct cache *cache, struct bio *bio, struct dm_bio_priso
 	return r;
 }
 
-static int cache_map(struct dm_target *ti, struct bio *bio)
-{
-	int r;
-	struct dm_bio_prison_cell *cell = NULL;
-	struct cache *cache = ti->private;
-
-	r = __cache_map(cache, bio, &cell);
-	if (r == DM_MAPIO_REMAPPED) {
-		accounted_begin(cache, bio);
-
-		if (cell) {
-			inc_ds(cache, bio, cell);
-			cell_defer(cache, cell, false);
-		}
-	}
-
-	return r;
-}
-
 static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 {
 	struct cache *cache = ti->private;
@@ -2913,7 +3111,8 @@ static void cache_postsuspend(struct dm_target *ti)
 	start_quiescing(cache);
 	wait_for_migrations(cache);
 	stop_worker(cache);
-	requeue_deferred_io(cache);
+	requeue_deferred_bios(cache);
+	requeue_deferred_cells(cache);
 	stop_quiescing(cache);
 
 	(void) sync_metadata(cache);
-- 
cgit v1.2.1


From 40775257b97e27305cf5c2425be7acaa6edee4ea Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:29:58 +0100
Subject: dm cache: boost promotion of blocks that will be overwritten

When considering whether to move a block to the cache we already give
preferential treatment to discarded blocks, since they are cheap to
promote (no read of the origin required since the data is junk).

The same is true of blocks that are about to be completely
overwritten, so we likewise boost their promotion chances.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index d2d91c164420..7829d947ef01 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -1697,17 +1697,17 @@ static void process_cell(struct cache *cache, struct prealloc *structs,
 	dm_oblock_t block = get_bio_block(cache, bio);
 	struct policy_result lookup_result;
 	bool passthrough = passthrough_mode(&cache->features);
-	bool discarded_block, can_migrate;
+	bool fast_promotion, can_migrate;
 	struct old_oblock_lock ool;
 
-	discarded_block = is_discarded_oblock(cache, block);
-	can_migrate = !passthrough && (discarded_block || spare_migration_bandwidth(cache));
+	fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
+	can_migrate = !passthrough && (fast_promotion || spare_migration_bandwidth(cache));
 
 	ool.locker.fn = cell_locker;
 	ool.cache = cache;
 	ool.structs = structs;
 	ool.cell = NULL;
-	r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
+	r = policy_map(cache->policy, block, true, can_migrate, fast_promotion,
 		       bio, &ool.locker, &lookup_result);
 
 	if (r == -EWOULDBLOCK)
@@ -2895,7 +2895,7 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 	dm_oblock_t block = get_bio_block(cache, bio);
 	size_t pb_data_size = get_per_bio_data_size(cache);
 	bool can_migrate = false;
-	bool discarded_block;
+	bool fast_promotion;
 	struct policy_result lookup_result;
 	struct per_bio_data *pb = init_per_bio_data(bio, pb_data_size);
 	struct old_oblock_lock ool;
@@ -2937,9 +2937,9 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_SUBMITTED;
 	}
 
-	discarded_block = is_discarded_oblock(cache, block);
+	fast_promotion = is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio);
 
-	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
+	r = policy_map(cache->policy, block, false, can_migrate, fast_promotion,
 		       bio, &ool.locker, &lookup_result);
 	if (r == -EWOULDBLOCK) {
 		cell_defer(cache, cell, true);
-- 
cgit v1.2.1


From 4452226ea276e74fc3e252c88d9bb7e8f8e44bf0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:26 -0400
Subject: writeback: move backing_dev_info->state into bdi_writeback

Currently, a bdi (backing_dev_info) embeds single wb (bdi_writeback)
and the role of the separation is unclear.  For cgroup support for
writeback IOs, a bdi will be updated to host multiple wb's where each
wb serves writeback IOs of a different cgroup on the bdi.  To achieve
that, a wb should carry all states necessary for servicing writeback
IOs for a cgroup independently.

This patch moves bdi->state into wb.

* enum bdi_state is renamed to wb_state and the prefix of all enums is
  changed from BDI_ to WB_.

* Explicit zeroing of bdi->state is removed without adding zeroing of
  wb->state as the whole data structure is zeroed on init anyway.

* As there's still only one bdi_writeback per backing_dev_info, all
  uses of bdi->state are mechanically replaced with bdi->wb.state
  introducing no behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: drbd-dev@lists.linbit.com
Cc: Neil Brown <neilb@suse.de>
Cc: Alasdair Kergon <agk@redhat.com>
Cc: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/dm.c     | 2 +-
 drivers/md/raid1.c  | 4 ++--
 drivers/md/raid10.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 38837f8ea327..2161ed9329c4 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2074,7 +2074,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 			 * the query about congestion status of request_queue
 			 */
 			if (dm_request_based(md))
-				r = md->queue->backing_dev_info.state &
+				r = md->queue->backing_dev_info.wb.state &
 				    bdi_bits;
 			else
 				r = dm_table_any_congested(map, bdi_bits);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 9157a29c8dbf..f80f1af61ce7 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -745,7 +745,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
 	struct r1conf *conf = mddev->private;
 	int i, ret = 0;
 
-	if ((bits & (1 << BDI_async_congested)) &&
+	if ((bits & (1 << WB_async_congested)) &&
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 
@@ -760,7 +760,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
 			/* Note the '|| 1' - when read_balance prefers
 			 * non-congested targets, it can be removed
 			 */
-			if ((bits & (1<<BDI_async_congested)) || 1)
+			if ((bits & (1 << WB_async_congested)) || 1)
 				ret |= bdi_congested(&q->backing_dev_info, bits);
 			else
 				ret &= bdi_congested(&q->backing_dev_info, bits);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e793ab6b3570..fca825718f29 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -914,7 +914,7 @@ static int raid10_congested(struct mddev *mddev, int bits)
 	struct r10conf *conf = mddev->private;
 	int i, ret = 0;
 
-	if ((bits & (1 << BDI_async_congested)) &&
+	if ((bits & (1 << WB_async_congested)) &&
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 
-- 
cgit v1.2.1


From 66114cad64bf76a155fec1f0fff0de771cf909d5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 22 May 2015 17:13:32 -0400
Subject: writeback: separate out include/linux/backing-dev-defs.h

With the planned cgroup writeback support, backing-dev related
declarations will be more widely used across block and cgroup;
unfortunately, including backing-dev.h from include/linux/blkdev.h
makes cyclic include dependency quite likely.

This patch separates out backing-dev-defs.h which only has the
essential definitions and updates blkdev.h to include it.  c files
which need access to more backing-dev details now include
backing-dev.h directly.  This takes backing-dev.h off the common
include dependency chain making it a lot easier to use it across block
and cgroup.

v2: fs/fat build failure fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Jens Axboe <axboe@fb.com>
---
 drivers/md/bcache/request.c | 1 +
 drivers/md/dm.h             | 1 +
 drivers/md/md.h             | 1 +
 3 files changed, 3 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 1616f668a4cb..4afb2d26b148 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/hash.h>
 #include <linux/random.h>
+#include <linux/backing-dev.h>
 
 #include <trace/events/bcache.h>
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index e6e66d087b26..7fff744f0865 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -14,6 +14,7 @@
 #include <linux/device-mapper.h>
 #include <linux/list.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/hdreg.h>
 #include <linux/completion.h>
 #include <linux/kobject.h>
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 4046a6c6f223..7da6e9c3cb53 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -16,6 +16,7 @@
 #define _MD_MD_H
 
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mm.h>
-- 
cgit v1.2.1


From 66a636356647a9be8885c2ce2948de126577698a Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 15 May 2015 15:33:34 +0100
Subject: dm cache: add stochastic-multi-queue (smq) policy

The stochastic-multi-queue (smq) policy addresses some of the problems
with the current multiqueue (mq) policy.

Memory usage
------------

The mq policy uses a lot of memory; 88 bytes per cache block on a 64
bit machine.

SMQ uses 28bit indexes to implement its data structures rather than
pointers.  It avoids storing an explicit hit count for each block.  It
has a 'hotspot' queue rather than a pre cache which uses a quarter of
the entries (each hotspot block covers a larger area than a single
cache block).

All these mean smq uses ~25bytes per cache block.  Still a lot of
memory, but a substantial improvement nonetheless.

Level balancing
---------------

MQ places entries in different levels of the multiqueue structures
based on their hit count (~ln(hit count)).  This means the bottom
levels generally have the most entries, and the top ones have very
few.  Having unbalanced levels like this reduces the efficacy of the
multiqueue.

SMQ does not maintain a hit count, instead it swaps hit entries with
the least recently used entry from the level above.  The overall
ordering being a side effect of this stochastic process.  With this
scheme we can decide how many entries occupy each multiqueue level,
resulting in better promotion/demotion decisions.

Adaptability
------------

The MQ policy maintains a hit count for each cache block.  For a
different block to get promoted to the cache its hit count has to
exceed the lowest currently in the cache.  This means it can take a
long time for the cache to adapt between varying IO patterns.
Periodically degrading the hit counts could help with this, but I
haven't found a nice general solution.

SMQ doesn't maintain hit counts, so a lot of this problem just goes
away.  In addition it tracks performance of the hotspot queue, which
is used to decide which blocks to promote.  If the hotspot queue is
performing badly then it starts moving entries more quickly between
levels.  This lets it adapt to new IO patterns very quickly.

Performance
-----------

In my tests SMQ shows substantially better performance than MQ.  Once
this matures a bit more I'm sure it'll become the default policy.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/Kconfig               |   12 +
 drivers/md/Makefile              |    2 +
 drivers/md/dm-cache-policy-smq.c | 1768 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 1782 insertions(+)
 create mode 100644 drivers/md/dm-cache-policy-smq.c

(limited to 'drivers/md')

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index edcf4ab66e00..b59727309072 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -304,6 +304,18 @@ config DM_CACHE_MQ
          This is meant to be a general purpose policy.  It prioritises
          reads over writes.
 
+config DM_CACHE_SMQ
+       tristate "Stochastic MQ Cache Policy (EXPERIMENTAL)"
+       depends on DM_CACHE
+       default y
+       ---help---
+         A cache policy that uses a multiqueue ordered by recent hits
+         to select which blocks should be promoted and demoted.
+         This is meant to be a general purpose policy.  It prioritises
+         reads over writes.  This SMQ policy (vs MQ) offers the promise
+         of less memory utilization, improved performance and increased
+         adaptability in the face of changing workloads.
+
 config DM_CACHE_CLEANER
        tristate "Cleaner Cache Policy (EXPERIMENTAL)"
        depends on DM_CACHE
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index dba4db5985fb..462f443a4f85 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -13,6 +13,7 @@ dm-log-userspace-y \
 dm-thin-pool-y	+= dm-thin.o dm-thin-metadata.o
 dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o
 dm-cache-mq-y   += dm-cache-policy-mq.o
+dm-cache-smq-y   += dm-cache-policy-smq.o
 dm-cache-cleaner-y += dm-cache-policy-cleaner.o
 dm-era-y	+= dm-era-target.o
 md-mod-y	+= md.o bitmap.o
@@ -54,6 +55,7 @@ obj-$(CONFIG_DM_THIN_PROVISIONING)	+= dm-thin-pool.o
 obj-$(CONFIG_DM_VERITY)		+= dm-verity.o
 obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
 obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
+obj-$(CONFIG_DM_CACHE_SMQ)	+= dm-cache-smq.o
 obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
 obj-$(CONFIG_DM_ERA)		+= dm-era.o
 obj-$(CONFIG_DM_LOG_WRITES)	+= dm-log-writes.o
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
new file mode 100644
index 000000000000..55a657f78f00
--- /dev/null
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -0,0 +1,1768 @@
+/*
+ * Copyright (C) 2015 Red Hat. All rights reserved.
+ *
+ * This file is released under the GPL.
+ */
+
+#include "dm-cache-policy.h"
+#include "dm-cache-policy-internal.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/vmalloc.h>
+#include <linux/math64.h>
+
+#define DM_MSG_PREFIX "cache-policy-smq"
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Safe division functions that return zero on divide by zero.
+ */
+static unsigned safe_div(unsigned n, unsigned d)
+{
+	return d ? n / d : 0u;
+}
+
+static unsigned safe_mod(unsigned n, unsigned d)
+{
+	return d ? n % d : 0u;
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry {
+	unsigned hash_next:28;
+	unsigned prev:28;
+	unsigned next:28;
+	unsigned level:7;
+	bool dirty:1;
+	bool allocated:1;
+	bool sentinel:1;
+
+	dm_oblock_t oblock;
+};
+
+/*----------------------------------------------------------------*/
+
+#define INDEXER_NULL ((1u << 28u) - 1u)
+
+/*
+ * An entry_space manages a set of entries that we use for the queues.
+ * The clean and dirty queues share entries, so this object is separate
+ * from the queue itself.
+ */
+struct entry_space {
+	struct entry *begin;
+	struct entry *end;
+};
+
+static int space_init(struct entry_space *es, unsigned nr_entries)
+{
+	if (!nr_entries) {
+		es->begin = es->end = NULL;
+		return 0;
+	}
+
+	es->begin = vzalloc(sizeof(struct entry) * nr_entries);
+	if (!es->begin)
+		return -ENOMEM;
+
+	es->end = es->begin + nr_entries;
+	return 0;
+}
+
+static void space_exit(struct entry_space *es)
+{
+	vfree(es->begin);
+}
+
+static struct entry *__get_entry(struct entry_space *es, unsigned block)
+{
+	struct entry *e;
+
+	e = es->begin + block;
+	BUG_ON(e >= es->end);
+
+	return e;
+}
+
+static unsigned to_index(struct entry_space *es, struct entry *e)
+{
+	BUG_ON(e < es->begin || e >= es->end);
+	return e - es->begin;
+}
+
+static struct entry *to_entry(struct entry_space *es, unsigned block)
+{
+	if (block == INDEXER_NULL)
+		return NULL;
+
+	return __get_entry(es, block);
+}
+
+/*----------------------------------------------------------------*/
+
+struct ilist {
+	unsigned nr_elts;	/* excluding sentinel entries */
+	unsigned head, tail;
+};
+
+static void l_init(struct ilist *l)
+{
+	l->nr_elts = 0;
+	l->head = l->tail = INDEXER_NULL;
+}
+
+static struct entry *l_head(struct entry_space *es, struct ilist *l)
+{
+	return to_entry(es, l->head);
+}
+
+static struct entry *l_tail(struct entry_space *es, struct ilist *l)
+{
+	return to_entry(es, l->tail);
+}
+
+static struct entry *l_next(struct entry_space *es, struct entry *e)
+{
+	return to_entry(es, e->next);
+}
+
+static struct entry *l_prev(struct entry_space *es, struct entry *e)
+{
+	return to_entry(es, e->prev);
+}
+
+static bool l_empty(struct ilist *l)
+{
+	return l->head == INDEXER_NULL;
+}
+
+static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+	struct entry *head = l_head(es, l);
+
+	e->next = l->head;
+	e->prev = INDEXER_NULL;
+
+	if (head)
+		head->prev = l->head = to_index(es, e);
+	else
+		l->head = l->tail = to_index(es, e);
+
+	if (!e->sentinel)
+		l->nr_elts++;
+}
+
+static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+	struct entry *tail = l_tail(es, l);
+
+	e->next = INDEXER_NULL;
+	e->prev = l->tail;
+
+	if (tail)
+		tail->next = l->tail = to_index(es, e);
+	else
+		l->head = l->tail = to_index(es, e);
+
+	if (!e->sentinel)
+		l->nr_elts++;
+}
+
+static void l_add_before(struct entry_space *es, struct ilist *l,
+			 struct entry *old, struct entry *e)
+{
+	struct entry *prev = l_prev(es, old);
+
+	if (!prev)
+		l_add_head(es, l, e);
+
+	else {
+		e->prev = old->prev;
+		e->next = to_index(es, old);
+		prev->next = old->prev = to_index(es, e);
+
+		if (!e->sentinel)
+			l->nr_elts++;
+	}
+}
+
+static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
+{
+	struct entry *prev = l_prev(es, e);
+	struct entry *next = l_next(es, e);
+
+	if (prev)
+		prev->next = e->next;
+	else
+		l->head = e->next;
+
+	if (next)
+		next->prev = e->prev;
+	else
+		l->tail = e->prev;
+
+	if (!e->sentinel)
+		l->nr_elts--;
+}
+
+static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
+{
+	struct entry *e;
+
+	for (e = l_tail(es, l); e; e = l_prev(es, e))
+		if (!e->sentinel) {
+			l_del(es, l, e);
+			return e;
+		}
+
+	return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * The stochastic-multi-queue is a set of lru lists stacked into levels.
+ * Entries are moved up levels when they are used, which loosely orders the
+ * most accessed entries in the top levels and least in the bottom.  This
+ * structure is *much* better than a single lru list.
+ */
+#define MAX_LEVELS 64u
+
+struct queue {
+	struct entry_space *es;
+
+	unsigned nr_elts;
+	unsigned nr_levels;
+	struct ilist qs[MAX_LEVELS];
+
+	/*
+	 * We maintain a count of the number of entries we would like in each
+	 * level.
+	 */
+	unsigned last_target_nr_elts;
+	unsigned nr_top_levels;
+	unsigned nr_in_top_levels;
+	unsigned target_count[MAX_LEVELS];
+};
+
+static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
+{
+	unsigned i;
+
+	q->es = es;
+	q->nr_elts = 0;
+	q->nr_levels = nr_levels;
+
+	for (i = 0; i < q->nr_levels; i++) {
+		l_init(q->qs + i);
+		q->target_count[i] = 0u;
+	}
+
+	q->last_target_nr_elts = 0u;
+	q->nr_top_levels = 0u;
+	q->nr_in_top_levels = 0u;
+}
+
+static unsigned q_size(struct queue *q)
+{
+	return q->nr_elts;
+}
+
+/*
+ * Insert an entry to the back of the given level.
+ */
+static void q_push(struct queue *q, struct entry *e)
+{
+	if (!e->sentinel)
+		q->nr_elts++;
+
+	l_add_tail(q->es, q->qs + e->level, e);
+}
+
+static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
+{
+	if (!e->sentinel)
+		q->nr_elts++;
+
+	l_add_before(q->es, q->qs + e->level, old, e);
+}
+
+static void q_del(struct queue *q, struct entry *e)
+{
+	l_del(q->es, q->qs + e->level, e);
+	if (!e->sentinel)
+		q->nr_elts--;
+}
+
+/*
+ * Return the oldest entry of the lowest populated level.
+ */
+static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
+{
+	unsigned level;
+	struct entry *e;
+
+	max_level = min(max_level, q->nr_levels);
+
+	for (level = 0; level < max_level; level++)
+		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+			if (e->sentinel) {
+				if (can_cross_sentinel)
+					continue;
+				else
+					break;
+			}
+
+			return e;
+		}
+
+	return NULL;
+}
+
+static struct entry *q_pop(struct queue *q)
+{
+	struct entry *e = q_peek(q, q->nr_levels, true);
+
+	if (e)
+		q_del(q, e);
+
+	return e;
+}
+
+/*
+ * Pops an entry from a level that is not past a sentinel.
+ */
+static struct entry *q_pop_old(struct queue *q, unsigned max_level)
+{
+	struct entry *e = q_peek(q, max_level, false);
+
+	if (e)
+		q_del(q, e);
+
+	return e;
+}
+
+/*
+ * This function assumes there is a non-sentinel entry to pop.  It's only
+ * used by redistribute, so we know this is true.  It also doesn't adjust
+ * the q->nr_elts count.
+ */
+static struct entry *__redist_pop_from(struct queue *q, unsigned level)
+{
+	struct entry *e;
+
+	for (; level < q->nr_levels; level++)
+		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
+			if (!e->sentinel) {
+				l_del(q->es, q->qs + e->level, e);
+				return e;
+			}
+
+	return NULL;
+}
+
+static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
+{
+	unsigned level, nr_levels, entries_per_level, remainder;
+
+	BUG_ON(lbegin > lend);
+	BUG_ON(lend > q->nr_levels);
+	nr_levels = lend - lbegin;
+	entries_per_level = safe_div(nr_elts, nr_levels);
+	remainder = safe_mod(nr_elts, nr_levels);
+
+	for (level = lbegin; level < lend; level++)
+		q->target_count[level] =
+			(level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
+}
+
+/*
+ * Typically we have fewer elements in the top few levels which allows us
+ * to adjust the promote threshold nicely.
+ */
+static void q_set_targets(struct queue *q)
+{
+	if (q->last_target_nr_elts == q->nr_elts)
+		return;
+
+	q->last_target_nr_elts = q->nr_elts;
+
+	if (q->nr_top_levels > q->nr_levels)
+		q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
+
+	else {
+		q_set_targets_subrange_(q, q->nr_in_top_levels,
+					q->nr_levels - q->nr_top_levels, q->nr_levels);
+
+		if (q->nr_in_top_levels < q->nr_elts)
+			q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
+						0, q->nr_levels - q->nr_top_levels);
+		else
+			q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
+	}
+}
+
+static void q_redistribute(struct queue *q)
+{
+	unsigned target, level;
+	struct ilist *l, *l_above;
+	struct entry *e;
+
+	q_set_targets(q);
+
+	for (level = 0u; level < q->nr_levels - 1u; level++) {
+		l = q->qs + level;
+		target = q->target_count[level];
+
+		/*
+		 * Pull down some entries from the level above.
+		 */
+		while (l->nr_elts < target) {
+			e = __redist_pop_from(q, level + 1u);
+			if (!e) {
+				/* bug in nr_elts */
+				break;
+			}
+
+			e->level = level;
+			l_add_tail(q->es, l, e);
+		}
+
+		/*
+		 * Push some entries up.
+		 */
+		l_above = q->qs + level + 1u;
+		while (l->nr_elts > target) {
+			e = l_pop_tail(q->es, l);
+
+			if (!e)
+				/* bug in nr_elts */
+				break;
+
+			e->level = level + 1u;
+			l_add_head(q->es, l_above, e);
+		}
+	}
+}
+
+static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
+{
+	struct entry *de;
+	unsigned new_level;
+
+	q_del(q, e);
+
+	if (extra_levels && (e->level < q->nr_levels - 1u)) {
+		new_level = min(q->nr_levels - 1u, e->level + extra_levels);
+		for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
+			if (de->sentinel)
+				continue;
+
+			q_del(q, de);
+			de->level = e->level;
+
+			if (dest)
+				q_push_before(q, dest, de);
+			else
+				q_push(q, de);
+			break;
+		}
+
+		e->level = new_level;
+	}
+
+	q_push(q, e);
+}
+
+static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
+{
+	q_requeue_before(q, NULL, e, extra_levels);
+}
+
+/*----------------------------------------------------------------*/
+
+#define FP_SHIFT 8
+#define SIXTEENTH (1u << (FP_SHIFT - 4u))
+#define EIGHTH (1u << (FP_SHIFT - 3u))
+
+struct stats {
+	unsigned hit_threshold;
+	unsigned hits;
+	unsigned misses;
+};
+
+enum performance {
+	Q_POOR,
+	Q_FAIR,
+	Q_WELL
+};
+
+static void stats_init(struct stats *s, unsigned nr_levels)
+{
+	s->hit_threshold = (nr_levels * 3u) / 4u;
+	s->hits = 0u;
+	s->misses = 0u;
+}
+
+static void stats_reset(struct stats *s)
+{
+	s->hits = s->misses = 0u;
+}
+
+static void stats_level_accessed(struct stats *s, unsigned level)
+{
+	if (level >= s->hit_threshold)
+		s->hits++;
+	else
+		s->misses++;
+}
+
+static void stats_miss(struct stats *s)
+{
+	s->misses++;
+}
+
+/*
+ * There are times when we don't have any confidence in the hotspot queue.
+ * Such as when a fresh cache is created and the blocks have been spread
+ * out across the levels, or if an io load changes.  We detect this by
+ * seeing how often a lookup is in the top levels of the hotspot queue.
+ */
+static enum performance stats_assess(struct stats *s)
+{
+	unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
+
+	if (confidence < SIXTEENTH)
+		return Q_POOR;
+
+	else if (confidence < EIGHTH)
+		return Q_FAIR;
+
+	else
+		return Q_WELL;
+}
+
+/*----------------------------------------------------------------*/
+
+struct hash_table {
+	struct entry_space *es;
+	unsigned long long hash_bits;
+	unsigned *buckets;
+};
+
+/*
+ * All cache entries are stored in a chained hash table.  To save space we
+ * use indexing again, and only store indexes to the next entry.
+ */
+static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
+{
+	unsigned i, nr_buckets;
+
+	ht->es = es;
+	nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
+	ht->hash_bits = ffs(nr_buckets) - 1;
+
+	ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
+	if (!ht->buckets)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_buckets; i++)
+		ht->buckets[i] = INDEXER_NULL;
+
+	return 0;
+}
+
+static void h_exit(struct hash_table *ht)
+{
+	vfree(ht->buckets);
+}
+
+static struct entry *h_head(struct hash_table *ht, unsigned bucket)
+{
+	return to_entry(ht->es, ht->buckets[bucket]);
+}
+
+static struct entry *h_next(struct hash_table *ht, struct entry *e)
+{
+	return to_entry(ht->es, e->hash_next);
+}
+
+static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
+{
+	e->hash_next = ht->buckets[bucket];
+	ht->buckets[bucket] = to_index(ht->es, e);
+}
+
+static void h_insert(struct hash_table *ht, struct entry *e)
+{
+	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+	__h_insert(ht, h, e);
+}
+
+static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
+				struct entry **prev)
+{
+	struct entry *e;
+
+	*prev = NULL;
+	for (e = h_head(ht, h); e; e = h_next(ht, e)) {
+		if (e->oblock == oblock)
+			return e;
+
+		*prev = e;
+	}
+
+	return NULL;
+}
+
+static void __h_unlink(struct hash_table *ht, unsigned h,
+		       struct entry *e, struct entry *prev)
+{
+	if (prev)
+		prev->hash_next = e->hash_next;
+	else
+		ht->buckets[h] = e->hash_next;
+}
+
+/*
+ * Also moves each entry to the front of the bucket.
+ */
+static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
+{
+	struct entry *e, *prev;
+	unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
+
+	e = __h_lookup(ht, h, oblock, &prev);
+	if (e && prev) {
+		/*
+		 * Move to the front because this entry is likely
+		 * to be hit again.
+		 */
+		__h_unlink(ht, h, e, prev);
+		__h_insert(ht, h, e);
+	}
+
+	return e;
+}
+
+static void h_remove(struct hash_table *ht, struct entry *e)
+{
+	unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
+	struct entry *prev;
+
+	/*
+	 * The down side of using a singly linked list is we have to
+	 * iterate the bucket to remove an item.
+	 */
+	e = __h_lookup(ht, h, e->oblock, &prev);
+	if (e)
+		__h_unlink(ht, h, e, prev);
+}
+
+/*----------------------------------------------------------------*/
+
+struct entry_alloc {
+	struct entry_space *es;
+	unsigned begin;
+
+	unsigned nr_allocated;
+	struct ilist free;
+};
+
+static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
+			   unsigned begin, unsigned end)
+{
+	unsigned i;
+
+	ea->es = es;
+	ea->nr_allocated = 0u;
+	ea->begin = begin;
+
+	l_init(&ea->free);
+	for (i = begin; i != end; i++)
+		l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
+}
+
+static void init_entry(struct entry *e)
+{
+	/*
+	 * We can't memset because that would clear the hotspot and
+	 * sentinel bits which remain constant.
+	 */
+	e->hash_next = INDEXER_NULL;
+	e->next = INDEXER_NULL;
+	e->prev = INDEXER_NULL;
+	e->level = 0u;
+	e->allocated = true;
+}
+
+static struct entry *alloc_entry(struct entry_alloc *ea)
+{
+	struct entry *e;
+
+	if (l_empty(&ea->free))
+		return NULL;
+
+	e = l_pop_tail(ea->es, &ea->free);
+	init_entry(e);
+	ea->nr_allocated++;
+
+	return e;
+}
+
+/*
+ * This assumes the cblock hasn't already been allocated.
+ */
+static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
+{
+	struct entry *e = __get_entry(ea->es, ea->begin + i);
+
+	BUG_ON(e->allocated);
+
+	l_del(ea->es, &ea->free, e);
+	init_entry(e);
+	ea->nr_allocated++;
+
+	return e;
+}
+
+static void free_entry(struct entry_alloc *ea, struct entry *e)
+{
+	BUG_ON(!ea->nr_allocated);
+	BUG_ON(!e->allocated);
+
+	ea->nr_allocated--;
+	e->allocated = false;
+	l_add_tail(ea->es, &ea->free, e);
+}
+
+static bool allocator_empty(struct entry_alloc *ea)
+{
+	return l_empty(&ea->free);
+}
+
+static unsigned get_index(struct entry_alloc *ea, struct entry *e)
+{
+	return to_index(ea->es, e) - ea->begin;
+}
+
+static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
+{
+	return __get_entry(ea->es, ea->begin + index);
+}
+
+/*----------------------------------------------------------------*/
+
+#define NR_HOTSPOT_LEVELS 64u
+#define NR_CACHE_LEVELS 64u
+
+#define WRITEBACK_PERIOD (10 * HZ)
+#define DEMOTE_PERIOD (60 * HZ)
+
+#define HOTSPOT_UPDATE_PERIOD (HZ)
+#define CACHE_UPDATE_PERIOD (10u * HZ)
+
+struct smq_policy {
+	struct dm_cache_policy policy;
+
+	/* protects everything */
+	struct mutex lock;
+	dm_cblock_t cache_size;
+	sector_t cache_block_size;
+
+	sector_t hotspot_block_size;
+	unsigned nr_hotspot_blocks;
+	unsigned cache_blocks_per_hotspot_block;
+	unsigned hotspot_level_jump;
+
+	struct entry_space es;
+	struct entry_alloc writeback_sentinel_alloc;
+	struct entry_alloc demote_sentinel_alloc;
+	struct entry_alloc hotspot_alloc;
+	struct entry_alloc cache_alloc;
+
+	unsigned long *hotspot_hit_bits;
+	unsigned long *cache_hit_bits;
+
+	/*
+	 * We maintain three queues of entries.  The cache proper,
+	 * consisting of a clean and dirty queue, containing the currently
+	 * active mappings.  The hotspot queue uses a larger block size to
+	 * track blocks that are being hit frequently and potential
+	 * candidates for promotion to the cache.
+	 */
+	struct queue hotspot;
+	struct queue clean;
+	struct queue dirty;
+
+	struct stats hotspot_stats;
+	struct stats cache_stats;
+
+	/*
+	 * Keeps track of time, incremented by the core.  We use this to
+	 * avoid attributing multiple hits within the same tick.
+	 *
+	 * Access to tick_protected should be done with the spin lock held.
+	 * It's copied to tick at the start of the map function (within the
+	 * mutex).
+	 */
+	spinlock_t tick_lock;
+	unsigned tick_protected;
+	unsigned tick;
+
+	/*
+	 * The hash tables allow us to quickly find an entry by origin
+	 * block.
+	 */
+	struct hash_table table;
+	struct hash_table hotspot_table;
+
+	bool current_writeback_sentinels;
+	unsigned long next_writeback_period;
+
+	bool current_demote_sentinels;
+	unsigned long next_demote_period;
+
+	unsigned write_promote_level;
+	unsigned read_promote_level;
+
+	unsigned long next_hotspot_period;
+	unsigned long next_cache_period;
+};
+
+/*----------------------------------------------------------------*/
+
+static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
+{
+	return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
+}
+
+static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
+{
+	return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
+}
+
+static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
+{
+	return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
+}
+
+static void __update_writeback_sentinels(struct smq_policy *mq)
+{
+	unsigned level;
+	struct queue *q = &mq->dirty;
+	struct entry *sentinel;
+
+	for (level = 0; level < q->nr_levels; level++) {
+		sentinel = writeback_sentinel(mq, level);
+		q_del(q, sentinel);
+		q_push(q, sentinel);
+	}
+}
+
+static void __update_demote_sentinels(struct smq_policy *mq)
+{
+	unsigned level;
+	struct queue *q = &mq->clean;
+	struct entry *sentinel;
+
+	for (level = 0; level < q->nr_levels; level++) {
+		sentinel = demote_sentinel(mq, level);
+		q_del(q, sentinel);
+		q_push(q, sentinel);
+	}
+}
+
+static void update_sentinels(struct smq_policy *mq)
+{
+	if (time_after(jiffies, mq->next_writeback_period)) {
+		__update_writeback_sentinels(mq);
+		mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+		mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+	}
+
+	if (time_after(jiffies, mq->next_demote_period)) {
+		__update_demote_sentinels(mq);
+		mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+		mq->current_demote_sentinels = !mq->current_demote_sentinels;
+	}
+}
+
+static void __sentinels_init(struct smq_policy *mq)
+{
+	unsigned level;
+	struct entry *sentinel;
+
+	for (level = 0; level < NR_CACHE_LEVELS; level++) {
+		sentinel = writeback_sentinel(mq, level);
+		sentinel->level = level;
+		q_push(&mq->dirty, sentinel);
+
+		sentinel = demote_sentinel(mq, level);
+		sentinel->level = level;
+		q_push(&mq->clean, sentinel);
+	}
+}
+
+static void sentinels_init(struct smq_policy *mq)
+{
+	mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
+	mq->next_demote_period = jiffies + DEMOTE_PERIOD;
+
+	mq->current_writeback_sentinels = false;
+	mq->current_demote_sentinels = false;
+	__sentinels_init(mq);
+
+	mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
+	mq->current_demote_sentinels = !mq->current_demote_sentinels;
+	__sentinels_init(mq);
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * These methods tie together the dirty queue, clean queue and hash table.
+ */
+static void push_new(struct smq_policy *mq, struct entry *e)
+{
+	struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
+	h_insert(&mq->table, e);
+	q_push(q, e);
+}
+
+static void push(struct smq_policy *mq, struct entry *e)
+{
+	struct entry *sentinel;
+
+	h_insert(&mq->table, e);
+
+	/*
+	 * Punch this into the queue just in front of the sentinel, to
+	 * ensure it's cleaned straight away.
+	 */
+	if (e->dirty) {
+		sentinel = writeback_sentinel(mq, e->level);
+		q_push_before(&mq->dirty, sentinel, e);
+	} else {
+		sentinel = demote_sentinel(mq, e->level);
+		q_push_before(&mq->clean, sentinel, e);
+	}
+}
+
+/*
+ * Removes an entry from its queue and from the hash table.
+ */
+static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
+{
+	q_del(q, e);
+	h_remove(&mq->table, e);
+}
+
+static void del(struct smq_policy *mq, struct entry *e)
+{
+	__del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
+}
+
+static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
+{
+	struct entry *e = q_pop_old(q, max_level);
+	if (e)
+		h_remove(&mq->table, e);
+	return e;
+}
+
+static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
+{
+	return to_cblock(get_index(&mq->cache_alloc, e));
+}
+
+static void requeue(struct smq_policy *mq, struct entry *e)
+{
+	struct entry *sentinel;
+
+	if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
+		if (e->dirty) {
+			sentinel = writeback_sentinel(mq, e->level);
+			q_requeue_before(&mq->dirty, sentinel, e, 1u);
+		} else {
+			sentinel = demote_sentinel(mq, e->level);
+			q_requeue_before(&mq->clean, sentinel, e, 1u);
+		}
+	}
+}
+
+static unsigned default_promote_level(struct smq_policy *mq)
+{
+	/*
+	 * The promote level depends on the current performance of the
+	 * cache.
+	 *
+	 * If the cache is performing badly, then we can't afford
+	 * to promote much without causing performance to drop below that
+	 * of the origin device.
+	 *
+	 * If the cache is performing well, then we don't need to promote
+	 * much.  If it isn't broken, don't fix it.
+	 *
+	 * If the cache is middling then we promote more.
+	 *
+	 * This scheme reminds me of a graph of entropy vs probability of a
+	 * binary variable.
+	 */
+	static const unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
+
+	unsigned hits = mq->cache_stats.hits;
+	unsigned misses = mq->cache_stats.misses;
+	unsigned index = safe_div(hits << 4u, hits + misses);
+	return table[index];
+}
+
+static void update_promote_levels(struct smq_policy *mq)
+{
+	/*
+	 * If there are unused cache entries then we want to be really
+	 * eager to promote.
+	 */
+	unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
+		default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
+
+	/*
+	 * If the hotspot queue is performing badly then we have little
+	 * confidence that we know which blocks to promote.  So we cut down
+	 * the amount of promotions.
+	 */
+	switch (stats_assess(&mq->hotspot_stats)) {
+	case Q_POOR:
+		threshold_level /= 4u;
+		break;
+
+	case Q_FAIR:
+		threshold_level /= 2u;
+		break;
+
+	case Q_WELL:
+		break;
+	}
+
+	mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
+	mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
+}
+
+/*
+ * If the hotspot queue is performing badly, then we try and move entries
+ * around more quickly.
+ */
+static void update_level_jump(struct smq_policy *mq)
+{
+	switch (stats_assess(&mq->hotspot_stats)) {
+	case Q_POOR:
+		mq->hotspot_level_jump = 4u;
+		break;
+
+	case Q_FAIR:
+		mq->hotspot_level_jump = 2u;
+		break;
+
+	case Q_WELL:
+		mq->hotspot_level_jump = 1u;
+		break;
+	}
+}
+
+static void end_hotspot_period(struct smq_policy *mq)
+{
+	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+	update_promote_levels(mq);
+
+	if (time_after(jiffies, mq->next_hotspot_period)) {
+		update_level_jump(mq);
+		q_redistribute(&mq->hotspot);
+		stats_reset(&mq->hotspot_stats);
+		mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
+	}
+}
+
+static void end_cache_period(struct smq_policy *mq)
+{
+	if (time_after(jiffies, mq->next_cache_period)) {
+		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+
+		q_redistribute(&mq->dirty);
+		q_redistribute(&mq->clean);
+		stats_reset(&mq->cache_stats);
+
+		mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
+	}
+}
+
+static int demote_cblock(struct smq_policy *mq,
+			 struct policy_locker *locker,
+			 dm_oblock_t *oblock)
+{
+	struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
+	if (!demoted)
+		/*
+		 * We could get a block from mq->dirty, but that
+		 * would add extra latency to the triggering bio as it
+		 * waits for the writeback.  Better to not promote this
+		 * time and hope there's a clean block next time this block
+		 * is hit.
+		 */
+		return -ENOSPC;
+
+	if (locker->fn(locker, demoted->oblock))
+		/*
+		 * We couldn't lock this block.
+		 */
+		return -EBUSY;
+
+	del(mq, demoted);
+	*oblock = demoted->oblock;
+	free_entry(&mq->cache_alloc, demoted);
+
+	return 0;
+}
+
+enum promote_result {
+	PROMOTE_NOT,
+	PROMOTE_TEMPORARY,
+	PROMOTE_PERMANENT
+};
+
+/*
+ * Converts a boolean into a promote result.
+ */
+static enum promote_result maybe_promote(bool promote)
+{
+	return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
+}
+
+static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
+					  bool fast_promote)
+{
+	if (bio_data_dir(bio) == WRITE) {
+		if (!allocator_empty(&mq->cache_alloc) && fast_promote)
+			return PROMOTE_TEMPORARY;
+
+		else
+			return maybe_promote(hs_e->level >= mq->write_promote_level);
+	} else
+		return maybe_promote(hs_e->level >= mq->read_promote_level);
+}
+
+static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
+			    struct policy_locker *locker,
+			    struct policy_result *result, enum promote_result pr)
+{
+	int r;
+	struct entry *e;
+
+	if (allocator_empty(&mq->cache_alloc)) {
+		result->op = POLICY_REPLACE;
+		r = demote_cblock(mq, locker, &result->old_oblock);
+		if (r) {
+			result->op = POLICY_MISS;
+			return;
+		}
+
+	} else
+		result->op = POLICY_NEW;
+
+	e = alloc_entry(&mq->cache_alloc);
+	BUG_ON(!e);
+	e->oblock = oblock;
+
+	if (pr == PROMOTE_TEMPORARY)
+		push(mq, e);
+	else
+		push_new(mq, e);
+
+	result->cblock = infer_cblock(mq, e);
+}
+
+static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
+{
+	sector_t r = from_oblock(b);
+	(void) sector_div(r, mq->cache_blocks_per_hotspot_block);
+	return to_oblock(r);
+}
+
+static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
+{
+	unsigned hi;
+	dm_oblock_t hb = to_hblock(mq, b);
+	struct entry *e = h_lookup(&mq->hotspot_table, hb);
+
+	if (e) {
+		stats_level_accessed(&mq->hotspot_stats, e->level);
+
+		hi = get_index(&mq->hotspot_alloc, e);
+		q_requeue(&mq->hotspot, e,
+			  test_and_set_bit(hi, mq->hotspot_hit_bits) ?
+			  0u : mq->hotspot_level_jump);
+
+	} else {
+		stats_miss(&mq->hotspot_stats);
+
+		e = alloc_entry(&mq->hotspot_alloc);
+		if (!e) {
+			e = q_pop(&mq->hotspot);
+			if (e) {
+				h_remove(&mq->hotspot_table, e);
+				hi = get_index(&mq->hotspot_alloc, e);
+				clear_bit(hi, mq->hotspot_hit_bits);
+			}
+
+		}
+
+		if (e) {
+			e->oblock = hb;
+			q_push(&mq->hotspot, e);
+			h_insert(&mq->hotspot_table, e);
+		}
+	}
+
+	return e;
+}
+
+/*
+ * Updates the hotspot queue, then looks the oblock up in the hash table:
+ * a hit is requeued; a miss is reported as such or promoted into the cache.
+ */
+static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
+	       bool can_migrate, bool fast_promote,
+	       struct policy_locker *locker, struct policy_result *result)
+{
+	struct entry *e, *hs_e;
+	enum promote_result pr;
+
+	hs_e = update_hotspot_queue(mq, oblock, bio);
+
+	e = h_lookup(&mq->table, oblock);
+	if (e) {
+		stats_level_accessed(&mq->cache_stats, e->level);
+
+		requeue(mq, e);
+		result->op = POLICY_HIT;
+		result->cblock = infer_cblock(mq, e);
+
+	} else {
+		stats_miss(&mq->cache_stats);
+
+		pr = should_promote(mq, hs_e, bio, fast_promote);
+		if (pr == PROMOTE_NOT)
+			result->op = POLICY_MISS;
+
+		else {
+			if (!can_migrate) {
+				result->op = POLICY_MISS;
+				return -EWOULDBLOCK;
+			}
+
+			insert_in_cache(mq, oblock, locker, result, pr);
+		}
+	}
+
+	return 0;
+}
+
+/*----------------------------------------------------------------*/
+
+/*
+ * Public interface, via the policy struct.  See dm-cache-policy.h for a
+ * description of these.
+ */
+
+static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
+{
+	return container_of(p, struct smq_policy, policy);
+}
+
+static void smq_destroy(struct dm_cache_policy *p)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	h_exit(&mq->hotspot_table);
+	h_exit(&mq->table);
+	free_bitset(mq->hotspot_hit_bits);
+	free_bitset(mq->cache_hit_bits);
+	space_exit(&mq->es);
+	kfree(mq);
+}
+
+static void copy_tick(struct smq_policy *mq)
+{
+	unsigned long flags, tick;
+
+	spin_lock_irqsave(&mq->tick_lock, flags);
+	tick = mq->tick_protected;
+	if (tick != mq->tick) {
+		update_sentinels(mq);
+		end_hotspot_period(mq);
+		end_cache_period(mq);
+		mq->tick = tick;
+	}
+	spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+static bool maybe_lock(struct smq_policy *mq, bool can_block)
+{
+	if (can_block) {
+		mutex_lock(&mq->lock);
+		return true;
+	} else
+		return mutex_trylock(&mq->lock);
+}
+
+static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+		   bool can_block, bool can_migrate, bool fast_promote,
+		   struct bio *bio, struct policy_locker *locker,
+		   struct policy_result *result)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	result->op = POLICY_MISS;
+
+	if (!maybe_lock(mq, can_block))
+		return -EWOULDBLOCK;
+
+	copy_tick(mq);
+	r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+	struct entry *e;
+
+	if (!mutex_trylock(&mq->lock))
+		return -EWOULDBLOCK;
+
+	e = h_lookup(&mq->table, oblock);
+	if (e) {
+		*cblock = infer_cblock(mq, e);
+		r = 0;
+	} else
+		r = -ENOENT;
+
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
+{
+	struct entry *e;
+
+	e = h_lookup(&mq->table, oblock);
+	BUG_ON(!e);
+
+	del(mq, e);
+	e->dirty = set;
+	push(mq, e);
+}
+
+static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__smq_set_clear_dirty(mq, oblock, true);
+	mutex_unlock(&mq->lock);
+}
+
+static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__smq_set_clear_dirty(mq, oblock, false);
+	mutex_unlock(&mq->lock);
+}
+
+static int smq_load_mapping(struct dm_cache_policy *p,
+			    dm_oblock_t oblock, dm_cblock_t cblock,
+			    uint32_t hint, bool hint_valid)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+	struct entry *e;
+
+	e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
+	e->oblock = oblock;
+	e->dirty = false;	/* this gets corrected in a minute */
+	e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : 1;
+	push(mq, e);
+
+	return 0;
+}
+
+static int smq_save_hints(struct smq_policy *mq, struct queue *q,
+			  policy_walk_fn fn, void *context)
+{
+	int r;
+	unsigned level;
+	struct entry *e;
+
+	for (level = 0; level < q->nr_levels; level++)
+		for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
+			if (!e->sentinel) {
+				r = fn(context, infer_cblock(mq, e),
+				       e->oblock, e->level);
+				if (r)
+					return r;
+			}
+		}
+
+	return 0;
+}
+
+static int smq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
+			     void *context)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+	int r = 0;
+
+	mutex_lock(&mq->lock);
+
+	r = smq_save_hints(mq, &mq->clean, fn, context);
+	if (!r)
+		r = smq_save_hints(mq, &mq->dirty, fn, context);
+
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
+{
+	struct entry *e;
+
+	e = h_lookup(&mq->table, oblock);
+	BUG_ON(!e);
+
+	del(mq, e);
+	free_entry(&mq->cache_alloc, e);
+}
+
+static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__remove_mapping(mq, oblock);
+	mutex_unlock(&mq->lock);
+}
+
+static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
+{
+	struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
+
+	if (!e || !e->allocated)
+		return -ENODATA;
+
+	del(mq, e);
+	free_entry(&mq->cache_alloc, e);
+
+	return 0;
+}
+
+static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = __remove_cblock(mq, cblock);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+
+#define CLEAN_TARGET_CRITICAL 5u /* percent */
+
+static bool clean_target_met(struct smq_policy *mq, bool critical)
+{
+	if (critical) {
+		/*
+		 * Cache entries may not be populated.  So we cannot rely on the
+		 * size of the clean queue.
+		 */
+		unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
+		unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
+
+		return nr_clean >= target;
+	} else
+		return !q_size(&mq->dirty);
+}
+
+static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
+				dm_cblock_t *cblock, bool critical_only)
+{
+	struct entry *e = NULL;
+	bool target_met = clean_target_met(mq, critical_only);
+
+	if (critical_only)
+		/*
+		 * Always try and keep the bottom level clean.
+		 */
+		e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
+
+	else
+		e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
+
+	if (!e)
+		return -ENODATA;
+
+	*oblock = e->oblock;
+	*cblock = infer_cblock(mq, e);
+	e->dirty = false;
+	push_new(mq, e);
+
+	return 0;
+}
+
+static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
+			      dm_cblock_t *cblock, bool critical_only)
+{
+	int r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = __smq_writeback_work(mq, oblock, cblock, critical_only);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void __force_mapping(struct smq_policy *mq,
+			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+	struct entry *e = h_lookup(&mq->table, current_oblock);
+
+	if (e) {
+		del(mq, e);
+		e->oblock = new_oblock;
+		e->dirty = true;
+		push(mq, e);
+	}
+}
+
+static void smq_force_mapping(struct dm_cache_policy *p,
+			      dm_oblock_t current_oblock, dm_oblock_t new_oblock)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	__force_mapping(mq, current_oblock, new_oblock);
+	mutex_unlock(&mq->lock);
+}
+
+static dm_cblock_t smq_residency(struct dm_cache_policy *p)
+{
+	dm_cblock_t r;
+	struct smq_policy *mq = to_smq_policy(p);
+
+	mutex_lock(&mq->lock);
+	r = to_cblock(mq->cache_alloc.nr_allocated);
+	mutex_unlock(&mq->lock);
+
+	return r;
+}
+
+static void smq_tick(struct dm_cache_policy *p)
+{
+	struct smq_policy *mq = to_smq_policy(p);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mq->tick_lock, flags);
+	mq->tick_protected++;
+	spin_unlock_irqrestore(&mq->tick_lock, flags);
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct smq_policy *mq)
+{
+	mq->policy.destroy = smq_destroy;
+	mq->policy.map = smq_map;
+	mq->policy.lookup = smq_lookup;
+	mq->policy.set_dirty = smq_set_dirty;
+	mq->policy.clear_dirty = smq_clear_dirty;
+	mq->policy.load_mapping = smq_load_mapping;
+	mq->policy.walk_mappings = smq_walk_mappings;
+	mq->policy.remove_mapping = smq_remove_mapping;
+	mq->policy.remove_cblock = smq_remove_cblock;
+	mq->policy.writeback_work = smq_writeback_work;
+	mq->policy.force_mapping = smq_force_mapping;
+	mq->policy.residency = smq_residency;
+	mq->policy.tick = smq_tick;
+}
+
+static bool too_many_hotspot_blocks(sector_t origin_size,
+				    sector_t hotspot_block_size,
+				    unsigned nr_hotspot_blocks)
+{
+	return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
+}
+
+static void calc_hotspot_params(sector_t origin_size,
+				sector_t cache_block_size,
+				unsigned nr_cache_blocks,
+				sector_t *hotspot_block_size,
+				unsigned *nr_hotspot_blocks)
+{
+	*hotspot_block_size = cache_block_size * 16u;
+	*nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
+
+	while ((*hotspot_block_size > cache_block_size) &&
+	       too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
+		*hotspot_block_size /= 2u;
+}
+
+static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
+					  sector_t origin_size,
+					  sector_t cache_block_size)
+{
+	unsigned i;
+	unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
+	unsigned total_sentinels = 2u * nr_sentinels_per_queue;
+	struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
+
+	if (!mq)
+		return NULL;
+
+	init_policy_functions(mq);
+	mq->cache_size = cache_size;
+	mq->cache_block_size = cache_block_size;
+
+	calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
+			    &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
+
+	mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
+	mq->hotspot_level_jump = 1u;
+	if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
+		DMERR("couldn't initialize entry space");
+		goto bad_pool_init;
+	}
+
+	init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
+	for (i = 0; i < nr_sentinels_per_queue; i++)
+		get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
+
+	init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
+	for (i = 0; i < nr_sentinels_per_queue; i++)
+		get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
+
+	init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
+		       total_sentinels + mq->nr_hotspot_blocks);
+
+	init_allocator(&mq->cache_alloc, &mq->es,
+		       total_sentinels + mq->nr_hotspot_blocks,
+		       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
+
+	mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
+	if (!mq->hotspot_hit_bits) {
+		DMERR("couldn't allocate hotspot hit bitset");
+		goto bad_hotspot_hit_bits;
+	}
+	clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
+	/* A non-empty cache must get its hit bitset; an empty one has none. */
+	if (from_cblock(cache_size)) {
+		mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
+		if (!mq->cache_hit_bits) {
+			DMERR("couldn't allocate cache hit bitset");
+			goto bad_cache_hit_bits;
+		}
+		clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
+	} else
+		mq->cache_hit_bits = NULL;
+
+	mq->tick_protected = 0;
+	mq->tick = 0;
+	mutex_init(&mq->lock);
+	spin_lock_init(&mq->tick_lock);
+
+	q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
+	mq->hotspot.nr_top_levels = 8;
+	mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
+					   from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
+
+	q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
+	q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
+
+	stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
+	stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
+
+	if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
+		goto bad_alloc_table;
+
+	if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
+		goto bad_alloc_hotspot_table;
+
+	sentinels_init(mq);
+	mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
+
+	mq->next_hotspot_period = jiffies;
+	mq->next_cache_period = jiffies;
+
+	return &mq->policy;
+
+bad_alloc_hotspot_table:
+	h_exit(&mq->table);
+bad_alloc_table:
+	free_bitset(mq->cache_hit_bits);
+bad_cache_hit_bits:
+	free_bitset(mq->hotspot_hit_bits);
+bad_hotspot_hit_bits:
+	space_exit(&mq->es);
+bad_pool_init:
+	kfree(mq);
+
+	return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_policy_type smq_policy_type = {
+	.name = "smq",
+	.version = {1, 0, 0},
+	.hint_size = 4,
+	.owner = THIS_MODULE,
+	.create = smq_create
+};
+
+/*
+ * Register the "smq" policy type; any registration error is returned as is.
+ */
+static int __init smq_init(void)
+{
+	int r = dm_cache_policy_register(&smq_policy_type);
+
+	if (r)
+		DMERR("register failed %d", r);
+
+	return r;
+}
+
+static void __exit smq_exit(void)
+{
+	dm_cache_policy_unregister(&smq_policy_type);
+}
+
+module_init(smq_init);
+module_exit(smq_exit);
+
+MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("smq cache policy");
-- 
cgit v1.2.1


From 88bf5184fa5861e766e39fd34fc6d21557ac7be8 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 27 May 2015 15:39:45 +0100
Subject: dm cache: wake the worker thread every time we free a migration
 object

When the cache is idle, writeback work was only being issued every
second.  With this change outstanding writebacks are streamed
constantly.  This offers a writeback performance improvement.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 7829d947ef01..6d36ed3c46a0 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -418,10 +418,13 @@ static struct dm_cache_migration *alloc_migration(struct cache *cache)
 
 static void free_migration(struct dm_cache_migration *mg)
 {
-	if (atomic_dec_and_test(&mg->cache->nr_allocated_migrations))
-		wake_up(&mg->cache->migration_wait);
+	struct cache *cache = mg->cache;
+
+	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
+		wake_up(&cache->migration_wait);
 
-	mempool_free(mg, mg->cache->migration_pool);
+	mempool_free(mg, cache->migration_pool);
+	wake_worker(cache);
 }
 
 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
-- 
cgit v1.2.1


From 028ae9f76f2935e8cf9974bff9a4587e3a995ff3 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 22 Apr 2015 16:42:35 -0400
Subject: dm cache: add fail io mode and needs_check flag

If a cache metadata operation fails (e.g. transaction commit) the
cache's metadata device will abort the current transaction, set a new
needs_check flag, and the cache will transition to "read-only" mode.  If
aborting the transaction or setting the needs_check flag fails the cache
will transition to "fail-io" mode.

Once needs_check is set the cache device will not be allowed to
activate.  Activation requires write access to metadata.  Future work is
needed to add proper support for running the cache in read-only mode.

Once in fail-io mode the cache will report a status of "Fail".

Also, add commit() wrapper that will disallow commits if in read_only or
fail mode.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-metadata.c        | 133 ++++++++++++++++++----
 drivers/md/dm-cache-metadata.h        |  10 ++
 drivers/md/dm-cache-policy-internal.h |  10 +-
 drivers/md/dm-cache-policy-mq.c       |   8 +-
 drivers/md/dm-cache-policy.h          |   4 +-
 drivers/md/dm-cache-target.c          | 204 +++++++++++++++++++++++++++++-----
 6 files changed, 313 insertions(+), 56 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c
index c1c010498a21..20cc36b01b77 100644
--- a/drivers/md/dm-cache-metadata.c
+++ b/drivers/md/dm-cache-metadata.c
@@ -39,6 +39,8 @@
 enum superblock_flag_bits {
 	/* for spotting crashes that would invalidate the dirty bitset */
 	CLEAN_SHUTDOWN,
+	/* metadata must be checked using the tools */
+	NEEDS_CHECK,
 };
 
 /*
@@ -107,6 +109,7 @@ struct dm_cache_metadata {
 	struct dm_disk_bitset discard_info;
 
 	struct rw_semaphore root_lock;
+	unsigned long flags;
 	dm_block_t root;
 	dm_block_t hint_root;
 	dm_block_t discard_root;
@@ -129,6 +132,14 @@ struct dm_cache_metadata {
 	 * buffer before the superblock is locked and updated.
 	 */
 	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
+
+	/*
+	 * Set if a transaction has to be aborted but the attempt to roll
+	 * back to the previous (good) transaction failed.  The only
+	 * metadata operation permissible in this state is the closing of
+	 * the device.
+	 */
+	bool fail_io:1;
 };
 
 /*-------------------------------------------------------------------
@@ -527,6 +538,7 @@ static unsigned long clear_clean_shutdown(unsigned long flags)
 static void read_superblock_fields(struct dm_cache_metadata *cmd,
 				   struct cache_disk_superblock *disk_super)
 {
+	cmd->flags = le32_to_cpu(disk_super->flags);
 	cmd->root = le64_to_cpu(disk_super->mapping_root);
 	cmd->hint_root = le64_to_cpu(disk_super->hint_root);
 	cmd->discard_root = le64_to_cpu(disk_super->discard_root);
@@ -625,6 +637,7 @@ static int __commit_transaction(struct dm_cache_metadata *cmd,
 	if (mutator)
 		update_flags(disk_super, mutator);
 
+	disk_super->flags = cpu_to_le32(cmd->flags);
 	disk_super->mapping_root = cpu_to_le64(cmd->root);
 	disk_super->hint_root = cpu_to_le64(cmd->hint_root);
 	disk_super->discard_root = cpu_to_le64(cmd->discard_root);
@@ -693,6 +706,7 @@ static struct dm_cache_metadata *metadata_open(struct block_device *bdev,
 	cmd->cache_blocks = 0;
 	cmd->policy_hint_size = policy_hint_size;
 	cmd->changed = true;
+	cmd->fail_io = false;
 
 	r = __create_persistent_data_objects(cmd, may_format_device);
 	if (r) {
@@ -796,7 +810,8 @@ void dm_cache_metadata_close(struct dm_cache_metadata *cmd)
 		list_del(&cmd->list);
 		mutex_unlock(&table_lock);
 
-		__destroy_persistent_data_objects(cmd);
+		if (!cmd->fail_io)
+			__destroy_persistent_data_objects(cmd);
 		kfree(cmd);
 	}
 }
@@ -848,13 +863,26 @@ static int blocks_are_unmapped_or_clean(struct dm_cache_metadata *cmd,
 	return 0;
 }
 
+#define WRITE_LOCK(cmd) \
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+		return -EINVAL; \
+	down_write(&cmd->root_lock)
+
+#define WRITE_LOCK_VOID(cmd) \
+	if (cmd->fail_io || dm_bm_is_read_only(cmd->bm)) \
+		return; \
+	down_write(&cmd->root_lock)
+
+#define WRITE_UNLOCK(cmd) \
+	up_write(&cmd->root_lock)
+
 int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
 {
 	int r;
 	bool clean;
 	__le64 null_mapping = pack_value(0, 0);
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	__dm_bless_for_disk(&null_mapping);
 
 	if (from_cblock(new_cache_size) < from_cblock(cmd->cache_blocks)) {
@@ -880,7 +908,7 @@ int dm_cache_resize(struct dm_cache_metadata *cmd, dm_cblock_t new_cache_size)
 	cmd->changed = true;
 
 out:
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -891,7 +919,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = dm_bitset_resize(&cmd->discard_info,
 			     cmd->discard_root,
 			     from_dblock(cmd->discard_nr_blocks),
@@ -903,7 +931,7 @@ int dm_cache_discard_bitset_resize(struct dm_cache_metadata *cmd,
 	}
 
 	cmd->changed = true;
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -946,9 +974,9 @@ int dm_cache_set_discard(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __discard(cmd, dblock, discard);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1020,9 +1048,9 @@ int dm_cache_remove_mapping(struct dm_cache_metadata *cmd, dm_cblock_t cblock)
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __remove(cmd, cblock);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1048,9 +1076,9 @@ int dm_cache_insert_mapping(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __insert(cmd, cblock, oblock);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1234,9 +1262,9 @@ int dm_cache_set_dirty(struct dm_cache_metadata *cmd,
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __dirty(cmd, cblock, dirty);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1252,9 +1280,9 @@ void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
 void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
 				 struct dm_cache_statistics *stats)
 {
-	down_write(&cmd->root_lock);
+	WRITE_LOCK_VOID(cmd);
 	cmd->stats = *stats;
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 }
 
 int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
@@ -1263,7 +1291,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
 	flags_mutator mutator = (clean_shutdown ? set_clean_shutdown :
 				 clear_clean_shutdown);
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = __commit_transaction(cmd, mutator);
 	if (r)
 		goto out;
@@ -1271,7 +1299,7 @@ int dm_cache_commit(struct dm_cache_metadata *cmd, bool clean_shutdown)
 	r = __begin_transaction(cmd);
 
 out:
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 	return r;
 }
 
@@ -1376,9 +1404,9 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
 {
 	int r;
 
-	down_write(&cmd->root_lock);
+	WRITE_LOCK(cmd);
 	r = write_hints(cmd, policy);
-	up_write(&cmd->root_lock);
+	WRITE_UNLOCK(cmd);
 
 	return r;
 }
@@ -1387,3 +1415,70 @@ int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result)
 {
 	return blocks_are_unmapped_or_clean(cmd, 0, cmd->cache_blocks, result);
 }
+
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd)
+{
+	WRITE_LOCK_VOID(cmd);
+	dm_bm_set_read_only(cmd->bm);
+	WRITE_UNLOCK(cmd);
+}
+
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd)
+{
+	WRITE_LOCK_VOID(cmd);
+	dm_bm_set_read_write(cmd->bm);
+	WRITE_UNLOCK(cmd);
+}
+
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd)
+{
+	int r;
+	struct dm_block *sblock;
+	struct cache_disk_superblock *disk_super;
+
+	/*
+	 * We ignore fail_io for this function.
+	 */
+	down_write(&cmd->root_lock);
+	set_bit(NEEDS_CHECK, &cmd->flags);
+
+	r = superblock_lock(cmd, &sblock);
+	if (r) {
+		DMERR("couldn't read superblock");
+		goto out;
+	}
+
+	disk_super = dm_block_data(sblock);
+	disk_super->flags = cpu_to_le32(cmd->flags);
+
+	dm_bm_unlock(sblock);
+
+out:
+	up_write(&cmd->root_lock);
+	return r;
+}
+
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd)
+{
+	bool needs_check;
+
+	down_read(&cmd->root_lock);
+	needs_check = !!test_bit(NEEDS_CHECK, &cmd->flags);
+	up_read(&cmd->root_lock);
+
+	return needs_check;
+}
+
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd)
+{
+	int r;
+
+	WRITE_LOCK(cmd);
+	__destroy_persistent_data_objects(cmd);
+	r = __create_persistent_data_objects(cmd, false);
+	if (r)
+		cmd->fail_io = true;
+	WRITE_UNLOCK(cmd);
+
+	return r;
+}
diff --git a/drivers/md/dm-cache-metadata.h b/drivers/md/dm-cache-metadata.h
index 4ecc403be283..2ffee21f318d 100644
--- a/drivers/md/dm-cache-metadata.h
+++ b/drivers/md/dm-cache-metadata.h
@@ -102,6 +102,10 @@ struct dm_cache_statistics {
 
 void dm_cache_metadata_get_stats(struct dm_cache_metadata *cmd,
 				 struct dm_cache_statistics *stats);
+
+/*
+ * 'void' because it's no big deal if it fails.
+ */
 void dm_cache_metadata_set_stats(struct dm_cache_metadata *cmd,
 				 struct dm_cache_statistics *stats);
 
@@ -133,6 +137,12 @@ int dm_cache_write_hints(struct dm_cache_metadata *cmd, struct dm_cache_policy *
  */
 int dm_cache_metadata_all_clean(struct dm_cache_metadata *cmd, bool *result);
 
+bool dm_cache_metadata_needs_check(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_set_needs_check(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_only(struct dm_cache_metadata *cmd);
+void dm_cache_metadata_set_read_write(struct dm_cache_metadata *cmd);
+int dm_cache_metadata_abort(struct dm_cache_metadata *cmd);
+
 /*----------------------------------------------------------------*/
 
 #endif /* DM_CACHE_METADATA_H */
diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index 9dc05a52369e..ccbe852d5362 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -89,13 +89,15 @@ static inline void policy_tick(struct dm_cache_policy *p)
 		return p->tick(p);
 }
 
-static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result,
+					    unsigned maxlen, ssize_t *sz_ptr)
 {
-	ssize_t sz = 0;
+	ssize_t sz = *sz_ptr;
 	if (p->emit_config_values)
-		return p->emit_config_values(p, result, maxlen);
+		return p->emit_config_values(p, result, maxlen, sz_ptr);
 
-	DMEMIT("0");
+	DMEMIT("0 ");
+	*sz_ptr = sz;
 	return 0;
 }
 
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 7cbae125879c..084eec653321 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1323,22 +1323,24 @@ static int mq_set_config_value(struct dm_cache_policy *p,
 	return 0;
 }
 
-static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
+static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
+				 unsigned maxlen, ssize_t *sz_ptr)
 {
-	ssize_t sz = 0;
+	ssize_t sz = *sz_ptr;
 	struct mq_policy *mq = to_mq_policy(p);
 
 	DMEMIT("10 random_threshold %u "
 	       "sequential_threshold %u "
 	       "discard_promote_adjustment %u "
 	       "read_promote_adjustment %u "
-	       "write_promote_adjustment %u",
+	       "write_promote_adjustment %u ",
 	       mq->tracker.thresholds[PATTERN_RANDOM],
 	       mq->tracker.thresholds[PATTERN_SEQUENTIAL],
 	       mq->discard_promote_adjustment,
 	       mq->read_promote_adjustment,
 	       mq->write_promote_adjustment);
 
+	*sz_ptr = sz;
 	return 0;
 }
 
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 6106ca3aa350..74709129d856 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -208,8 +208,8 @@ struct dm_cache_policy {
 	/*
 	 * Configuration.
 	 */
-	int (*emit_config_values)(struct dm_cache_policy *p,
-				  char *result, unsigned maxlen);
+	int (*emit_config_values)(struct dm_cache_policy *p, char *result,
+				  unsigned maxlen, ssize_t *sz_ptr);
 	int (*set_config_value)(struct dm_cache_policy *p,
 				const char *key, const char *value);
 
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 6d36ed3c46a0..dae0321ebfa9 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -150,12 +150,10 @@ static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 #define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
 #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 
-/*
- * FIXME: the cache is read/write for the time being.
- */
 enum cache_metadata_mode {
 	CM_WRITE,		/* metadata may be changed */
 	CM_READ_ONLY,		/* metadata may not be changed */
+	CM_FAIL
 };
 
 enum cache_io_mode {
@@ -385,6 +383,8 @@ struct prealloc {
 	struct dm_bio_prison_cell *cell2;
 };
 
+static enum cache_metadata_mode get_cache_mode(struct cache *cache);
+
 static void wake_worker(struct cache *cache)
 {
 	queue_work(cache->wq, &cache->worker);
@@ -699,6 +699,9 @@ static void save_stats(struct cache *cache)
 {
 	struct dm_cache_statistics stats;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return;
+
 	stats.read_hits = atomic_read(&cache->stats.read_hit);
 	stats.read_misses = atomic_read(&cache->stats.read_miss);
 	stats.write_hits = atomic_read(&cache->stats.write_hit);
@@ -957,6 +960,84 @@ static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
 	remap_to_origin_clear_discard(pb->cache, bio, oblock);
 }
 
+/*----------------------------------------------------------------
+ * Failure modes
+ *--------------------------------------------------------------*/
+static enum cache_metadata_mode get_cache_mode(struct cache *cache)
+{
+	return cache->features.mode;
+}
+
+static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
+{
+	const char *descs[] = {
+		"write",
+		"read-only",
+		"fail"
+	};
+
+	dm_table_event(cache->ti->table);
+	DMINFO("switching cache to %s mode", descs[(int)mode]);
+}
+
+static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
+{
+	bool needs_check = dm_cache_metadata_needs_check(cache->cmd);
+	enum cache_metadata_mode old_mode = get_cache_mode(cache);
+
+	if (new_mode == CM_WRITE && needs_check) {
+		DMERR("unable to switch cache to write mode until repaired.");
+		if (old_mode != new_mode)
+			new_mode = old_mode;
+		else
+			new_mode = CM_READ_ONLY;
+	}
+
+	/* Never move out of fail mode */
+	if (old_mode == CM_FAIL)
+		new_mode = CM_FAIL;
+
+	switch (new_mode) {
+	case CM_FAIL:
+	case CM_READ_ONLY:
+		dm_cache_metadata_set_read_only(cache->cmd);
+		break;
+
+	case CM_WRITE:
+		dm_cache_metadata_set_read_write(cache->cmd);
+		break;
+	}
+
+	cache->features.mode = new_mode;
+
+	if (new_mode != old_mode)
+		notify_mode_switch(cache, new_mode);
+}
+
+static void abort_transaction(struct cache *cache)
+{
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return;
+
+	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
+		DMERR("failed to set 'needs_check' flag in metadata");
+		set_cache_mode(cache, CM_FAIL);
+	}
+
+	DMERR_LIMIT("aborting current metadata transaction");
+	if (dm_cache_metadata_abort(cache->cmd)) {
+		DMERR("failed to abort metadata transaction");
+		set_cache_mode(cache, CM_FAIL);
+	}
+}
+
+static void metadata_operation_failed(struct cache *cache, const char *op, int r)
+{
+	DMERR_LIMIT("metadata operation '%s' failed: error = %d", op, r);
+	abort_transaction(cache);
+	set_cache_mode(cache, CM_READ_ONLY);
+}
+
 /*----------------------------------------------------------------
  * Migration processing
  *
@@ -1063,6 +1144,7 @@ static void migration_failure(struct dm_cache_migration *mg)
 
 static void migration_success_pre_commit(struct dm_cache_migration *mg)
 {
+	int r;
 	unsigned long flags;
 	struct cache *cache = mg->cache;
 
@@ -1073,8 +1155,10 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 		return;
 
 	} else if (mg->demote) {
-		if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
+		r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
+		if (r) {
 			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
 			policy_force_mapping(cache->policy, mg->new_oblock,
 					     mg->old_oblock);
 			if (mg->promote)
@@ -1083,8 +1167,10 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 			return;
 		}
 	} else {
-		if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
+		r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
+		if (r) {
 			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
 			policy_remove_mapping(cache->policy, mg->new_oblock);
 			free_io_migration(mg);
 			return;
@@ -1812,15 +1898,32 @@ static int need_commit_due_to_time(struct cache *cache)
 	       jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
 }
 
+/*
+ * A non-zero return indicates read_only or fail_io mode.
+ */
+static int commit(struct cache *cache, bool clean_shutdown)
+{
+	int r;
+
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
+	atomic_inc(&cache->stats.commit_count);
+	r = dm_cache_commit(cache->cmd, clean_shutdown);
+	if (r)
+		metadata_operation_failed(cache, "dm_cache_commit", r);
+
+	return r;
+}
+
 static int commit_if_needed(struct cache *cache)
 {
 	int r = 0;
 
 	if ((cache->commit_requested || need_commit_due_to_time(cache)) &&
 	    dm_cache_changed_this_transaction(cache->cmd)) {
-		atomic_inc(&cache->stats.commit_count);
+		r = commit(cache, false);
 		cache->commit_requested = false;
-		r = dm_cache_commit(cache->cmd, false);
 		cache->last_commit_jiffies = jiffies;
 	}
 
@@ -1988,8 +2091,10 @@ static void process_invalidation_request(struct cache *cache, struct invalidatio
 		r = policy_remove_cblock(cache->policy, to_cblock(begin));
 		if (!r) {
 			r = dm_cache_remove_mapping(cache->cmd, to_cblock(begin));
-			if (r)
+			if (r) {
+				metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
 				break;
+			}
 
 		} else if (r == -ENODATA) {
 			/* harmless, already unmapped */
@@ -2133,12 +2238,6 @@ static void do_worker(struct work_struct *ws)
 		if (commit_if_needed(cache)) {
 			process_deferred_flush_bios(cache, false);
 			process_migrations(cache, &cache->need_commit_migrations, migration_failure);
-
-			/*
-			 * FIXME: rollback metadata or just go into a
-			 * failure mode and error everything
-			 */
-
 		} else {
 			process_deferred_flush_bios(cache, true);
 			process_migrations(cache, &cache->need_commit_migrations,
@@ -2711,6 +2810,12 @@ static int cache_create(struct cache_args *ca, struct cache **result)
 		goto bad;
 	}
 	cache->cmd = cmd;
+	set_cache_mode(cache, CM_WRITE);
+	if (get_cache_mode(cache) != CM_WRITE) {
+		*error = "Unable to get write access to metadata, please check/repair metadata.";
+		r = -EINVAL;
+		goto bad;
+	}
 
 	if (passthrough_mode(&cache->features)) {
 		bool all_clean;
@@ -3043,11 +3148,16 @@ static int write_dirty_bitset(struct cache *cache)
 {
 	unsigned i, r;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
 	for (i = 0; i < from_cblock(cache->cache_size); i++) {
 		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
 				       is_dirty(cache, to_cblock(i)));
-		if (r)
+		if (r) {
+			metadata_operation_failed(cache, "dm_cache_set_dirty", r);
 			return r;
+		}
 	}
 
 	return 0;
@@ -3057,18 +3167,40 @@ static int write_discard_bitset(struct cache *cache)
 {
 	unsigned i, r;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
 	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
 					   cache->discard_nr_blocks);
 	if (r) {
 		DMERR("could not resize on-disk discard bitset");
+		metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
 		return r;
 	}
 
 	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
 		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
 					 is_discarded(cache, to_dblock(i)));
-		if (r)
+		if (r) {
+			metadata_operation_failed(cache, "dm_cache_set_discard", r);
 			return r;
+		}
+	}
+
+	return 0;
+}
+
+static int write_hints(struct cache *cache)
+{
+	int r;
+
+	if (get_cache_mode(cache) >= CM_READ_ONLY)
+		return -EINVAL;
+
+	r = dm_cache_write_hints(cache->cmd, cache->policy);
+	if (r) {
+		metadata_operation_failed(cache, "dm_cache_write_hints", r);
+		return r;
 	}
 
 	return 0;
@@ -3091,7 +3223,7 @@ static bool sync_metadata(struct cache *cache)
 
 	save_stats(cache);
 
-	r3 = dm_cache_write_hints(cache->cmd, cache->policy);
+	r3 = write_hints(cache);
 	if (r3)
 		DMERR("could not write hints");
 
@@ -3100,9 +3232,9 @@ static bool sync_metadata(struct cache *cache)
 	 * set the clean shutdown flag.  This will effectively force every
 	 * dirty bit to be set on reload.
 	 */
-	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
+	r4 = commit(cache, !r1 && !r2 && !r3);
 	if (r4)
-		DMERR("could not write cache metadata.  Data loss may occur.");
+		DMERR("could not write cache metadata.");
 
 	return !r1 && !r2 && !r3 && !r4;
 }
@@ -3118,7 +3250,8 @@ static void cache_postsuspend(struct dm_target *ti)
 	requeue_deferred_cells(cache);
 	stop_quiescing(cache);
 
-	(void) sync_metadata(cache);
+	if (get_cache_mode(cache) == CM_WRITE)
+		(void) sync_metadata(cache);
 }
 
 static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
@@ -3257,6 +3390,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 	r = dm_cache_resize(cache->cmd, new_size);
 	if (r) {
 		DMERR("could not resize cache metadata");
+		metadata_operation_failed(cache, "dm_cache_resize", r);
 		return r;
 	}
 
@@ -3295,6 +3429,7 @@ static int cache_preresume(struct dm_target *ti)
 					   load_mapping, cache);
 		if (r) {
 			DMERR("could not load cache mappings");
+			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
 			return r;
 		}
 
@@ -3315,6 +3450,7 @@ static int cache_preresume(struct dm_target *ti)
 		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
 		if (r) {
 			DMERR("could not load origin discards");
+			metadata_operation_failed(cache, "dm_cache_load_discards", r);
 			return r;
 		}
 		set_discard_range(&li);
@@ -3342,7 +3478,7 @@ static void cache_resume(struct dm_target *ti)
  * <#demotions> <#promotions> <#dirty>
  * <#features> <features>*
  * <#core args> <core args>
- * <policy name> <#policy args> <policy args>*
+ * <policy name> <#policy args> <policy args>* <cache metadata mode>
  */
 static void cache_status(struct dm_target *ti, status_type_t type,
 			 unsigned status_flags, char *result, unsigned maxlen)
@@ -3358,13 +3494,15 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 
 	switch (type) {
 	case STATUSTYPE_INFO:
-		/* Commit to ensure statistics aren't out-of-date */
-		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
-			r = dm_cache_commit(cache->cmd, false);
-			if (r)
-				DMERR("could not commit metadata for accurate status");
+		if (get_cache_mode(cache) == CM_FAIL) {
+			DMEMIT("Fail");
+			break;
 		}
 
+		/* Commit to ensure statistics aren't out-of-date */
+		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
+			(void) commit(cache, false);
+
 		r = dm_cache_get_free_metadata_block_count(cache->cmd,
 							   &nr_free_blocks_metadata);
 		if (r) {
@@ -3413,11 +3551,16 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 
 		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
 		if (sz < maxlen) {
-			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
+			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
 			if (r)
 				DMERR("policy_emit_config_values returned %d", r);
 		}
 
+		if (get_cache_mode(cache) == CM_READ_ONLY)
+			DMEMIT("ro ");
+		else
+			DMEMIT("rw ");
+
 		break;
 
 	case STATUSTYPE_TABLE:
@@ -3573,6 +3716,11 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
 	if (!argc)
 		return -EINVAL;
 
+	if (get_cache_mode(cache) >= CM_READ_ONLY) {
+		DMERR("unable to service cache target messages in READ_ONLY or FAIL mode");
+		return -EOPNOTSUPP;
+	}
+
 	if (!strcasecmp(argv[0], "invalidate_cblocks"))
 		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
 
@@ -3646,7 +3794,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
 	.name = "cache",
-	.version = {1, 6, 0},
+	.version = {1, 7, 0},
 	.module = THIS_MODULE,
 	.ctr = cache_ctr,
 	.dtr = cache_dtr,
-- 
cgit v1.2.1


From b61d9509628fea995196a96b4c1713fa67dade88 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 22 Apr 2015 17:25:56 -0400
Subject: dm cache: prefix all DMERR and DMINFO messages with cache device name

Having the DM device name associated with the ERR or INFO message is
very helpful.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 102 +++++++++++++++++++++++++++----------------
 1 file changed, 64 insertions(+), 38 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index dae0321ebfa9..5aad875b822c 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -968,6 +968,11 @@ static enum cache_metadata_mode get_cache_mode(struct cache *cache)
 	return cache->features.mode;
 }
 
+static const char *cache_device_name(struct cache *cache)
+{
+	return dm_device_name(dm_table_get_md(cache->ti->table));
+}
+
 static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
 {
 	const char *descs[] = {
@@ -977,7 +982,8 @@ static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mod
 	};
 
 	dm_table_event(cache->ti->table);
-	DMINFO("switching cache to %s mode", descs[(int)mode]);
+	DMINFO("%s: switching cache to %s mode",
+	       cache_device_name(cache), descs[(int)mode]);
 }
 
 static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
@@ -986,7 +992,8 @@ static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mod
 	enum cache_metadata_mode old_mode = get_cache_mode(cache);
 
 	if (new_mode == CM_WRITE && needs_check) {
-		DMERR("unable to switch cache to write mode until repaired.");
+		DMERR("%s: unable to switch cache to write mode until repaired.",
+		      cache_device_name(cache));
 		if (old_mode != new_mode)
 			new_mode = old_mode;
 		else
@@ -1016,24 +1023,27 @@ static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mod
 
 static void abort_transaction(struct cache *cache)
 {
+	const char *dev_name = cache_device_name(cache);
+
 	if (get_cache_mode(cache) >= CM_READ_ONLY)
 		return;
 
 	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
-		DMERR("failed to set 'needs_check' flag in metadata");
+		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
 		set_cache_mode(cache, CM_FAIL);
 	}
 
-	DMERR_LIMIT("aborting current metadata transaction");
+	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
 	if (dm_cache_metadata_abort(cache->cmd)) {
-		DMERR("failed to abort metadata transaction");
+		DMERR("%s: failed to abort metadata transaction", dev_name);
 		set_cache_mode(cache, CM_FAIL);
 	}
 }
 
 static void metadata_operation_failed(struct cache *cache, const char *op, int r)
 {
-	DMERR_LIMIT("metadata operation '%s' failed: error = %d", op, r);
+	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
+		    cache_device_name(cache), op, r);
 	abort_transaction(cache);
 	set_cache_mode(cache, CM_READ_ONLY);
 }
@@ -1120,21 +1130,22 @@ static void free_io_migration(struct dm_cache_migration *mg)
 static void migration_failure(struct dm_cache_migration *mg)
 {
 	struct cache *cache = mg->cache;
+	const char *dev_name = cache_device_name(cache);
 
 	if (mg->writeback) {
-		DMWARN_LIMIT("writeback failed; couldn't copy block");
+		DMERR_LIMIT("%s: writeback failed; couldn't copy block", dev_name);
 		set_dirty(cache, mg->old_oblock, mg->cblock);
 		cell_defer(cache, mg->old_ocell, false);
 
 	} else if (mg->demote) {
-		DMWARN_LIMIT("demotion failed; couldn't copy block");
+		DMERR_LIMIT("%s: demotion failed; couldn't copy block", dev_name);
 		policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
 
 		cell_defer(cache, mg->old_ocell, mg->promote ? false : true);
 		if (mg->promote)
 			cell_defer(cache, mg->new_ocell, true);
 	} else {
-		DMWARN_LIMIT("promotion failed; couldn't copy block");
+		DMERR_LIMIT("%s: promotion failed; couldn't copy block", dev_name);
 		policy_remove_mapping(cache->policy, mg->new_oblock);
 		cell_defer(cache, mg->new_ocell, true);
 	}
@@ -1157,7 +1168,8 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 	} else if (mg->demote) {
 		r = dm_cache_remove_mapping(cache->cmd, mg->cblock);
 		if (r) {
-			DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
+			DMERR_LIMIT("%s: demotion failed; couldn't update on disk metadata",
+				    cache_device_name(cache));
 			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
 			policy_force_mapping(cache->policy, mg->new_oblock,
 					     mg->old_oblock);
@@ -1169,7 +1181,8 @@ static void migration_success_pre_commit(struct dm_cache_migration *mg)
 	} else {
 		r = dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock);
 		if (r) {
-			DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
+			DMERR_LIMIT("%s: promotion failed; couldn't update on disk metadata",
+				    cache_device_name(cache));
 			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
 			policy_remove_mapping(cache->policy, mg->new_oblock);
 			free_io_migration(mg);
@@ -1189,7 +1202,8 @@ static void migration_success_post_commit(struct dm_cache_migration *mg)
 	struct cache *cache = mg->cache;
 
 	if (mg->writeback) {
-		DMWARN("writeback unexpectedly triggered commit");
+		DMWARN_LIMIT("%s: writeback unexpectedly triggered commit",
+			     cache_device_name(cache));
 		return;
 
 	} else if (mg->demote) {
@@ -1265,7 +1279,7 @@ static void issue_copy(struct dm_cache_migration *mg)
 	}
 
 	if (r < 0) {
-		DMERR_LIMIT("issuing migration failed");
+		DMERR_LIMIT("%s: issuing migration failed", cache_device_name(cache));
 		migration_failure(mg);
 	}
 }
@@ -1863,7 +1877,8 @@ static void process_cell(struct cache *cache, struct prealloc *structs,
 		break;
 
 	default:
-		DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
+		DMERR_LIMIT("%s: %s: erroring bio, unknown policy op: %u",
+			    cache_device_name(cache), __func__,
 			    (unsigned) lookup_result.op);
 		bio_io_error(bio);
 	}
@@ -2101,7 +2116,7 @@ static void process_invalidation_request(struct cache *cache, struct invalidatio
 			r = 0;
 
 		} else {
-			DMERR("policy_remove_cblock failed");
+			DMERR("%s: policy_remove_cblock failed", cache_device_name(cache));
 			break;
 		}
 
@@ -3054,7 +3069,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 		return DM_MAPIO_SUBMITTED;
 
 	} else if (r) {
-		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
+		DMERR_LIMIT("%s: Unexpected return from cache replacement policy: %d",
+			    cache_device_name(cache), r);
 		cell_defer(cache, cell, false);
 		bio_io_error(bio);
 		return DM_MAPIO_SUBMITTED;
@@ -3113,7 +3129,8 @@ static int cache_map(struct dm_target *ti, struct bio *bio)
 		break;
 
 	default:
-		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
+		DMERR_LIMIT("%s: %s: erroring bio: unknown policy op: %u",
+			    cache_device_name(cache), __func__,
 			    (unsigned) lookup_result.op);
 		cell_defer(cache, cell, false);
 		bio_io_error(bio);
@@ -3173,7 +3190,7 @@ static int write_discard_bitset(struct cache *cache)
 	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
 					   cache->discard_nr_blocks);
 	if (r) {
-		DMERR("could not resize on-disk discard bitset");
+		DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
 		metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
 		return r;
 	}
@@ -3215,17 +3232,17 @@ static bool sync_metadata(struct cache *cache)
 
 	r1 = write_dirty_bitset(cache);
 	if (r1)
-		DMERR("could not write dirty bitset");
+		DMERR("%s: could not write dirty bitset", cache_device_name(cache));
 
 	r2 = write_discard_bitset(cache);
 	if (r2)
-		DMERR("could not write discard bitset");
+		DMERR("%s: could not write discard bitset", cache_device_name(cache));
 
 	save_stats(cache);
 
 	r3 = write_hints(cache);
 	if (r3)
-		DMERR("could not write hints");
+		DMERR("%s: could not write hints", cache_device_name(cache));
 
 	/*
 	 * If writing the above metadata failed, we still commit, but don't
@@ -3234,7 +3251,7 @@ static bool sync_metadata(struct cache *cache)
 	 */
 	r4 = commit(cache, !r1 && !r2 && !r3);
 	if (r4)
-		DMERR("could not write cache metadata.");
+		DMERR("%s: could not write cache metadata", cache_device_name(cache));
 
 	return !r1 && !r2 && !r3 && !r4;
 }
@@ -3374,7 +3391,8 @@ static bool can_resize(struct cache *cache, dm_cblock_t new_size)
 	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
 		new_size = to_cblock(from_cblock(new_size) + 1);
 		if (is_dirty(cache, new_size)) {
-			DMERR("unable to shrink cache; cache block %llu is dirty",
+			DMERR("%s: unable to shrink cache; cache block %llu is dirty",
+			      cache_device_name(cache),
 			      (unsigned long long) from_cblock(new_size));
 			return false;
 		}
@@ -3389,7 +3407,7 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
 
 	r = dm_cache_resize(cache->cmd, new_size);
 	if (r) {
-		DMERR("could not resize cache metadata");
+		DMERR("%s: could not resize cache metadata", cache_device_name(cache));
 		metadata_operation_failed(cache, "dm_cache_resize", r);
 		return r;
 	}
@@ -3428,7 +3446,7 @@ static int cache_preresume(struct dm_target *ti)
 		r = dm_cache_load_mappings(cache->cmd, cache->policy,
 					   load_mapping, cache);
 		if (r) {
-			DMERR("could not load cache mappings");
+			DMERR("%s: could not load cache mappings", cache_device_name(cache));
 			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
 			return r;
 		}
@@ -3449,7 +3467,7 @@ static int cache_preresume(struct dm_target *ti)
 		discard_load_info_init(cache, &li);
 		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
 		if (r) {
-			DMERR("could not load origin discards");
+			DMERR("%s: could not load origin discards", cache_device_name(cache));
 			metadata_operation_failed(cache, "dm_cache_load_discards", r);
 			return r;
 		}
@@ -3503,16 +3521,17 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
 			(void) commit(cache, false);
 
-		r = dm_cache_get_free_metadata_block_count(cache->cmd,
-							   &nr_free_blocks_metadata);
+		r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
 		if (r) {
-			DMERR("could not get metadata free block count");
+			DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
+			      cache_device_name(cache), r);
 			goto err;
 		}
 
 		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
 		if (r) {
-			DMERR("could not get metadata device size");
+			DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
+			      cache_device_name(cache), r);
 			goto err;
 		}
 
@@ -3543,7 +3562,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 			DMEMIT("1 writeback ");
 
 		else {
-			DMERR("internal error: unknown io mode: %d", (int) cache->features.io_mode);
+			DMERR("%s: internal error: unknown io mode: %d",
+			      cache_device_name(cache), (int) cache->features.io_mode);
 			goto err;
 		}
 
@@ -3553,7 +3573,8 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 		if (sz < maxlen) {
 			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
 			if (r)
-				DMERR("policy_emit_config_values returned %d", r);
+				DMERR("%s: policy_emit_config_values returned %d",
+				      cache_device_name(cache), r);
 		}
 
 		if (get_cache_mode(cache) == CM_READ_ONLY)
@@ -3622,7 +3643,7 @@ static int parse_cblock_range(struct cache *cache, const char *str,
 		return 0;
 	}
 
-	DMERR("invalid cblock range '%s'", str);
+	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
 	return -EINVAL;
 }
 
@@ -3633,17 +3654,20 @@ static int validate_cblock_range(struct cache *cache, struct cblock_range *range
 	uint64_t n = from_cblock(cache->cache_size);
 
 	if (b >= n) {
-		DMERR("begin cblock out of range: %llu >= %llu", b, n);
+		DMERR("%s: begin cblock out of range: %llu >= %llu",
+		      cache_device_name(cache), b, n);
 		return -EINVAL;
 	}
 
 	if (e > n) {
-		DMERR("end cblock out of range: %llu > %llu", e, n);
+		DMERR("%s: end cblock out of range: %llu > %llu",
+		      cache_device_name(cache), e, n);
 		return -EINVAL;
 	}
 
 	if (b >= e) {
-		DMERR("invalid cblock range: %llu >= %llu", b, e);
+		DMERR("%s: invalid cblock range: %llu >= %llu",
+		      cache_device_name(cache), b, e);
 		return -EINVAL;
 	}
 
@@ -3677,7 +3701,8 @@ static int process_invalidate_cblocks_message(struct cache *cache, unsigned coun
 	struct cblock_range range;
 
 	if (!passthrough_mode(&cache->features)) {
-		DMERR("cache has to be in passthrough mode for invalidation");
+		DMERR("%s: cache has to be in passthrough mode for invalidation",
+		      cache_device_name(cache));
 		return -EPERM;
 	}
 
@@ -3717,7 +3742,8 @@ static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
 		return -EINVAL;
 
 	if (get_cache_mode(cache) >= CM_READ_ONLY) {
-		DMERR("unable to service cache target messages in READ_ONLY or FAIL mode");
+		DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
+		      cache_device_name(cache));
 		return -EOPNOTSUPP;
 	}
 
-- 
cgit v1.2.1


From fba10109a45d864bab98ae90dd63bcc2789352b3 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 29 May 2015 10:20:56 +0100
Subject: dm cache: age and write back cache entries even without active IO

The policy tick() method is normally called from interrupt context.
Both the mq and smq policies do some bottom half work for the tick
method in their map functions.  However, if no IO is going through the
cache, then that bottom half work doesn't occur.  With these policies,
this means recently hit entries do not age and do not get written
back as early as we'd like.

Fix this by introducing a new 'can_block' parameter to the tick()
method.  When this is set the bottom half work occurs immediately.
'can_block' is set when the tick method is called every second by the
core target (not in interrupt context).

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-internal.h | 4 ++--
 drivers/md/dm-cache-policy-mq.c       | 8 +++++++-
 drivers/md/dm-cache-policy-smq.c      | 8 +++++++-
 drivers/md/dm-cache-policy.h          | 4 ++--
 drivers/md/dm-cache-target.c          | 4 ++--
 5 files changed, 20 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-internal.h b/drivers/md/dm-cache-policy-internal.h
index ccbe852d5362..2816018faa7f 100644
--- a/drivers/md/dm-cache-policy-internal.h
+++ b/drivers/md/dm-cache-policy-internal.h
@@ -83,10 +83,10 @@ static inline dm_cblock_t policy_residency(struct dm_cache_policy *p)
 	return p->residency(p);
 }
 
-static inline void policy_tick(struct dm_cache_policy *p)
+static inline void policy_tick(struct dm_cache_policy *p, bool can_block)
 {
 	if (p->tick)
-		return p->tick(p);
+		return p->tick(p, can_block);
 }
 
 static inline int policy_emit_config_values(struct dm_cache_policy *p, char *result,
diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 084eec653321..838665bb495a 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1283,7 +1283,7 @@ static dm_cblock_t mq_residency(struct dm_cache_policy *p)
 	return r;
 }
 
-static void mq_tick(struct dm_cache_policy *p)
+static void mq_tick(struct dm_cache_policy *p, bool can_block)
 {
 	struct mq_policy *mq = to_mq_policy(p);
 	unsigned long flags;
@@ -1291,6 +1291,12 @@ static void mq_tick(struct dm_cache_policy *p)
 	spin_lock_irqsave(&mq->tick_lock, flags);
 	mq->tick_protected++;
 	spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+	if (can_block) {
+		mutex_lock(&mq->lock);
+		copy_tick(mq);
+		mutex_unlock(&mq->lock);
+	}
 }
 
 static int mq_set_config_value(struct dm_cache_policy *p,
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 55a657f78f00..66feb307e697 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1581,7 +1581,7 @@ static dm_cblock_t smq_residency(struct dm_cache_policy *p)
 	return r;
 }
 
-static void smq_tick(struct dm_cache_policy *p)
+static void smq_tick(struct dm_cache_policy *p, bool can_block)
 {
 	struct smq_policy *mq = to_smq_policy(p);
 	unsigned long flags;
@@ -1589,6 +1589,12 @@ static void smq_tick(struct dm_cache_policy *p)
 	spin_lock_irqsave(&mq->tick_lock, flags);
 	mq->tick_protected++;
 	spin_unlock_irqrestore(&mq->tick_lock, flags);
+
+	if (can_block) {
+		mutex_lock(&mq->lock);
+		copy_tick(mq);
+		mutex_unlock(&mq->lock);
+	}
 }
 
 /* Init the policy plugin interface function pointers. */
diff --git a/drivers/md/dm-cache-policy.h b/drivers/md/dm-cache-policy.h
index 74709129d856..05db56eedb6a 100644
--- a/drivers/md/dm-cache-policy.h
+++ b/drivers/md/dm-cache-policy.h
@@ -200,10 +200,10 @@ struct dm_cache_policy {
 	 * Because of where we sit in the block layer, we can be asked to
 	 * map a lot of little bios that are all in the same block (no
 	 * queue merging has occurred).  To stop the policy being fooled by
-	 * these the core target sends regular tick() calls to the policy.
+	 * these, the core target sends regular tick() calls to the policy.
 	 * The policy should only count an entry as hit once per tick.
 	 */
-	void (*tick)(struct dm_cache_policy *p);
+	void (*tick)(struct dm_cache_policy *p, bool can_block);
 
 	/*
 	 * Configuration.
diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 5aad875b822c..1b4e1756b169 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2271,7 +2271,7 @@ static void do_worker(struct work_struct *ws)
 static void do_waker(struct work_struct *ws)
 {
 	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
-	policy_tick(cache->policy);
+	policy_tick(cache->policy, true);
 	wake_worker(cache);
 	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
 }
@@ -3148,7 +3148,7 @@ static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
 	struct per_bio_data *pb = get_per_bio_data(bio, pb_data_size);
 
 	if (pb->tick) {
-		policy_tick(cache->policy);
+		policy_tick(cache->policy, false);
 
 		spin_lock_irqsave(&cache->lock, flags);
 		cache->need_tick_bio = true;
-- 
cgit v1.2.1


From 0f24b79b52730e15d9e3386ce27da2110eb4597d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Fri, 15 May 2015 21:35:21 +0300
Subject: dm stats: Use kvfree() in dm_kvfree()

Use kvfree() instead of open-coding it.

Signed-off-by: Pekka Enberg <penberg@kernel.org>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-stats.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index f478a4c96d2f..492fe6a5ebf2 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -160,10 +160,7 @@ static void dm_kvfree(void *ptr, size_t alloc_size)
 
 	free_shared_memory(alloc_size);
 
-	if (is_vmalloc_addr(ptr))
-		vfree(ptr);
-	else
-		kfree(ptr);
+	kvfree(ptr);
 }
 
 static void dm_stat_free(struct rcu_head *head)
-- 
cgit v1.2.1


From 4ec331c3ea7ec94f28aa1c62a279cfa1cfe3c91b Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 13 Apr 2015 09:41:44 +0100
Subject: dm btree: add dm_btree_remove_leaves()

Removes a range of leaf values from the tree.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/persistent-data/dm-btree-remove.c | 127 +++++++++++++++++++++++++++
 drivers/md/persistent-data/dm-btree.h        |   9 ++
 2 files changed, 136 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index b88757cd0d1d..e04cfd2d60ef 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -590,3 +590,130 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 	return r;
 }
 EXPORT_SYMBOL_GPL(dm_btree_remove);
+
+/*----------------------------------------------------------------*/
+
+static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
+			  struct dm_btree_value_type *vt, dm_block_t root,
+			  uint64_t key, int *index)
+{
+	int i = *index, r;
+	struct btree_node *n;
+
+	for (;;) {
+		r = shadow_step(s, root, vt);
+		if (r < 0)
+			break;
+
+		/*
+		 * We have to patch up the parent node, ugly, but I don't
+		 * see a way to do this automatically as part of the spine
+		 * op.
+		 */
+		if (shadow_has_parent(s)) {
+			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
+			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
+			       &location, sizeof(__le64));
+		}
+
+		n = dm_block_data(shadow_current(s));
+
+		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+			*index = lower_bound(n, key);
+			return 0;
+		}
+
+		r = rebalance_children(s, info, vt, key);
+		if (r)
+			break;
+
+		n = dm_block_data(shadow_current(s));
+		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
+			*index = lower_bound(n, key);
+			return 0;
+		}
+
+		i = lower_bound(n, key);
+
+		/*
+		 * We know the key is present, or else
+		 * rebalance_children would have returned
+		 * -ENODATA
+		 */
+		root = value64(n, i);
+	}
+
+	return r;
+}
+
+static int remove_one(struct dm_btree_info *info, dm_block_t root,
+		      uint64_t *keys, uint64_t end_key,
+		      dm_block_t *new_root, unsigned *nr_removed)
+{
+	unsigned level, last_level = info->levels - 1;
+	int index = 0, r = 0;
+	struct shadow_spine spine;
+	struct btree_node *n;
+	uint64_t k;
+
+	init_shadow_spine(&spine, info);
+	for (level = 0; level < last_level; level++) {
+		r = remove_raw(&spine, info, &le64_type,
+			       root, keys[level], (unsigned *) &index);
+		if (r < 0)
+			goto out;
+
+		n = dm_block_data(shadow_current(&spine));
+		root = value64(n, index);
+	}
+
+	r = remove_nearest(&spine, info, &info->value_type,
+			   root, keys[last_level], &index);
+	if (r < 0)
+		goto out;
+
+	n = dm_block_data(shadow_current(&spine));
+
+	if (index < 0)
+		index = 0;
+
+	if (index >= le32_to_cpu(n->header.nr_entries)) {
+		r = -ENODATA;
+		goto out;
+	}
+
+	k = le64_to_cpu(n->keys[index]);
+	if (k >= keys[last_level] && k < end_key) {
+		if (info->value_type.dec)
+			info->value_type.dec(info->value_type.context,
+					     value_ptr(n, index));
+
+		delete_at(n, index);
+
+	} else
+		r = -ENODATA;
+
+out:
+	*new_root = shadow_root(&spine);
+	exit_shadow_spine(&spine);
+
+	return r;
+}
+
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+			   uint64_t *first_key, uint64_t end_key,
+			   dm_block_t *new_root, unsigned *nr_removed)
+{
+	int r;
+
+	*nr_removed = 0;
+	do {
+		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
+		if (!r)
+			(*nr_removed)++;
+	} while (!r);
+
+	*new_root = root;
+	return r == -ENODATA ? 0 : r;
+}
+EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
diff --git a/drivers/md/persistent-data/dm-btree.h b/drivers/md/persistent-data/dm-btree.h
index dacfc34180b4..11d8cf78621d 100644
--- a/drivers/md/persistent-data/dm-btree.h
+++ b/drivers/md/persistent-data/dm-btree.h
@@ -134,6 +134,15 @@ int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
 int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
 		    uint64_t *keys, dm_block_t *new_root);
 
+/*
+ * Removes values between 'keys' and keys2, where keys2 is keys with the
+ * final key replaced with 'end_key'.  'end_key' is the one-past-the-end
+ * value.  'keys' may be altered.
+ */
+int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
+			   uint64_t *keys, uint64_t end_key,
+			   dm_block_t *new_root, unsigned *nr_removed);
+
 /*
  * Returns < 0 on failure.  Otherwise the number of key entries that have
  * been filled out.  Remember trees can have zero entries, and as such have
-- 
cgit v1.2.1


From a5d895a90bf57e5fe87edf48dd1852e7292d570d Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 16 Apr 2015 12:47:21 +0100
Subject: dm thin metadata: add dm_thin_find_mapped_range()

Retrieve the next run of contiguously mapped blocks.  Useful for working
out where to break up IO.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c | 57 +++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-thin-metadata.h |  9 +++++++
 2 files changed, 66 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index cb6dd055053d..94cf0db8a22e 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1417,6 +1417,63 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
 	return r;
 }
 
+/* FIXME: write a more efficient one in btree */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+			      dm_block_t begin, dm_block_t end,
+			      dm_block_t *thin_begin, dm_block_t *thin_end,
+			      dm_block_t *pool_begin, bool *maybe_shared)
+{
+	int r;
+	dm_block_t pool_end;
+	struct dm_thin_lookup_result lookup;
+
+	if (end < begin)
+		return -ENODATA;
+
+	/*
+	 * Find first mapped block.
+	 */
+	while (begin < end) {
+		r = dm_thin_find_block(td, begin, true, &lookup);
+		if (r) {
+			if (r != -ENODATA)
+				return r;
+		} else
+			break;
+
+		begin++;
+	}
+
+	if (begin == end)
+		return -ENODATA;
+
+	*thin_begin = begin;
+	*pool_begin = lookup.block;
+	*maybe_shared = lookup.shared;
+
+	begin++;
+	pool_end = *pool_begin + 1;
+	while (begin != end) {
+		r = dm_thin_find_block(td, begin, true, &lookup);
+		if (r) {
+			if (r == -ENODATA)
+				break;
+			else
+				return r;
+		}
+
+		if ((lookup.block != pool_end) ||
+		    (lookup.shared != *maybe_shared))
+			break;
+
+		pool_end++;
+		begin++;
+	}
+
+	*thin_end = begin;
+	return 0;
+}
+
 static int __insert(struct dm_thin_device *td, dm_block_t block,
 		    dm_block_t data_block)
 {
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index fac01a96d303..f11f14095b93 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -146,6 +146,15 @@ struct dm_thin_lookup_result {
 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
 		       int can_issue_io, struct dm_thin_lookup_result *result);
 
+/*
+ * Retrieve the next run of contiguously mapped blocks.  Useful for working
+ * out where to break up IO.  Returns 0 on success, < 0 on error.
+ */
+int dm_thin_find_mapped_range(struct dm_thin_device *td,
+			      dm_block_t begin, dm_block_t end,
+			      dm_block_t *thin_begin, dm_block_t *thin_end,
+			      dm_block_t *pool_begin, bool *maybe_shared);
+
 /*
  * Obtain an unused block.
  */
-- 
cgit v1.2.1


From 6550f075f5087459f64c1af71298fc50b102af11 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Mon, 13 Apr 2015 09:45:25 +0100
Subject: dm thin metadata: add dm_thin_remove_range()

Removes a range of blocks from the btree.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c | 54 +++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-thin-metadata.h |  2 ++
 2 files changed, 56 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 94cf0db8a22e..8b521e3e1e1b 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1526,6 +1526,47 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
 	return 0;
 }
 
+static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
+{
+	int r;
+	unsigned count;
+	struct dm_pool_metadata *pmd = td->pmd;
+	dm_block_t keys[1] = { td->id };
+	__le64 value;
+	dm_block_t mapping_root;
+
+	/*
+	 * Find the mapping tree
+	 */
+	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
+	if (r)
+		return r;
+
+	/*
+	 * Remove from the mapping tree, taking care to inc the
+	 * ref count so it doesn't get deleted.
+	 */
+	mapping_root = le64_to_cpu(value);
+	dm_tm_inc(pmd->tm, mapping_root);
+	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
+	if (r)
+		return r;
+
+	r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
+	if (r)
+		return r;
+
+	td->mapped_blocks -= count;
+	td->changed = 1;
+
+	/*
+	 * Reinsert the mapping tree.
+	 */
+	value = cpu_to_le64(mapping_root);
+	__dm_bless_for_disk(&value);
+	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
+}
+
 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 {
 	int r = -EINVAL;
@@ -1538,6 +1579,19 @@ int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
 	return r;
 }
 
+int dm_thin_remove_range(struct dm_thin_device *td,
+			 dm_block_t begin, dm_block_t end)
+{
+	int r = -EINVAL;
+
+	down_write(&td->pmd->root_lock);
+	if (!td->pmd->fail_io)
+		r = __remove_range(td, begin, end);
+	up_write(&td->pmd->root_lock);
+
+	return r;
+}
+
 int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
 {
 	int r;
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index f11f14095b93..a938babe4258 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -167,6 +167,8 @@ int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
 			 dm_block_t data_block);
 
 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block);
+int dm_thin_remove_range(struct dm_thin_device *td,
+			 dm_block_t begin, dm_block_t end);
 
 /*
  * Queries.
-- 
cgit v1.2.1


From 34fbcf6257eb3f39a5b78a4f51b40f881b82033b Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 16 Apr 2015 12:58:35 +0100
Subject: dm thin: range discard support

Previously REQ_DISCARD bios have been split into block sized chunks
before submission to the thin target.  There are a couple of issues with
this:

 - If the block size is small, a large discard request can
   get broken up into a great many bios which is both slow and causes
   a lot of memory pressure.

 - The thin pool block size and the discard granularity for the
   underlying data device need to be compatible if we want to passdown
   the discard.

This patch relaxes the block size granularity for thin devices.  It
makes use of the recent range locking added to the bio_prison to
quiesce a whole range of thin blocks before unmapping them.  Once a
thin range has been unmapped the discard can then be passed down to
the data device for those sub ranges where the data blocks are no
longer used (i.e. they weren't shared in the first place).

This patch also doesn't make any apologies about open-coding portions
of block core as a means of supporting async discard completions in the
near-term -- if/when late bio splitting lands it'll all get cleaned up.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 583 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 434 insertions(+), 149 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c552df7b3420..99daf2e332f4 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -111,22 +111,30 @@ DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
 /*
  * Key building.
  */
-static void build_data_key(struct dm_thin_device *td,
-			   dm_block_t b, struct dm_cell_key *key)
+enum lock_space {
+	VIRTUAL,
+	PHYSICAL
+};
+
+static void build_key(struct dm_thin_device *td, enum lock_space ls,
+		      dm_block_t b, dm_block_t e, struct dm_cell_key *key)
 {
-	key->virtual = 0;
+	key->virtual = (ls == VIRTUAL);
 	key->dev = dm_thin_dev_id(td);
 	key->block_begin = b;
-	key->block_end = b + 1ULL;
+	key->block_end = e;
+}
+
+static void build_data_key(struct dm_thin_device *td, dm_block_t b,
+			   struct dm_cell_key *key)
+{
+	build_key(td, PHYSICAL, b, b + 1llu, key);
 }
 
 static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 			      struct dm_cell_key *key)
 {
-	key->virtual = 1;
-	key->dev = dm_thin_dev_id(td);
-	key->block_begin = b;
-	key->block_end = b + 1ULL;
+	build_key(td, VIRTUAL, b, b + 1llu, key);
 }
 
 /*----------------------------------------------------------------*/
@@ -312,6 +320,138 @@ struct thin_c {
 
 /*----------------------------------------------------------------*/
 
+/**
+ * __blkdev_issue_discard_async - queue a discard with async completion
+ * @bdev:	blockdev to issue discard for
+ * @sector:	start sector
+ * @nr_sects:	number of sectors to discard
+ * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @flags:	BLKDEV_IFL_* flags to control behaviour
+ * @parent_bio: parent discard bio that all sub discards get chained to
+ *
+ * Description:
+ *    Asynchronously issue a discard request for the sectors in question.
+ *    NOTE: this variant of blk-core's blkdev_issue_discard() is a stop-gap
+ *    that is being kept local to DM thinp until the block changes to allow
+ *    late bio splitting land upstream.
+ */
+static int __blkdev_issue_discard_async(struct block_device *bdev, sector_t sector,
+					sector_t nr_sects, gfp_t gfp_mask, unsigned long flags,
+					struct bio *parent_bio)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	int type = REQ_WRITE | REQ_DISCARD;
+	unsigned int max_discard_sectors, granularity;
+	int alignment;
+	struct bio *bio;
+	int ret = 0;
+	struct blk_plug plug;
+
+	if (!q)
+		return -ENXIO;
+
+	if (!blk_queue_discard(q))
+		return -EOPNOTSUPP;
+
+	/* Zero-sector (unknown) and one-sector granularities are the same.  */
+	granularity = max(q->limits.discard_granularity >> 9, 1U);
+	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+	/*
+	 * Ensure that max_discard_sectors is of the proper
+	 * granularity, so that requests stay aligned after a split.
+	 */
+	max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9);
+	max_discard_sectors -= max_discard_sectors % granularity;
+	if (unlikely(!max_discard_sectors)) {
+		/* Avoid infinite loop below. Being cautious never hurts. */
+		return -EOPNOTSUPP;
+	}
+
+	if (flags & BLKDEV_DISCARD_SECURE) {
+		if (!blk_queue_secdiscard(q))
+			return -EOPNOTSUPP;
+		type |= REQ_SECURE;
+	}
+
+	blk_start_plug(&plug);
+	while (nr_sects) {
+		unsigned int req_sects;
+		sector_t end_sect, tmp;
+
+		/*
+		 * Required bio_put occurs in bio_endio thanks to bio_chain below
+		 */
+		bio = bio_alloc(gfp_mask, 1);
+		if (!bio) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		req_sects = min_t(sector_t, nr_sects, max_discard_sectors);
+
+		/*
+		 * If splitting a request, and the next starting sector would be
+		 * misaligned, stop the discard at the previous aligned sector.
+		 */
+		end_sect = sector + req_sects;
+		tmp = end_sect;
+		if (req_sects < nr_sects &&
+		    sector_div(tmp, granularity) != alignment) {
+			end_sect = end_sect - alignment;
+			sector_div(end_sect, granularity);
+			end_sect = end_sect * granularity + alignment;
+			req_sects = end_sect - sector;
+		}
+
+		bio_chain(bio, parent_bio);
+
+		bio->bi_iter.bi_sector = sector;
+		bio->bi_bdev = bdev;
+
+		bio->bi_iter.bi_size = req_sects << 9;
+		nr_sects -= req_sects;
+		sector = end_sect;
+
+		submit_bio(type, bio);
+
+		/*
+		 * We can loop for a long time in here, if someone does
+		 * full device discards (like mkfs). Be nice and allow
+		 * us to schedule out to avoid softlocking if preempt
+		 * is disabled.
+		 */
+		cond_resched();
+	}
+	blk_finish_plug(&plug);
+
+	return ret;
+}
+
+static bool block_size_is_power_of_two(struct pool *pool)
+{
+	return pool->sectors_per_block_shift >= 0;
+}
+
+static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
+{
+	return block_size_is_power_of_two(pool) ?
+		(b << pool->sectors_per_block_shift) :
+		(b * pool->sectors_per_block);
+}
+
+static int issue_discard(struct thin_c *tc, dm_block_t data_b, dm_block_t data_e,
+			 struct bio *parent_bio)
+{
+	sector_t s = block_to_sectors(tc->pool, data_b);
+	sector_t len = block_to_sectors(tc->pool, data_e - data_b);
+
+	return __blkdev_issue_discard_async(tc->pool_dev->bdev, s, len,
+					    GFP_NOWAIT, 0, parent_bio);
+}
+
+/*----------------------------------------------------------------*/
+
 /*
  * wake_worker() is used when new work is queued and when pool_resume is
  * ready to continue deferred IO processing.
@@ -461,6 +601,7 @@ struct dm_thin_endio_hook {
 	struct dm_deferred_entry *all_io_entry;
 	struct dm_thin_new_mapping *overwrite_mapping;
 	struct rb_node rb_node;
+	struct dm_bio_prison_cell *cell;
 };
 
 static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
@@ -541,11 +682,6 @@ static void error_retry_list(struct pool *pool)
  * target.
  */
 
-static bool block_size_is_power_of_two(struct pool *pool)
-{
-	return pool->sectors_per_block_shift >= 0;
-}
-
 static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 {
 	struct pool *pool = tc->pool;
@@ -559,6 +695,34 @@ static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 	return block_nr;
 }
 
+/*
+ * Returns the _complete_ blocks that this bio covers.
+ */
+static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
+				dm_block_t *begin, dm_block_t *end)
+{
+	struct pool *pool = tc->pool;
+	sector_t b = bio->bi_iter.bi_sector;
+	sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
+
+	b += pool->sectors_per_block - 1ull; /* so we round up */
+
+	if (block_size_is_power_of_two(pool)) {
+		b >>= pool->sectors_per_block_shift;
+		e >>= pool->sectors_per_block_shift;
+	} else {
+		(void) sector_div(b, pool->sectors_per_block);
+		(void) sector_div(e, pool->sectors_per_block);
+	}
+
+	if (e < b)
+		/* Can happen if the bio is within a single block. */
+		e = b;
+
+	*begin = b;
+	*end = e;
+}
+
 static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 {
 	struct pool *pool = tc->pool;
@@ -647,7 +811,7 @@ struct dm_thin_new_mapping {
 	struct list_head list;
 
 	bool pass_discard:1;
-	bool definitely_not_shared:1;
+	bool maybe_shared:1;
 
 	/*
 	 * Track quiescing, copying and zeroing preparation actions.  When this
@@ -658,9 +822,9 @@ struct dm_thin_new_mapping {
 
 	int err;
 	struct thin_c *tc;
-	dm_block_t virt_block;
+	dm_block_t virt_begin, virt_end;
 	dm_block_t data_block;
-	struct dm_bio_prison_cell *cell, *cell2;
+	struct dm_bio_prison_cell *cell;
 
 	/*
 	 * If the bio covers the whole area of a block then we can avoid
@@ -817,7 +981,7 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 	 * Any I/O for this block arriving after this point will get
 	 * remapped to it directly.
 	 */
-	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+	r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
 	if (r) {
 		metadata_operation_failed(pool, "dm_thin_insert_block", r);
 		cell_error(pool, m->cell);
@@ -844,50 +1008,112 @@ out:
 	mempool_free(m, pool->mapping_pool);
 }
 
-static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+/*----------------------------------------------------------------*/
+
+static void free_discard_mapping(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
+	if (m->cell)
+		cell_defer_no_holder(tc, m->cell);
+	mempool_free(m, tc->pool->mapping_pool);
+}
 
+static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
+{
 	bio_io_error(m->bio);
+	free_discard_mapping(m);
+}
+
+static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
+{
+	bio_endio(m->bio, 0);
+	free_discard_mapping(m);
+}
+
+static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
+{
+	int r;
+	struct thin_c *tc = m->tc;
+
+	r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
+	if (r) {
+		metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
+		bio_io_error(m->bio);
+	} else
+		bio_endio(m->bio, 0);
+
 	cell_defer_no_holder(tc, m->cell);
-	cell_defer_no_holder(tc, m->cell2);
 	mempool_free(m, tc->pool->mapping_pool);
 }
 
-static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
+static int passdown_double_checking_shared_status(struct dm_thin_new_mapping *m)
 {
+	/*
+	 * We've already unmapped this range of blocks, but before we
+	 * passdown we have to check that these blocks are now unused.
+	 */
+	int r;
+	bool used = true;
 	struct thin_c *tc = m->tc;
+	struct pool *pool = tc->pool;
+	dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
 
-	inc_all_io_entry(tc->pool, m->bio);
-	cell_defer_no_holder(tc, m->cell);
-	cell_defer_no_holder(tc, m->cell2);
+	while (b != end) {
+		/* find start of unmapped run */
+		for (; b < end; b++) {
+			r = dm_pool_block_is_used(pool->pmd, b, &used);
+			if (r)
+				return r;
 
-	if (m->pass_discard)
-		if (m->definitely_not_shared)
-			remap_and_issue(tc, m->bio, m->data_block);
-		else {
-			bool used = false;
-			if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
-				bio_endio(m->bio, 0);
-			else
-				remap_and_issue(tc, m->bio, m->data_block);
+			if (!used)
+				break;
 		}
-	else
-		bio_endio(m->bio, 0);
 
-	mempool_free(m, tc->pool->mapping_pool);
+		if (b == end)
+			break;
+
+		/* find end of run */
+		for (e = b + 1; e != end; e++) {
+			r = dm_pool_block_is_used(pool->pmd, e, &used);
+			if (r)
+				return r;
+
+			if (used)
+				break;
+		}
+
+		r = issue_discard(tc, b, e, m->bio);
+		if (r)
+			return r;
+
+		b = e;
+	}
+
+	return 0;
 }
 
-static void process_prepared_discard(struct dm_thin_new_mapping *m)
+static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 {
 	int r;
 	struct thin_c *tc = m->tc;
+	struct pool *pool = tc->pool;
 
-	r = dm_thin_remove_block(tc->td, m->virt_block);
+	r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
 	if (r)
-		DMERR_LIMIT("dm_thin_remove_block() failed");
+		metadata_operation_failed(pool, "dm_thin_remove_range", r);
+
+	else if (m->maybe_shared)
+		r = passdown_double_checking_shared_status(m);
+	else
+		r = issue_discard(tc, m->data_block, m->data_block + (m->virt_end - m->virt_begin), m->bio);
 
-	process_prepared_discard_passdown(m);
+	/*
+	 * Even if r is set, there could be sub discards in flight that we
+	 * need to wait for.
+	 */
+	bio_endio(m->bio, r);
+	cell_defer_no_holder(tc, m->cell);
+	mempool_free(m, pool->mapping_pool);
 }
 
 static void process_prepared(struct pool *pool, struct list_head *head,
@@ -971,7 +1197,7 @@ static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
 }
 
 static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
-				      dm_block_t data_block,
+				      dm_block_t data_begin,
 				      struct dm_thin_new_mapping *m)
 {
 	struct pool *pool = tc->pool;
@@ -981,7 +1207,7 @@ static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
 	m->bio = bio;
 	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 	inc_all_io_entry(pool, bio);
-	remap_and_issue(tc, bio, data_block);
+	remap_and_issue(tc, bio, data_begin);
 }
 
 /*
@@ -998,7 +1224,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 	struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
 	m->tc = tc;
-	m->virt_block = virt_block;
+	m->virt_begin = virt_block;
+	m->virt_end = virt_block + 1u;
 	m->data_block = data_dest;
 	m->cell = cell;
 
@@ -1077,7 +1304,8 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 
 	atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
 	m->tc = tc;
-	m->virt_block = virt_block;
+	m->virt_begin = virt_block;
+	m->virt_end = virt_block + 1u;
 	m->data_block = data_block;
 	m->cell = cell;
 
@@ -1284,99 +1512,149 @@ static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *c
 		retry_on_resume(bio);
 }
 
-static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
+static void process_discard_cell_no_passdown(struct thin_c *tc,
+					     struct dm_bio_prison_cell *virt_cell)
 {
-	int r;
-	struct bio *bio = cell->holder;
 	struct pool *pool = tc->pool;
-	struct dm_bio_prison_cell *cell2;
-	struct dm_cell_key key2;
-	dm_block_t block = get_bio_block(tc, bio);
-	struct dm_thin_lookup_result lookup_result;
-	struct dm_thin_new_mapping *m;
+	struct dm_thin_new_mapping *m = get_next_mapping(pool);
 
-	if (tc->requeue_mode) {
-		cell_requeue(pool, cell);
-		return;
-	}
+	/*
+	 * We don't need to lock the data blocks, since there's no
+	 * passdown.  We only lock data blocks for allocation and breaking sharing.
+	 */
+	m->tc = tc;
+	m->virt_begin = virt_cell->key.block_begin;
+	m->virt_end = virt_cell->key.block_end;
+	m->cell = virt_cell;
+	m->bio = virt_cell->holder;
 
-	r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
-	switch (r) {
-	case 0:
-		/*
-		 * Check nobody is fiddling with this pool block.  This can
-		 * happen if someone's in the process of breaking sharing
-		 * on this block.
-		 */
-		build_data_key(tc->td, lookup_result.block, &key2);
-		if (bio_detain(tc->pool, &key2, bio, &cell2)) {
-			cell_defer_no_holder(tc, cell);
-			break;
-		}
+	if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+		pool->process_prepared_discard(m);
+}
 
-		if (io_overlaps_block(pool, bio)) {
-			/*
-			 * IO may still be going to the destination block.  We must
-			 * quiesce before we can do the removal.
-			 */
-			m = get_next_mapping(pool);
-			m->tc = tc;
-			m->pass_discard = pool->pf.discard_passdown;
-			m->definitely_not_shared = !lookup_result.shared;
-			m->virt_block = block;
-			m->data_block = lookup_result.block;
-			m->cell = cell;
-			m->cell2 = cell2;
-			m->bio = bio;
-
-			if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
-				pool->process_prepared_discard(m);
+/*
+ * FIXME: DM local hack to defer parent bios's end_io until we
+ * _know_ all chained sub range discard bios have completed.
+ * Will go away once late bio splitting lands upstream!
+ */
+static inline void __bio_inc_remaining(struct bio *bio)
+{
+	bio->bi_flags |= (1 << BIO_CHAIN);
+	smp_mb__before_atomic();
+	atomic_inc(&bio->__bi_remaining);
+}
 
-		} else {
-			inc_all_io_entry(pool, bio);
-			cell_defer_no_holder(tc, cell);
-			cell_defer_no_holder(tc, cell2);
+static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
+				 struct bio *bio)
+{
+	struct pool *pool = tc->pool;
+
+	int r;
+	bool maybe_shared;
+	struct dm_cell_key data_key;
+	struct dm_bio_prison_cell *data_cell;
+	struct dm_thin_new_mapping *m;
+	dm_block_t virt_begin, virt_end, data_begin;
+
+	while (begin != end) {
+		r = ensure_next_mapping(pool);
+		if (r)
+			/* we did our best */
+			return;
 
+		r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
+					      &data_begin, &maybe_shared);
+		if (r)
 			/*
-			 * The DM core makes sure that the discard doesn't span
-			 * a block boundary.  So we submit the discard of a
-			 * partial block appropriately.
+			 * Silently fail, letting any mappings we've
+			 * created complete.
 			 */
-			if ((!lookup_result.shared) && pool->pf.discard_passdown)
-				remap_and_issue(tc, bio, lookup_result.block);
-			else
-				bio_endio(bio, 0);
+			break;
+
+		build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
+		if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
+			/* contention, we'll give up with this range */
+			begin = virt_end;
+			continue;
 		}
-		break;
 
-	case -ENODATA:
 		/*
-		 * It isn't provisioned, just forget it.
+		 * IO may still be going to the destination block.  We must
+		 * quiesce before we can do the removal.
 		 */
-		cell_defer_no_holder(tc, cell);
-		bio_endio(bio, 0);
-		break;
+		m = get_next_mapping(pool);
+		m->tc = tc;
+		m->maybe_shared = maybe_shared;
+		m->virt_begin = virt_begin;
+		m->virt_end = virt_end;
+		m->data_block = data_begin;
+		m->cell = data_cell;
+		m->bio = bio;
 
-	default:
-		DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
-			    __func__, r);
-		cell_defer_no_holder(tc, cell);
-		bio_io_error(bio);
-		break;
+		/*
+		 * The parent bio must not complete before sub discard bios are
+		 * chained to it (see __blkdev_issue_discard_async's bio_chain)!
+		 *
+		 * This per-mapping bi_remaining increment is paired with
+		 * the implicit decrement that occurs via bio_endio() in
+		 * process_prepared_discard_{passdown,no_passdown}.
+		 */
+		__bio_inc_remaining(bio);
+		if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
+			pool->process_prepared_discard(m);
+
+		begin = virt_end;
 	}
 }
 
+static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
+{
+	struct bio *bio = virt_cell->holder;
+	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
+
+	/*
+	 * The virt_cell will only get freed once the origin bio completes.
+	 * This means it will remain locked while all the individual
+	 * passdown bios are in flight.
+	 */
+	h->cell = virt_cell;
+	break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
+
+	/*
+	 * We complete the bio now, knowing that the bi_remaining field
+	 * will prevent completion until the sub range discards have
+	 * completed.
+	 */
+	bio_endio(bio, 0);
+}
+
 static void process_discard_bio(struct thin_c *tc, struct bio *bio)
 {
-	struct dm_bio_prison_cell *cell;
-	struct dm_cell_key key;
-	dm_block_t block = get_bio_block(tc, bio);
+	dm_block_t begin, end;
+	struct dm_cell_key virt_key;
+	struct dm_bio_prison_cell *virt_cell;
 
-	build_virtual_key(tc->td, block, &key);
-	if (bio_detain(tc->pool, &key, bio, &cell))
+	get_bio_block_range(tc, bio, &begin, &end);
+	if (begin == end) {
+		/*
+		 * The discard covers less than a block.
+		 */
+		bio_endio(bio, 0);
 		return;
+	}
 
-	process_discard_cell(tc, cell);
+	build_key(tc->td, VIRTUAL, begin, end, &virt_key);
+	if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
+		/*
+		 * Potential starvation issue: We're relying on the
+		 * fs/application being well behaved, and not trying to
+		 * send IO to a region at the same time as discarding it.
+		 * If they do this persistently then it's possible this
+		 * cell will never be granted.
+		 */
+		return;
+
+	tc->pool->process_discard_cell(tc, virt_cell);
 }
 
 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
@@ -2092,6 +2370,24 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
 	       dm_device_name(pool->pool_md), new_mode);
 }
 
+static bool passdown_enabled(struct pool_c *pt)
+{
+	return pt->adjusted_pf.discard_passdown;
+}
+
+static void set_discard_callbacks(struct pool *pool)
+{
+	struct pool_c *pt = pool->ti->private;
+
+	if (passdown_enabled(pt)) {
+		pool->process_discard_cell = process_discard_cell_passdown;
+		pool->process_prepared_discard = process_prepared_discard_passdown;
+	} else {
+		pool->process_discard_cell = process_discard_cell_no_passdown;
+		pool->process_prepared_discard = process_prepared_discard_no_passdown;
+	}
+}
+
 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 {
 	struct pool_c *pt = pool->ti->private;
@@ -2143,7 +2439,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_cell = process_cell_read_only;
 		pool->process_discard_cell = process_cell_success;
 		pool->process_prepared_mapping = process_prepared_mapping_fail;
-		pool->process_prepared_discard = process_prepared_discard_passdown;
+		pool->process_prepared_discard = process_prepared_discard_success;
 
 		error_retry_list(pool);
 		break;
@@ -2162,9 +2458,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_bio = process_bio_read_only;
 		pool->process_discard = process_discard_bio;
 		pool->process_cell = process_cell_read_only;
-		pool->process_discard_cell = process_discard_cell;
 		pool->process_prepared_mapping = process_prepared_mapping;
-		pool->process_prepared_discard = process_prepared_discard;
+		set_discard_callbacks(pool);
 
 		if (!pool->pf.error_if_no_space && no_space_timeout)
 			queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
@@ -2177,9 +2472,8 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		pool->process_bio = process_bio;
 		pool->process_discard = process_discard_bio;
 		pool->process_cell = process_cell;
-		pool->process_discard_cell = process_discard_cell;
 		pool->process_prepared_mapping = process_prepared_mapping;
-		pool->process_prepared_discard = process_prepared_discard;
+		set_discard_callbacks(pool);
 		break;
 	}
 
@@ -2268,6 +2562,7 @@ static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
 	h->shared_read_entry = NULL;
 	h->all_io_entry = NULL;
 	h->overwrite_mapping = NULL;
+	h->cell = NULL;
 }
 
 /*
@@ -2415,7 +2710,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
 	struct pool *pool = pt->pool;
 	struct block_device *data_bdev = pt->data_dev->bdev;
 	struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
-	sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
 	const char *reason = NULL;
 	char buf[BDEVNAME_SIZE];
 
@@ -2428,12 +2722,6 @@ static void disable_passdown_if_not_supported(struct pool_c *pt)
 	else if (data_limits->max_discard_sectors < pool->sectors_per_block)
 		reason = "max discard sectors smaller than a block";
 
-	else if (data_limits->discard_granularity > block_size)
-		reason = "discard granularity larger than a block";
-
-	else if (!is_factor(block_size, data_limits->discard_granularity))
-		reason = "discard granularity not a factor of block size";
-
 	if (reason) {
 		DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
 		pt->adjusted_pf.discard_passdown = false;
@@ -3566,24 +3854,6 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
-static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
-{
-	struct pool *pool = pt->pool;
-	struct queue_limits *data_limits;
-
-	limits->max_discard_sectors = pool->sectors_per_block;
-
-	/*
-	 * discard_granularity is just a hint, and not enforced.
-	 */
-	if (pt->adjusted_pf.discard_passdown) {
-		data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
-		limits->discard_granularity = max(data_limits->discard_granularity,
-						  pool->sectors_per_block << SECTOR_SHIFT);
-	} else
-		limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
-}
-
 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
 	struct pool_c *pt = ti->private;
@@ -3638,14 +3908,17 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 	disable_passdown_if_not_supported(pt);
 
-	set_discard_limits(pt, limits);
+	/*
+	 * The pool uses the same discard limits as the underlying data
+	 * device.  DM core has already set this up.
+	 */
 }
 
 static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 14, 0},
+	.version = {1, 15, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -3804,8 +4077,7 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
 	if (tc->pool->pf.discard_enabled) {
 		ti->discards_supported = true;
 		ti->num_discard_bios = 1;
-		/* Discard bios must be split on a block boundary */
-		ti->split_discard_bios = true;
+		ti->split_discard_bios = false;
 	}
 
 	mutex_unlock(&dm_thin_pool_table.mutex);
@@ -3892,6 +4164,9 @@ static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
 		}
 	}
 
+	if (h->cell)
+		cell_defer_no_holder(h->tc, h->cell);
+
 	return 0;
 }
 
@@ -4019,9 +4294,18 @@ static int thin_iterate_devices(struct dm_target *ti,
 	return 0;
 }
 
+static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+	struct thin_c *tc = ti->private;
+	struct pool *pool = tc->pool;
+
+	limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
+	limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
+}
+
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 14, 0},
+	.version = {1, 15, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
@@ -4033,6 +4317,7 @@ static struct target_type thin_target = {
 	.status = thin_status,
 	.merge = thin_merge,
 	.iterate_devices = thin_iterate_devices,
+	.io_hints = thin_io_hints,
 };
 
 /*----------------------------------------------------------------*/
-- 
cgit v1.2.1


From fd467696e8beb542144cd005ff96cd35fff41354 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Tue, 9 Jun 2015 12:31:26 -0400
Subject: dm thin: fail messages with EOPNOTSUPP when pool cannot handle
 messages

Use EOPNOTSUPP, rather than EINVAL, error code when user attempts to
send the pool a message.  Otherwise userspace is led to believe the
message failed due to invalid argument.

Reported-by: Zdenek Kabelac <zkabelac@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 99daf2e332f4..c33f61a4cc28 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3656,7 +3656,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
 	if (get_pool_mode(pool) >= PM_READ_ONLY) {
 		DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
 		      dm_device_name(pool->pool_md));
-		return -EINVAL;
+		return -EOPNOTSUPP;
 	}
 
 	if (!strcasecmp(argv[0], "create_thin"))
-- 
cgit v1.2.1


From b1f11aff04cc86daa0757ada5deb669a92a8f0fb Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Thu, 11 Jun 2015 17:11:48 +0100
Subject: dm thin metadata: fix a race when entering fail mode

In dm_thin_find_block() the ->fail_io flag was checked outside the
metadata device's root_lock, causing dm_thin_find_block() to race with
the setting of this flag.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin-metadata.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 8b521e3e1e1b..48dfe3c4d6aa 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -1390,10 +1390,11 @@ int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
 	dm_block_t keys[2] = { td->id, block };
 	struct dm_btree_info *info;
 
-	if (pmd->fail_io)
-		return -EINVAL;
-
 	down_read(&pmd->root_lock);
+	if (pmd->fail_io) {
+		up_read(&pmd->root_lock);
+		return -EINVAL;
+	}
 
 	if (can_issue_io) {
 		info = &pmd->info;
-- 
cgit v1.2.1


From c008f1d356277a5b7561040596a073d87e56b0c8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 12 Jun 2015 19:46:44 +1000
Subject: md: don't return 0 from array_state_store

Returning zero from a 'store' function is bad.
The return value should be either the length of the string
or an error.

So use 'len' if 'err' is zero.

Fixes: 6791875e2e53 ("md: make reconfig_mutex optional for writes to md sysfs files.")
Signed-off-by: NeilBrown <neilb@suse.de>
Cc: stable@vger.kernel.org (v4.0+)
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 27506302eb7a..dd59d71ade2f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3834,7 +3834,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
 				err = -EBUSY;
 		}
 		spin_unlock(&mddev->lock);
-		return err;
+		return err ?: len;
 	}
 	err = mddev_lock(mddev);
 	if (err)
-- 
cgit v1.2.1


From 8e8e2518fceca407bb8fc2a6710d19d2e217892e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 12 Jun 2015 19:51:27 +1000
Subject: md: Close race when setting 'action' to 'idle'.

Checking ->sync_thread without holding the mddev_lock()
isn't really safe, even after flushing the workqueue which
ensures md_start_sync() has been run.

While this code is waiting for the lock, md_check_recovery could reap
the thread itself, and then start another thread (e.g. recovery might
finish, then reshape starts).  When this thread gets the lock
md_start_sync() hasn't run so it doesn't get reaped, but
MD_RECOVERY_RUNNING gets cleared.  This allows two threads to start
which leads to confusion.

So don't bother if MD_RECOVERY_RUNNING isn't set, but if it is do
the flush and the test and the reap all under the mddev_lock to
avoid any race with md_check_recovery.

Signed-off-by: NeilBrown <neilb@suse.de>
Fixes: 6791875e2e53 ("md: make reconfig_mutex optional for writes to md sysfs files.")
Cc: stable@vger.kernel.org (v4.0+)
---
 drivers/md/md.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index dd59d71ade2f..8d4408baa428 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4217,13 +4217,14 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 			set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		else
 			clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-		flush_workqueue(md_misc_wq);
-		if (mddev->sync_thread) {
-			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
-			if (mddev_lock(mddev) == 0) {
+		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
+		    mddev_lock(mddev) == 0) {
+			flush_workqueue(md_misc_wq);
+			if (mddev->sync_thread) {
+				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 				md_reap_sync_thread(mddev);
-				mddev_unlock(mddev);
 			}
+			mddev_unlock(mddev);
 		}
 	} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
 		   test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
-- 
cgit v1.2.1


From ea358cd0d2c634ff1379a1392edcdf2289f31e13 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Fri, 12 Jun 2015 20:05:04 +1000
Subject: md: make sure MD_RECOVERY_DONE is clear before starting
 recovery/resync

MD_RECOVERY_DONE is normally cleared by md_check_recovery after a
resync etc finished.  However it is possible for raid5_start_reshape
to race and start a reshape before MD_RECOVERY_DONE is cleared.  This
can lead to multiple reshapes running at the same time, which isn't
good.

So make sure it is cleared before starting a reshape, and also clear
it when reaping a thread, just to be safe.

Signed-off-by: NeilBrown  <neilb@suse.de>
---
 drivers/md/md.c     | 1 +
 drivers/md/raid10.c | 1 +
 drivers/md/raid5.c  | 1 +
 3 files changed, 3 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8d4408baa428..4dbed4a67aaf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8262,6 +8262,7 @@ void md_reap_sync_thread(struct mddev *mddev)
 	if (mddev_is_clustered(mddev))
 		md_cluster_ops->metadata_update_finish(mddev);
 	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e793ab6b3570..f55c3f35b746 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4156,6 +4156,7 @@ static int raid10_start_reshape(struct mddev *mddev)
 
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 553d54b87052..b6793d2e051f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7354,6 +7354,7 @@ static int raid5_start_reshape(struct mddev *mddev)
 
 	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
 	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
 	set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
 	mddev->sync_thread = md_register_thread(md_do_sync, mddev,
-- 
cgit v1.2.1


From c31df25f20e35add6a453328c61eca15434fae18 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Wed, 6 May 2015 23:34:20 -0700
Subject: md/raid10: make sync_request_write() call bio_copy_data()

Refactor sync_request_write() of md/raid10 to use bio_copy_data()
instead of open coding bio_vec iterations.

Cc: Christoph Hellwig <hch@infradead.org>
Cc: Neil Brown <neilb@suse.de>
Cc: linux-raid@vger.kernel.org
Reviewed-by: Christoph Hellwig <hch@lst.de>
Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
[dpark: add more description in commit message]
Signed-off-by: Dongsu Park <dpark@posteo.net>
Signed-off-by: Ming Lin <mlin@kernel.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 18 ++++--------------
 1 file changed, 4 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index f55c3f35b746..03f460a1de60 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2099,17 +2099,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 		tbio->bi_rw = WRITE;
 		tbio->bi_private = r10_bio;
 		tbio->bi_iter.bi_sector = r10_bio->devs[i].addr;
-
-		for (j=0; j < vcnt ; j++) {
-			tbio->bi_io_vec[j].bv_offset = 0;
-			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
-
-			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
-			       page_address(fbio->bi_io_vec[j].bv_page),
-			       PAGE_SIZE);
-		}
 		tbio->bi_end_io = end_sync_write;
 
+		bio_copy_data(tbio, fbio);
+
 		d = r10_bio->devs[i].devnum;
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 		atomic_inc(&r10_bio->remaining);
@@ -2124,17 +2117,14 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 	 * that are active
 	 */
 	for (i = 0; i < conf->copies; i++) {
-		int j, d;
+		int d;
 
 		tbio = r10_bio->devs[i].repl_bio;
 		if (!tbio || !tbio->bi_end_io)
 			continue;
 		if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
 		    && r10_bio->devs[i].bio != fbio)
-			for (j = 0; j < vcnt; j++)
-				memcpy(page_address(tbio->bi_io_vec[j].bv_page),
-				       page_address(fbio->bi_io_vec[j].bv_page),
-				       PAGE_SIZE);
+			bio_copy_data(tbio, fbio);
 		d = r10_bio->devs[i].devnum;
 		atomic_inc(&r10_bio->remaining);
 		md_sync_acct(conf->mirrors[d].replacement->bdev,
-- 
cgit v1.2.1


From 4c9309c0cce96ea93be638dd243616e56e4bfe7d Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Sat, 16 May 2015 14:02:38 +0300
Subject: md: convert to kstrto*()

Convert away from deprecated simple_strto*() functions.

Add "fit into sector_t" checks.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 149 ++++++++++++++++++++++++++++++--------------------------
 1 file changed, 81 insertions(+), 68 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4dbed4a67aaf..b6bbcf0cc430 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2630,13 +2630,14 @@ errors_show(struct md_rdev *rdev, char *page)
 static ssize_t
 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
-	char *e;
-	unsigned long n = simple_strtoul(buf, &e, 10);
-	if (*buf && (*e == 0 || *e == '\n')) {
-		atomic_set(&rdev->corrected_errors, n);
-		return len;
-	}
-	return -EINVAL;
+	unsigned int n;
+	int rv;
+
+	rv = kstrtouint(buf, 10, &n);
+	if (rv < 0)
+		return rv;
+	atomic_set(&rdev->corrected_errors, n);
+	return len;
 }
 static struct rdev_sysfs_entry rdev_errors =
 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
@@ -2653,13 +2654,16 @@ slot_show(struct md_rdev *rdev, char *page)
 static ssize_t
 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 {
-	char *e;
+	int slot;
 	int err;
-	int slot = simple_strtoul(buf, &e, 10);
+
 	if (strncmp(buf, "none", 4)==0)
 		slot = -1;
-	else if (e==buf || (*e && *e!= '\n'))
-		return -EINVAL;
+	else {
+		err = kstrtouint(buf, 10, (unsigned int *)&slot);
+		if (err < 0)
+			return err;
+	}
 	if (rdev->mddev->pers && slot == -1) {
 		/* Setting 'slot' on an active array requires also
 		 * updating the 'rd%d' link, and communicating
@@ -3544,12 +3548,12 @@ layout_show(struct mddev *mddev, char *page)
 static ssize_t
 layout_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	char *e;
-	unsigned long n = simple_strtoul(buf, &e, 10);
+	unsigned int n;
 	int err;
 
-	if (!*buf || (*e && *e != '\n'))
-		return -EINVAL;
+	err = kstrtouint(buf, 10, &n);
+	if (err < 0)
+		return err;
 	err = mddev_lock(mddev);
 	if (err)
 		return err;
@@ -3593,12 +3597,12 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks);
 static ssize_t
 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	char *e;
+	unsigned int n;
 	int err;
-	unsigned long n = simple_strtoul(buf, &e, 10);
 
-	if (!*buf || (*e && *e != '\n'))
-		return -EINVAL;
+	err = kstrtouint(buf, 10, &n);
+	if (err < 0)
+		return err;
 
 	err = mddev_lock(mddev);
 	if (err)
@@ -3645,12 +3649,12 @@ chunk_size_show(struct mddev *mddev, char *page)
 static ssize_t
 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
 {
+	unsigned long n;
 	int err;
-	char *e;
-	unsigned long n = simple_strtoul(buf, &e, 10);
 
-	if (!*buf || (*e && *e != '\n'))
-		return -EINVAL;
+	err = kstrtoul(buf, 10, &n);
+	if (err < 0)
+		return err;
 
 	err = mddev_lock(mddev);
 	if (err)
@@ -3688,19 +3692,24 @@ resync_start_show(struct mddev *mddev, char *page)
 static ssize_t
 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
 {
+	unsigned long long n;
 	int err;
-	char *e;
-	unsigned long long n = simple_strtoull(buf, &e, 10);
+
+	if (cmd_match(buf, "none"))
+		n = MaxSector;
+	else {
+		err = kstrtoull(buf, 10, &n);
+		if (err < 0)
+			return err;
+		if (n != (sector_t)n)
+			return -EINVAL;
+	}
 
 	err = mddev_lock(mddev);
 	if (err)
 		return err;
 	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
 		err = -EBUSY;
-	else if (cmd_match(buf, "none"))
-		n = MaxSector;
-	else if (!*buf || (*e && *e != '\n'))
-		err = -EINVAL;
 
 	if (!err) {
 		mddev->recovery_cp = n;
@@ -3936,14 +3945,14 @@ max_corrected_read_errors_show(struct mddev *mddev, char *page) {
 static ssize_t
 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	char *e;
-	unsigned long n = simple_strtoul(buf, &e, 10);
+	unsigned int n;
+	int rv;
 
-	if (*buf && (*e == 0 || *e == '\n')) {
-		atomic_set(&mddev->max_corr_read_errors, n);
-		return len;
-	}
-	return -EINVAL;
+	rv = kstrtouint(buf, 10, &n);
+	if (rv < 0)
+		return rv;
+	atomic_set(&mddev->max_corr_read_errors, n);
+	return len;
 }
 
 static struct md_sysfs_entry max_corr_read_errors =
@@ -4300,15 +4309,18 @@ sync_min_show(struct mddev *mddev, char *page)
 static ssize_t
 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	int min;
-	char *e;
+	unsigned int min;
+	int rv;
+
 	if (strncmp(buf, "system", 6)==0) {
-		mddev->sync_speed_min = 0;
-		return len;
+		min = 0;
+	} else {
+		rv = kstrtouint(buf, 10, &min);
+		if (rv < 0)
+			return rv;
+		if (min == 0)
+			return -EINVAL;
 	}
-	min = simple_strtoul(buf, &e, 10);
-	if (buf == e || (*e && *e != '\n') || min <= 0)
-		return -EINVAL;
 	mddev->sync_speed_min = min;
 	return len;
 }
@@ -4326,15 +4338,18 @@ sync_max_show(struct mddev *mddev, char *page)
 static ssize_t
 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	int max;
-	char *e;
+	unsigned int max;
+	int rv;
+
 	if (strncmp(buf, "system", 6)==0) {
-		mddev->sync_speed_max = 0;
-		return len;
+		max = 0;
+	} else {
+		rv = kstrtouint(buf, 10, &max);
+		if (rv < 0)
+			return rv;
+		if (max == 0)
+			return -EINVAL;
 	}
-	max = simple_strtoul(buf, &e, 10);
-	if (buf == e || (*e && *e != '\n') || max <= 0)
-		return -EINVAL;
 	mddev->sync_speed_max = max;
 	return len;
 }
@@ -4517,12 +4532,13 @@ suspend_lo_show(struct mddev *mddev, char *page)
 static ssize_t
 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	char *e;
-	unsigned long long new = simple_strtoull(buf, &e, 10);
-	unsigned long long old;
+	unsigned long long old, new;
 	int err;
 
-	if (buf == e || (*e && *e != '\n'))
+	err = kstrtoull(buf, 10, &new);
+	if (err < 0)
+		return err;
+	if (new != (sector_t)new)
 		return -EINVAL;
 
 	err = mddev_lock(mddev);
@@ -4559,12 +4575,13 @@ suspend_hi_show(struct mddev *mddev, char *page)
 static ssize_t
 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
 {
-	char *e;
-	unsigned long long new = simple_strtoull(buf, &e, 10);
-	unsigned long long old;
+	unsigned long long old, new;
 	int err;
 
-	if (buf == e || (*e && *e != '\n'))
+	err = kstrtoull(buf, 10, &new);
+	if (err < 0)
+		return err;
+	if (new != (sector_t)new)
 		return -EINVAL;
 
 	err = mddev_lock(mddev);
@@ -4606,11 +4623,13 @@ static ssize_t
 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
 {
 	struct md_rdev *rdev;
-	char *e;
+	unsigned long long new;
 	int err;
-	unsigned long long new = simple_strtoull(buf, &e, 10);
 
-	if (buf == e || (*e && *e != '\n'))
+	err = kstrtoull(buf, 10, &new);
+	if (err < 0)
+		return err;
+	if (new != (sector_t)new)
 		return -EINVAL;
 	err = mddev_lock(mddev);
 	if (err)
@@ -9013,13 +9032,7 @@ static int get_ro(char *buffer, struct kernel_param *kp)
 }
 static int set_ro(const char *val, struct kernel_param *kp)
 {
-	char *e;
-	int num = simple_strtoul(val, &e, 10);
-	if (*val && (*e == '\0' || *e == '\n')) {
-		start_readonly = num;
-		return 0;
-	}
-	return -EINVAL;
+	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
 }
 
 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
-- 
cgit v1.2.1


From b1b4648648e18775082858eca2517322f63e57a1 Mon Sep 17 00:00:00 2001
From: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Date: Fri, 8 May 2015 18:19:06 +1000
Subject: md/raid5: split wait_for_stripe and introduce wait_for_quiescent

I noticed heavy spin lock contention in get_active_stripe(), arising
at the wake-up stage, where a bunch of processes all try to re-acquire
the spin lock.

After giving some thought to this issue, I found the contention could be
relieved (and even avoided) if we turn wait_for_stripe into a separate
waitqueue for each lock hash and make the wake up exclusive: wake up
one process each time, which avoids the lock contention naturally.

Before hacking on wait_for_stripe, I found it actually has 2
usages: for the array to enter or leave the quiescent state, and also
to wait for an available stripe in each of the hash lists.

So this patch splits the first usage off into a separate wait_queue,
wait_for_quiescent, and the next patch will turn the second usage into
one waitqueue for each hash value, and make it exclusive, to relieve
the lock contention.

v2: wake_up(wait_for_quiescent) when (active_stripes == 0)
    Commit log refactor suggestion from Neil.

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 15 +++++++++------
 drivers/md/raid5.h |  1 +
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b6793d2e051f..a9112b39afee 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -374,6 +374,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 
 	if (do_wakeup) {
 		wake_up(&conf->wait_for_stripe);
+		if (atomic_read(&conf->active_stripes) == 0)
+			wake_up(&conf->wait_for_quiescent);
 		if (conf->retry_read_aligned)
 			md_wakeup_thread(conf->mddev->thread);
 	}
@@ -667,7 +669,7 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 	spin_lock_irq(conf->hash_locks + hash);
 
 	do {
-		wait_event_lock_irq(conf->wait_for_stripe,
+		wait_event_lock_irq(conf->wait_for_quiescent,
 				    conf->quiesce == 0 || noquiesce,
 				    *(conf->hash_locks + hash));
 		sh = __find_stripe(conf, sector, conf->generation - previous);
@@ -4760,7 +4762,7 @@ static void raid5_align_endio(struct bio *bi, int error)
 					 raid_bi, 0);
 		bio_endio(raid_bi, 0);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
-			wake_up(&conf->wait_for_stripe);
+			wake_up(&conf->wait_for_quiescent);
 		return;
 	}
 
@@ -4855,7 +4857,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
 		align_bi->bi_iter.bi_sector += rdev->data_offset;
 
 		spin_lock_irq(&conf->device_lock);
-		wait_event_lock_irq(conf->wait_for_stripe,
+		wait_event_lock_irq(conf->wait_for_quiescent,
 				    conf->quiesce == 0,
 				    conf->device_lock);
 		atomic_inc(&conf->active_aligned_reads);
@@ -5699,7 +5701,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
 		bio_endio(raid_bio, 0);
 	}
 	if (atomic_dec_and_test(&conf->active_aligned_reads))
-		wake_up(&conf->wait_for_stripe);
+		wake_up(&conf->wait_for_quiescent);
 	return handled;
 }
 
@@ -6433,6 +6435,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 		goto abort;
 	spin_lock_init(&conf->device_lock);
 	seqcount_init(&conf->gen_lock);
+	init_waitqueue_head(&conf->wait_for_quiescent);
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
@@ -7466,7 +7469,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 		 * active stripes can drain
 		 */
 		conf->quiesce = 2;
-		wait_event_cmd(conf->wait_for_stripe,
+		wait_event_cmd(conf->wait_for_quiescent,
 				    atomic_read(&conf->active_stripes) == 0 &&
 				    atomic_read(&conf->active_aligned_reads) == 0,
 				    unlock_all_device_hash_locks_irq(conf),
@@ -7480,7 +7483,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
 	case 0: /* re-enable writes */
 		lock_all_device_hash_locks_irq(conf);
 		conf->quiesce = 0;
-		wake_up(&conf->wait_for_stripe);
+		wake_up(&conf->wait_for_quiescent);
 		wake_up(&conf->wait_for_overlap);
 		unlock_all_device_hash_locks_irq(conf);
 		break;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 896d603ad0da..9b84b8820fc5 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -511,6 +511,7 @@ struct r5conf {
 	struct list_head	inactive_list[NR_STRIPE_HASH_LOCKS];
 	atomic_t		empty_inactive_list_nr;
 	struct llist_head	released_stripes;
+	wait_queue_head_t	wait_for_quiescent;
 	wait_queue_head_t	wait_for_stripe;
 	wait_queue_head_t	wait_for_overlap;
 	unsigned long		cache_state;
-- 
cgit v1.2.1


From e9e4c377e2f563892c50d1d093dd55c7d518fc3d Mon Sep 17 00:00:00 2001
From: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Date: Fri, 8 May 2015 18:19:07 +1000
Subject: md/raid5: per hash value and exclusive wait_for_stripe
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

I noticed heavy spin lock contention at get_active_stripe() with fsmark
multiple thread write workloads.

Here is where this hot contention comes from. We have limited stripes, and
it's a multiple thread write workload. Hence, those stripes will be taken
soon, which puts later processes to sleep waiting for free stripes. When
enough stripes (>= 1/4 total stripes) are released, all processes are woken,
trying to get the lock. But only one of them is able to get this lock
for each hash lock, leaving the other processes spinning while they try to
acquire the lock.

Thus, it's ineffective to wake up all processes and let them battle for
a lock that only one of them can take at a time. Instead, we could make
it an exclusive wake up: wake up one process only. That avoids the heavy
spin lock contention naturally.

To do the exclusive wake up, we've to split wait_for_stripe into multiple
wait queues, to make it per hash value, just like the hash lock.

Here are some test results I have got with this patch applied(all test run
3 times):

`fsmark.files_per_sec'
=====================

next-20150317                 this patch
-------------------------     -------------------------
metric_value     ±stddev      metric_value     ±stddev     change      testbox/benchmark/testcase-params
-------------------------     -------------------------   --------     ------------------------------
      25.600     ±0.0              92.700     ±2.5          262.1%     ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
      25.600     ±0.0              77.800     ±0.6          203.9%     ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
      32.000     ±0.0              93.800     ±1.7          193.1%     ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
      32.000     ±0.0              81.233     ±1.7          153.9%     ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
      48.800     ±14.5             99.667     ±2.0          104.2%     ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
       6.400     ±0.0              12.800     ±0.0          100.0%     ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
      63.133     ±8.2              82.800     ±0.7           31.2%     ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
     245.067     ±0.7             306.567     ±7.9           25.1%     ivb44/fsmark/1x-64t-4BRD_12G-RAID5-f2fs-4M-30G-fsyncBeforeClose
      17.533     ±0.3              21.000     ±0.8           19.8%     ivb44/fsmark/1x-1t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose
     188.167     ±1.9             215.033     ±3.1           14.3%     ivb44/fsmark/1x-1t-4BRD_12G-RAID5-btrfs-4M-30G-NoSync
     254.500     ±1.8             290.733     ±2.4           14.2%     ivb44/fsmark/1x-1t-9BRD_6G-RAID5-btrfs-4M-30G-NoSync

`time.system_time'
=====================

next-20150317                 this patch
-------------------------    -------------------------
metric_value     ±stddev     metric_value     ±stddev     change       testbox/benchmark/testcase-params
-------------------------    -------------------------    --------     ------------------------------
    7235.603     ±1.2             185.163     ±1.9          -97.4%     ivb44/fsmark/1x-64t-4BRD_12G-RAID5-btrfs-4M-30G-fsyncBeforeClose
    7666.883     ±2.9             202.750     ±1.0          -97.4%     ivb44/fsmark/1x-64t-9BRD_6G-RAID5-btrfs-4M-30G-fsyncBeforeClose
   14567.893     ±0.7             421.230     ±0.4          -97.1%     ivb44/fsmark/1x-64t-3HDD-RAID5-btrfs-4M-40G-fsyncBeforeClose
    3697.667     ±14.0            148.190     ±1.7          -96.0%     ivb44/fsmark/1x-64t-4BRD_12G-RAID5-xfs-4M-30G-fsyncBeforeClose
    5572.867     ±3.8             310.717     ±1.4          -94.4%     ivb44/fsmark/1x-64t-9BRD_6G-RAID5-ext4-4M-30G-fsyncBeforeClose
    5565.050     ±0.5             313.277     ±1.5          -94.4%     ivb44/fsmark/1x-64t-4BRD_12G-RAID5-ext4-4M-30G-fsyncBeforeClose
    2420.707     ±17.1            171.043     ±2.7          -92.9%     ivb44/fsmark/1x-64t-9BRD_6G-RAID5-xfs-4M-30G-fsyncBeforeClose
    3743.300     ±4.6             379.827     ±3.5          -89.9%     ivb44/fsmark/1x-64t-3HDD-RAID5-ext4-4M-40G-fsyncBeforeClose
    3308.687     ±6.3             363.050     ±2.0          -89.0%     ivb44/fsmark/1x-64t-3HDD-RAID5-xfs-4M-40G-fsyncBeforeClose

Where,

     1x: where 'x' means iterations or loop, corresponding to the '-L' option of fsmark

     1t, 64t: where 't' means thread

     4M: means the single file size, corresponding to the '-s' option of fsmark
     40G, 30G, 120G: means the total test size

     4BRD_12G: BRD is the ramdisk, where '4' means 4 ramdisk, and where '12G' means
               the size of one ramdisk. So, it would be 48G in total. And we made a
               raid on those ramdisk

As you can see, though there is not much performance gain for hard disk
workloads, the system time dropped heavily, by up to 97%. And as expected,
the performance increased a lot, up to 260%, for fast devices (ram disk).

v2: use bits instead of an array to note down which wait queues need to be woken up.

Signed-off-by: Yuanhan Liu <yuanhan.liu@linux.intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 27 +++++++++++++++++++--------
 drivers/md/raid5.h |  2 +-
 2 files changed, 20 insertions(+), 9 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a9112b39afee..9a3b143b0b68 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -344,7 +344,8 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 					 int hash)
 {
 	int size;
-	bool do_wakeup = false;
+	unsigned long do_wakeup = 0;
+	int i = 0;
 	unsigned long flags;
 
 	if (hash == NR_STRIPE_HASH_LOCKS) {
@@ -365,15 +366,19 @@ static void release_inactive_stripe_list(struct r5conf *conf,
 			    !list_empty(list))
 				atomic_dec(&conf->empty_inactive_list_nr);
 			list_splice_tail_init(list, conf->inactive_list + hash);
-			do_wakeup = true;
+			do_wakeup |= 1 << hash;
 			spin_unlock_irqrestore(conf->hash_locks + hash, flags);
 		}
 		size--;
 		hash--;
 	}
 
+	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+		if (do_wakeup & (1 << i))
+			wake_up(&conf->wait_for_stripe[i]);
+	}
+
 	if (do_wakeup) {
-		wake_up(&conf->wait_for_stripe);
 		if (atomic_read(&conf->active_stripes) == 0)
 			wake_up(&conf->wait_for_quiescent);
 		if (conf->retry_read_aligned)
@@ -686,14 +691,15 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 			if (!sh) {
 				set_bit(R5_INACTIVE_BLOCKED,
 					&conf->cache_state);
-				wait_event_lock_irq(
-					conf->wait_for_stripe,
+				wait_event_exclusive_cmd(
+					conf->wait_for_stripe[hash],
 					!list_empty(conf->inactive_list + hash) &&
 					(atomic_read(&conf->active_stripes)
 					 < (conf->max_nr_stripes * 3 / 4)
 					 || !test_bit(R5_INACTIVE_BLOCKED,
 						      &conf->cache_state)),
-					*(conf->hash_locks + hash));
+					spin_unlock_irq(conf->hash_locks + hash),
+					spin_lock_irq(conf->hash_locks + hash));
 				clear_bit(R5_INACTIVE_BLOCKED,
 					  &conf->cache_state);
 			} else {
@@ -718,6 +724,9 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 		}
 	} while (sh == NULL);
 
+	if (!list_empty(conf->inactive_list + hash))
+		wake_up(&conf->wait_for_stripe[hash]);
+
 	spin_unlock_irq(conf->hash_locks + hash);
 	return sh;
 }
@@ -2179,7 +2188,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
 	cnt = 0;
 	list_for_each_entry(nsh, &newstripes, lru) {
 		lock_device_hash_lock(conf, hash);
-		wait_event_cmd(conf->wait_for_stripe,
+		wait_event_exclusive_cmd(conf->wait_for_stripe[hash],
 				    !list_empty(conf->inactive_list + hash),
 				    unlock_device_hash_lock(conf, hash),
 				    lock_device_hash_lock(conf, hash));
@@ -6436,7 +6445,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	spin_lock_init(&conf->device_lock);
 	seqcount_init(&conf->gen_lock);
 	init_waitqueue_head(&conf->wait_for_quiescent);
-	init_waitqueue_head(&conf->wait_for_stripe);
+	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
+		init_waitqueue_head(&conf->wait_for_stripe[i]);
+	}
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
 	INIT_LIST_HEAD(&conf->hold_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 9b84b8820fc5..02c3bf8fbfe7 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -512,7 +512,7 @@ struct r5conf {
 	atomic_t		empty_inactive_list_nr;
 	struct llist_head	released_stripes;
 	wait_queue_head_t	wait_for_quiescent;
-	wait_queue_head_t	wait_for_stripe;
+	wait_queue_head_t	wait_for_stripe[NR_STRIPE_HASH_LOCKS];
 	wait_queue_head_t	wait_for_overlap;
 	unsigned long		cache_state;
 #define R5_INACTIVE_BLOCKED	1	/* release of inactive stripes blocked,
-- 
cgit v1.2.1


From 713bc5c2deb437ef40d48374ae6a57e030bd9e35 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shli@fb.com>
Date: Thu, 28 May 2015 17:33:47 -0700
Subject: md/raid5: ignore released_stripes check

The conf->released_stripes list doesn't always indicate whether there are
free stripes pending. Active stripes can be on the list too, and even the
free stripes on it were active very recently.

Signed-off-by: Shaohua Li <shli@fb.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9a3b143b0b68..59e44e99eef3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -681,8 +681,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector,
 		if (!sh) {
 			if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
 				sh = get_free_stripe(conf, hash);
-				if (!sh && llist_empty(&conf->released_stripes) &&
-				    !test_bit(R5_DID_ALLOC, &conf->cache_state))
+				if (!sh && !test_bit(R5_DID_ALLOC,
+						     &conf->cache_state))
 					set_bit(R5_ALLOC_MORE,
 						&conf->cache_state);
 			}
-- 
cgit v1.2.1


From 4e023612325a9034a542bfab79f78b1fe5ebb841 Mon Sep 17 00:00:00 2001
From: Firo Yang <firogm@gmail.com>
Date: Thu, 11 Jun 2015 09:41:10 +0800
Subject: md: fix a build warning

Warning like this:

drivers/md/md.c: In function "update_array_info":
drivers/md/md.c:6394:26: warning: logical not is only applied
to the left hand side of comparison [-Wlogical-not-parentheses]
      !mddev->persistent  != info->not_persistent||

Fix it as Neil Brown said:
mddev->persistent != !info->not_persistent ||

Signed-off-by: Firo Yang <firogm@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b6bbcf0cc430..3d339e283cf6 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6394,7 +6394,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
 	    mddev->ctime         != info->ctime         ||
 	    mddev->level         != info->level         ||
 /*	    mddev->layout        != info->layout        || */
-	    !mddev->persistent	 != info->not_persistent||
+	    mddev->persistent	 != !info->not_persistent ||
 	    mddev->chunk_sectors != info->chunk_size >> 9 ||
 	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
 	    ((state^info->state) & 0xfffffe00)
-- 
cgit v1.2.1


From 6096d91af0b65a3967139b32d5adbb3647858a26 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Wed, 17 Jun 2015 13:35:19 +0100
Subject: dm space map metadata: fix occasional leak of a metadata block on
 resize

The metadata space map has a simplified 'bootstrap' mode that is
operational when extending the space maps.  Whilst in this mode it's
possible for some refcount decrement operations to become queued (eg, as
a result of shadowing one of the bitmap indexes).  These decrements were
not being applied when switching out of bootstrap mode.

The effect of this bug was the leaking of a 4k metadata block.  This is
detected by the latest version of thin_check as a non fatal error.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/persistent-data/dm-space-map-metadata.c | 50 +++++++++++++++-------
 1 file changed, 35 insertions(+), 15 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
index e8a904298887..53091295fce9 100644
--- a/drivers/md/persistent-data/dm-space-map-metadata.c
+++ b/drivers/md/persistent-data/dm-space-map-metadata.c
@@ -204,6 +204,27 @@ static void in(struct sm_metadata *smm)
 	smm->recursion_count++;
 }
 
+static int apply_bops(struct sm_metadata *smm)
+{
+	int r = 0;
+
+	while (!brb_empty(&smm->uncommitted)) {
+		struct block_op bop;
+
+		r = brb_pop(&smm->uncommitted, &bop);
+		if (r) {
+			DMERR("bug in bop ring buffer");
+			break;
+		}
+
+		r = commit_bop(smm, &bop);
+		if (r)
+			break;
+	}
+
+	return r;
+}
+
 static int out(struct sm_metadata *smm)
 {
 	int r = 0;
@@ -216,21 +237,8 @@ static int out(struct sm_metadata *smm)
 		return -ENOMEM;
 	}
 
-	if (smm->recursion_count == 1) {
-		while (!brb_empty(&smm->uncommitted)) {
-			struct block_op bop;
-
-			r = brb_pop(&smm->uncommitted, &bop);
-			if (r) {
-				DMERR("bug in bop ring buffer");
-				break;
-			}
-
-			r = commit_bop(smm, &bop);
-			if (r)
-				break;
-		}
-	}
+	if (smm->recursion_count == 1)
+		apply_bops(smm);
 
 	smm->recursion_count--;
 
@@ -704,6 +712,12 @@ static int sm_metadata_extend(struct dm_space_map *sm, dm_block_t extra_blocks)
 		}
 		old_len = smm->begin;
 
+		r = apply_bops(smm);
+		if (r) {
+			DMERR("%s: apply_bops failed", __func__);
+			goto out;
+		}
+
 		r = sm_ll_commit(&smm->ll);
 		if (r)
 			goto out;
@@ -773,6 +787,12 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
 	if (r)
 		return r;
 
+	r = apply_bops(smm);
+	if (r) {
+		DMERR("%s: apply_bops failed", __func__);
+		return r;
+	}
+
 	return sm_metadata_commit(sm);
 }
 
-- 
cgit v1.2.1


From bccab6a01afc26f53d91762d78153513cad10b29 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 17 Jun 2015 11:43:38 -0400
Subject: dm cache: switch the "default" cache replacement policy from mq to
 smq

The Stochastic multiqueue (SMQ) policy (vs MQ) offers the promise of
less memory utilization, improved performance and increased adaptability
in the face of changing workloads.  SMQ also does not have any
cumbersome tuning knobs.

Users may switch from "mq" to "smq" simply by appropriately reloading a
DM table that is using the cache target.  Doing so will cause all of the
mq policy's hints to be dropped.  Also, performance of the cache may
degrade slightly until smq recalculates the origin device's hotspots
that should be cached.

In the future the "mq" policy will just silently make use of "smq" and
the mq code will be removed.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Acked-by: Joe Thornber <ejt@redhat.com>
---
 drivers/md/dm-cache-policy-mq.c  | 34 +++++-----------------------------
 drivers/md/dm-cache-policy-smq.c | 17 +++++++++++++++++
 2 files changed, 22 insertions(+), 29 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-mq.c b/drivers/md/dm-cache-policy-mq.c
index 838665bb495a..32814371b8d3 100644
--- a/drivers/md/dm-cache-policy-mq.c
+++ b/drivers/md/dm-cache-policy-mq.c
@@ -1431,21 +1431,12 @@ bad_pre_cache_init:
 
 static struct dm_cache_policy_type mq_policy_type = {
 	.name = "mq",
-	.version = {1, 3, 0},
+	.version = {1, 4, 0},
 	.hint_size = 4,
 	.owner = THIS_MODULE,
 	.create = mq_create
 };
 
-static struct dm_cache_policy_type default_policy_type = {
-	.name = "default",
-	.version = {1, 3, 0},
-	.hint_size = 4,
-	.owner = THIS_MODULE,
-	.create = mq_create,
-	.real = &mq_policy_type
-};
-
 static int __init mq_init(void)
 {
 	int r;
@@ -1455,36 +1446,21 @@ static int __init mq_init(void)
 					   __alignof__(struct entry),
 					   0, NULL);
 	if (!mq_entry_cache)
-		goto bad;
+		return -ENOMEM;
 
 	r = dm_cache_policy_register(&mq_policy_type);
 	if (r) {
 		DMERR("register failed %d", r);
-		goto bad_register_mq;
-	}
-
-	r = dm_cache_policy_register(&default_policy_type);
-	if (!r) {
-		DMINFO("version %u.%u.%u loaded",
-		       mq_policy_type.version[0],
-		       mq_policy_type.version[1],
-		       mq_policy_type.version[2]);
-		return 0;
+		kmem_cache_destroy(mq_entry_cache);
+		return -ENOMEM;
 	}
 
-	DMERR("register failed (as default) %d", r);
-
-	dm_cache_policy_unregister(&mq_policy_type);
-bad_register_mq:
-	kmem_cache_destroy(mq_entry_cache);
-bad:
-	return -ENOMEM;
+	return 0;
 }
 
 static void __exit mq_exit(void)
 {
 	dm_cache_policy_unregister(&mq_policy_type);
-	dm_cache_policy_unregister(&default_policy_type);
 
 	kmem_cache_destroy(mq_entry_cache);
 }
diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 66feb307e697..80f02d3330e2 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1748,6 +1748,15 @@ static struct dm_cache_policy_type smq_policy_type = {
 	.create = smq_create
 };
 
+static struct dm_cache_policy_type default_policy_type = {
+	.name = "default",
+	.version = {1, 0, 0},
+	.hint_size = 4,
+	.owner = THIS_MODULE,
+	.create = smq_create,
+	.real = &smq_policy_type
+};
+
 static int __init smq_init(void)
 {
 	int r;
@@ -1758,12 +1767,20 @@ static int __init smq_init(void)
 		return -ENOMEM;
 	}
 
+	r = dm_cache_policy_register(&default_policy_type);
+	if (r) {
+		DMERR("register failed (as default) %d", r);
+		dm_cache_policy_unregister(&smq_policy_type);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
 static void __exit smq_exit(void)
 {
 	dm_cache_policy_unregister(&smq_policy_type);
+	dm_cache_policy_unregister(&default_policy_type);
 }
 
 module_init(smq_init);
-- 
cgit v1.2.1


From dd4c1b7d0c95be1c9245118a3accc41a16f1db67 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 5 Jun 2015 09:50:42 -0400
Subject: dm stats: fix divide by zero if 'number_of_areas' arg is zero

If the number_of_areas argument was zero the kernel would crash on
div-by-zero.  Add better input validation.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # v3.12+
---
 drivers/md/dm-stats.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 492fe6a5ebf2..d1fd31a6dd1a 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -792,6 +792,8 @@ static int message_stats_create(struct mapped_device *md,
 		return -EINVAL;
 
 	if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
+		if (!divisor)
+			return -EINVAL;
 		step = end - start;
 		if (do_div(step, divisor))
 			step++;
-- 
cgit v1.2.1


From c96aec344de0de857ef3d7fba53992c7ba311e1e Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 9 Jun 2015 17:21:39 -0400
Subject: dm stats: support precise timestamps

Make it possible to use precise timestamps with nanosecond granularity
in dm statistics.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-stats.c | 138 ++++++++++++++++++++++++++++++++++++--------------
 drivers/md/dm-stats.h |   4 +-
 2 files changed, 103 insertions(+), 39 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index d1fd31a6dd1a..4bfd84ab1d4a 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -33,13 +33,14 @@ struct dm_stat_percpu {
 
 struct dm_stat_shared {
 	atomic_t in_flight[2];
-	unsigned long stamp;
+	unsigned long long stamp;
 	struct dm_stat_percpu tmp;
 };
 
 struct dm_stat {
 	struct list_head list_entry;
 	int id;
+	unsigned stat_flags;
 	size_t n_entries;
 	sector_t start;
 	sector_t end;
@@ -53,6 +54,8 @@ struct dm_stat {
 	struct dm_stat_shared stat_shared[0];
 };
 
+#define STAT_PRECISE_TIMESTAMPS		1
+
 struct dm_stats_last_position {
 	sector_t last_sector;
 	unsigned last_rw;
@@ -224,7 +227,8 @@ void dm_stats_cleanup(struct dm_stats *stats)
 }
 
 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
-			   sector_t step, const char *program_id, const char *aux_data,
+			   sector_t step, unsigned stat_flags,
+			   const char *program_id, const char *aux_data,
 			   void (*suspend_callback)(struct mapped_device *),
 			   void (*resume_callback)(struct mapped_device *),
 			   struct mapped_device *md)
@@ -265,6 +269,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	if (!s)
 		return -ENOMEM;
 
+	s->stat_flags = stat_flags;
 	s->n_entries = n_entries;
 	s->start = start;
 	s->end = end;
@@ -414,18 +419,24 @@ static int dm_stats_list(struct dm_stats *stats, const char *program,
 	return 1;
 }
 
-static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *p)
+static void dm_stat_round(struct dm_stat *s, struct dm_stat_shared *shared,
+			  struct dm_stat_percpu *p)
 {
 	/*
 	 * This is racy, but so is part_round_stats_single.
 	 */
-	unsigned long now = jiffies;
-	unsigned in_flight_read;
-	unsigned in_flight_write;
-	unsigned long difference = now - shared->stamp;
+	unsigned long long now, difference;
+	unsigned in_flight_read, in_flight_write;
+
+	if (likely(!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)))
+		now = jiffies;
+	else
+		now = ktime_to_ns(ktime_get());
 
+	difference = now - shared->stamp;
 	if (!difference)
 		return;
+
 	in_flight_read = (unsigned)atomic_read(&shared->in_flight[READ]);
 	in_flight_write = (unsigned)atomic_read(&shared->in_flight[WRITE]);
 	if (in_flight_read)
@@ -440,8 +451,9 @@ static void dm_stat_round(struct dm_stat_shared *shared, struct dm_stat_percpu *
 }
 
 static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
-			      unsigned long bi_rw, sector_t len, bool merged,
-			      bool end, unsigned long duration)
+			      unsigned long bi_rw, sector_t len,
+			      struct dm_stats_aux *stats_aux, bool end,
+			      unsigned long duration_jiffies)
 {
 	unsigned long idx = bi_rw & REQ_WRITE;
 	struct dm_stat_shared *shared = &s->stat_shared[entry];
@@ -471,15 +483,18 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 	p = &s->stat_percpu[smp_processor_id()][entry];
 
 	if (!end) {
-		dm_stat_round(shared, p);
+		dm_stat_round(s, shared, p);
 		atomic_inc(&shared->in_flight[idx]);
 	} else {
-		dm_stat_round(shared, p);
+		dm_stat_round(s, shared, p);
 		atomic_dec(&shared->in_flight[idx]);
 		p->sectors[idx] += len;
 		p->ios[idx] += 1;
-		p->merges[idx] += merged;
-		p->ticks[idx] += duration;
+		p->merges[idx] += stats_aux->merged;
+		if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))
+			p->ticks[idx] += duration_jiffies;
+		else
+			p->ticks[idx] += stats_aux->duration_ns;
 	}
 
 #if BITS_PER_LONG == 32
@@ -491,7 +506,7 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 
 static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 			  sector_t bi_sector, sector_t end_sector,
-			  bool end, unsigned long duration,
+			  bool end, unsigned long duration_jiffies,
 			  struct dm_stats_aux *stats_aux)
 {
 	sector_t rel_sector, offset, todo, fragment_len;
@@ -520,7 +535,7 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 		if (fragment_len > s->step - offset)
 			fragment_len = s->step - offset;
 		dm_stat_for_entry(s, entry, bi_rw, fragment_len,
-				  stats_aux->merged, end, duration);
+				  stats_aux, end, duration_jiffies);
 		todo -= fragment_len;
 		entry++;
 		offset = 0;
@@ -529,11 +544,13 @@ static void __dm_stat_bio(struct dm_stat *s, unsigned long bi_rw,
 
 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 			 sector_t bi_sector, unsigned bi_sectors, bool end,
-			 unsigned long duration, struct dm_stats_aux *stats_aux)
+			 unsigned long duration_jiffies,
+			 struct dm_stats_aux *stats_aux)
 {
 	struct dm_stat *s;
 	sector_t end_sector;
 	struct dm_stats_last_position *last;
+	bool got_precise_time;
 
 	if (unlikely(!bi_sectors))
 		return;
@@ -557,8 +574,17 @@ void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 
 	rcu_read_lock();
 
-	list_for_each_entry_rcu(s, &stats->list, list_entry)
-		__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration, stats_aux);
+	got_precise_time = false;
+	list_for_each_entry_rcu(s, &stats->list, list_entry) {
+		if (s->stat_flags & STAT_PRECISE_TIMESTAMPS && !got_precise_time) {
+			if (!end)
+				stats_aux->duration_ns = ktime_to_ns(ktime_get());
+			else
+				stats_aux->duration_ns = ktime_to_ns(ktime_get()) - stats_aux->duration_ns;
+			got_precise_time = true;
+		}
+		__dm_stat_bio(s, bi_rw, bi_sector, end_sector, end, duration_jiffies, stats_aux);
+	}
 
 	rcu_read_unlock();
 }
@@ -571,7 +597,7 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
 
 	local_irq_disable();
 	p = &s->stat_percpu[smp_processor_id()][x];
-	dm_stat_round(shared, p);
+	dm_stat_round(s, shared, p);
 	local_irq_enable();
 
 	memset(&shared->tmp, 0, sizeof(shared->tmp));
@@ -643,11 +669,15 @@ static int dm_stats_clear(struct dm_stats *stats, int id)
 /*
  * This is like jiffies_to_msec, but works for 64-bit values.
  */
-static unsigned long long dm_jiffies_to_msec64(unsigned long long j)
+static unsigned long long dm_jiffies_to_msec64(struct dm_stat *s, unsigned long long j)
 {
-	unsigned long long result = 0;
+	unsigned long long result;
 	unsigned mult;
 
+	if (s->stat_flags & STAT_PRECISE_TIMESTAMPS)
+		return j;
+
+	result = 0;
 	if (j)
 		result = jiffies_to_msecs(j & 0x3fffff);
 	if (j >= 1 << 22) {
@@ -709,16 +739,16 @@ static int dm_stats_print(struct dm_stats *stats, int id,
 		       shared->tmp.ios[READ],
 		       shared->tmp.merges[READ],
 		       shared->tmp.sectors[READ],
-		       dm_jiffies_to_msec64(shared->tmp.ticks[READ]),
+		       dm_jiffies_to_msec64(s, shared->tmp.ticks[READ]),
 		       shared->tmp.ios[WRITE],
 		       shared->tmp.merges[WRITE],
 		       shared->tmp.sectors[WRITE],
-		       dm_jiffies_to_msec64(shared->tmp.ticks[WRITE]),
+		       dm_jiffies_to_msec64(s, shared->tmp.ticks[WRITE]),
 		       dm_stat_in_flight(shared),
-		       dm_jiffies_to_msec64(shared->tmp.io_ticks_total),
-		       dm_jiffies_to_msec64(shared->tmp.time_in_queue),
-		       dm_jiffies_to_msec64(shared->tmp.io_ticks[READ]),
-		       dm_jiffies_to_msec64(shared->tmp.io_ticks[WRITE]));
+		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks_total),
+		       dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
+		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
+		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
 
 		if (unlikely(sz + 1 >= maxlen))
 			goto buffer_overflow;
@@ -769,21 +799,31 @@ static int message_stats_create(struct mapped_device *md,
 	unsigned long long start, end, len, step;
 	unsigned divisor;
 	const char *program_id, *aux_data;
+	unsigned stat_flags = 0;
+
+	struct dm_arg_set as, as_backup;
+	const char *a;
+	unsigned feature_args;
 
 	/*
 	 * Input format:
-	 *   <range> <step> [<program_id> [<aux_data>]]
+	 *   <range> <step> [<extra_parameters> <parameters>] [<program_id> [<aux_data>]]
 	 */
 
-	if (argc < 3 || argc > 5)
+	if (argc < 3)
 		return -EINVAL;
 
-	if (!strcmp(argv[1], "-")) {
+	as.argc = argc;
+	as.argv = argv;
+	dm_consume_args(&as, 1);
+
+	a = dm_shift_arg(&as);
+	if (!strcmp(a, "-")) {
 		start = 0;
 		len = dm_get_size(md);
 		if (!len)
 			len = 1;
-	} else if (sscanf(argv[1], "%llu+%llu%c", &start, &len, &dummy) != 2 ||
+	} else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
 		   start != (sector_t)start || len != (sector_t)len)
 		return -EINVAL;
 
@@ -791,7 +831,8 @@ static int message_stats_create(struct mapped_device *md,
 	if (start >= end)
 		return -EINVAL;
 
-	if (sscanf(argv[2], "/%u%c", &divisor, &dummy) == 1) {
+	a = dm_shift_arg(&as);
+	if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
 		if (!divisor)
 			return -EINVAL;
 		step = end - start;
@@ -799,18 +840,39 @@ static int message_stats_create(struct mapped_device *md,
 			step++;
 		if (!step)
 			step = 1;
-	} else if (sscanf(argv[2], "%llu%c", &step, &dummy) != 1 ||
+	} else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
 		   step != (sector_t)step || !step)
 		return -EINVAL;
 
+	as_backup = as;
+	a = dm_shift_arg(&as);
+	if (a && sscanf(a, "%u%c", &feature_args, &dummy) == 1) {
+		while (feature_args--) {
+			a = dm_shift_arg(&as);
+			if (!a)
+				return -EINVAL;
+			if (!strcasecmp(a, "precise_timestamps"))
+				stat_flags |= STAT_PRECISE_TIMESTAMPS;
+			else
+				return -EINVAL;
+		}
+	} else {
+		as = as_backup;
+	}
+
 	program_id = "-";
 	aux_data = "-";
 
-	if (argc > 3)
-		program_id = argv[3];
+	a = dm_shift_arg(&as);
+	if (a)
+		program_id = a;
+
+	a = dm_shift_arg(&as);
+	if (a)
+		aux_data = a;
 
-	if (argc > 4)
-		aux_data = argv[4];
+	if (as.argc)
+		return -EINVAL;
 
 	/*
 	 * If a buffer overflow happens after we created the region,
@@ -822,7 +884,7 @@ static int message_stats_create(struct mapped_device *md,
 	if (dm_message_test_buffer_overflow(result, maxlen))
 		return 1;
 
-	id = dm_stats_create(dm_get_stats(md), start, end, step, program_id, aux_data,
+	id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data,
 			     dm_internal_suspend_fast, dm_internal_resume_fast, md);
 	if (id < 0)
 		return id;
diff --git a/drivers/md/dm-stats.h b/drivers/md/dm-stats.h
index e7c4984bf235..f1c0956e3843 100644
--- a/drivers/md/dm-stats.h
+++ b/drivers/md/dm-stats.h
@@ -18,6 +18,7 @@ struct dm_stats {
 
 struct dm_stats_aux {
 	bool merged;
+	unsigned long long duration_ns;
 };
 
 void dm_stats_init(struct dm_stats *st);
@@ -30,7 +31,8 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 
 void dm_stats_account_io(struct dm_stats *stats, unsigned long bi_rw,
 			 sector_t bi_sector, unsigned bi_sectors, bool end,
-			 unsigned long duration, struct dm_stats_aux *aux);
+			 unsigned long duration_jiffies,
+			 struct dm_stats_aux *aux);
 
 static inline bool dm_stats_used(struct dm_stats *st)
 {
-- 
cgit v1.2.1


From dfcfac3e4cd94abef779297fab6adfd2dbcf52fa Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 9 Jun 2015 17:22:05 -0400
Subject: dm stats: collect and report histogram of IO latencies

Add an option to dm statistics to collect and report a histogram of
IO latencies.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-stats.c | 205 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 183 insertions(+), 22 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index 4bfd84ab1d4a..faf1071ef631 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -29,6 +29,7 @@ struct dm_stat_percpu {
 	unsigned long long io_ticks[2];
 	unsigned long long io_ticks_total;
 	unsigned long long time_in_queue;
+	unsigned long long *histogram;
 };
 
 struct dm_stat_shared {
@@ -45,11 +46,14 @@ struct dm_stat {
 	sector_t start;
 	sector_t end;
 	sector_t step;
+	unsigned n_histogram_entries;
+	unsigned long long *histogram_boundaries;
 	const char *program_id;
 	const char *aux_data;
 	struct rcu_head rcu_head;
 	size_t shared_alloc_size;
 	size_t percpu_alloc_size;
+	size_t histogram_alloc_size;
 	struct dm_stat_percpu *stat_percpu[NR_CPUS];
 	struct dm_stat_shared stat_shared[0];
 };
@@ -173,8 +177,11 @@ static void dm_stat_free(struct rcu_head *head)
 
 	kfree(s->program_id);
 	kfree(s->aux_data);
-	for_each_possible_cpu(cpu)
+	for_each_possible_cpu(cpu) {
+		dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
 		dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
+	}
+	dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
 	dm_kvfree(s, s->shared_alloc_size);
 }
 
@@ -228,6 +235,8 @@ void dm_stats_cleanup(struct dm_stats *stats)
 
 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 			   sector_t step, unsigned stat_flags,
+			   unsigned n_histogram_entries,
+			   unsigned long long *histogram_boundaries,
 			   const char *program_id, const char *aux_data,
 			   void (*suspend_callback)(struct mapped_device *),
 			   void (*resume_callback)(struct mapped_device *),
@@ -239,6 +248,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	size_t ni;
 	size_t shared_alloc_size;
 	size_t percpu_alloc_size;
+	size_t histogram_alloc_size;
 	struct dm_stat_percpu *p;
 	int cpu;
 	int ret_id;
@@ -262,7 +272,12 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
 		return -EOVERFLOW;
 
-	if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
+	histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
+	if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
+		return -EOVERFLOW;
+
+	if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
+				 num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
 		return -ENOMEM;
 
 	s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
@@ -276,6 +291,15 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 	s->step = step;
 	s->shared_alloc_size = shared_alloc_size;
 	s->percpu_alloc_size = percpu_alloc_size;
+	s->histogram_alloc_size = histogram_alloc_size;
+
+	s->n_histogram_entries = n_histogram_entries;
+	s->histogram_boundaries = kmemdup(histogram_boundaries,
+					  s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+	if (!s->histogram_boundaries) {
+		r = -ENOMEM;
+		goto out;
+	}
 
 	s->program_id = kstrdup(program_id, GFP_KERNEL);
 	if (!s->program_id) {
@@ -293,6 +317,19 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 		atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
 	}
 
+	if (s->n_histogram_entries) {
+		unsigned long long *hi;
+		hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
+		if (!hi) {
+			r = -ENOMEM;
+			goto out;
+		}
+		for (ni = 0; ni < n_entries; ni++) {
+			s->stat_shared[ni].tmp.histogram = hi;
+			hi += s->n_histogram_entries + 1;
+		}
+	}
+
 	for_each_possible_cpu(cpu) {
 		p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
 		if (!p) {
@@ -300,6 +337,18 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
 			goto out;
 		}
 		s->stat_percpu[cpu] = p;
+		if (s->n_histogram_entries) {
+			unsigned long long *hi;
+			hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
+			if (!hi) {
+				r = -ENOMEM;
+				goto out;
+			}
+			for (ni = 0; ni < n_entries; ni++) {
+				p[ni].histogram = hi;
+				hi += s->n_histogram_entries + 1;
+			}
+		}
 	}
 
 	/*
@@ -377,9 +426,11 @@ static int dm_stats_delete(struct dm_stats *stats, int id)
 	 * vfree can't be called from RCU callback
 	 */
 	for_each_possible_cpu(cpu)
-		if (is_vmalloc_addr(s->stat_percpu))
+		if (is_vmalloc_addr(s->stat_percpu) ||
+		    is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
 			goto do_sync_free;
-	if (is_vmalloc_addr(s)) {
+	if (is_vmalloc_addr(s) ||
+	    is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
 do_sync_free:
 		synchronize_rcu_expedited();
 		dm_stat_free(&s->rcu_head);
@@ -486,15 +537,32 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
 		dm_stat_round(s, shared, p);
 		atomic_inc(&shared->in_flight[idx]);
 	} else {
+		unsigned long long duration;
 		dm_stat_round(s, shared, p);
 		atomic_dec(&shared->in_flight[idx]);
 		p->sectors[idx] += len;
 		p->ios[idx] += 1;
 		p->merges[idx] += stats_aux->merged;
-		if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))
+		if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
 			p->ticks[idx] += duration_jiffies;
-		else
+			duration = jiffies_to_msecs(duration_jiffies);
+		} else {
 			p->ticks[idx] += stats_aux->duration_ns;
+			duration = stats_aux->duration_ns;
+		}
+		if (s->n_histogram_entries) {
+			unsigned lo = 0, hi = s->n_histogram_entries + 1;
+			while (lo + 1 < hi) {
+				unsigned mid = (lo + hi) / 2;
+				if (s->histogram_boundaries[mid - 1] > duration) {
+					hi = mid;
+				} else {
+					lo = mid;
+				}
+
+			}
+			p->histogram[lo]++;
+		}
 	}
 
 #if BITS_PER_LONG == 32
@@ -600,7 +668,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
 	dm_stat_round(s, shared, p);
 	local_irq_enable();
 
-	memset(&shared->tmp, 0, sizeof(shared->tmp));
+	shared->tmp.sectors[READ] = 0;
+	shared->tmp.sectors[WRITE] = 0;
+	shared->tmp.ios[READ] = 0;
+	shared->tmp.ios[WRITE] = 0;
+	shared->tmp.merges[READ] = 0;
+	shared->tmp.merges[WRITE] = 0;
+	shared->tmp.ticks[READ] = 0;
+	shared->tmp.ticks[WRITE] = 0;
+	shared->tmp.io_ticks[READ] = 0;
+	shared->tmp.io_ticks[WRITE] = 0;
+	shared->tmp.io_ticks_total = 0;
+	shared->tmp.time_in_queue = 0;
+
+	if (s->n_histogram_entries)
+		memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
+
 	for_each_possible_cpu(cpu) {
 		p = &s->stat_percpu[cpu][x];
 		shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
@@ -615,6 +698,11 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
 		shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
 		shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
 		shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+		if (s->n_histogram_entries) {
+			unsigned i;
+			for (i = 0; i < s->n_histogram_entries + 1; i++)
+				shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
+		}
 	}
 }
 
@@ -644,6 +732,15 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
 		p->io_ticks_total -= shared->tmp.io_ticks_total;
 		p->time_in_queue -= shared->tmp.time_in_queue;
 		local_irq_enable();
+		if (s->n_histogram_entries) {
+			unsigned i;
+			for (i = 0; i < s->n_histogram_entries + 1; i++) {
+				local_irq_disable();
+				p = &s->stat_percpu[smp_processor_id()][x];
+				p->histogram[i] -= shared->tmp.histogram[i];
+				local_irq_enable();
+			}
+		}
 	}
 }
 
@@ -733,7 +830,7 @@ static int dm_stats_print(struct dm_stats *stats, int id,
 
 		__dm_stat_init_temporary_percpu_totals(shared, s, x);
 
-		DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
+		DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
 		       (unsigned long long)start,
 		       (unsigned long long)step,
 		       shared->tmp.ios[READ],
@@ -749,6 +846,13 @@ static int dm_stats_print(struct dm_stats *stats, int id,
 		       dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
 		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
 		       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
+		if (s->n_histogram_entries) {
+			unsigned i;
+			for (i = 0; i < s->n_histogram_entries + 1; i++) {
+				DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
+			}
+		}
+		DMEMIT("\n");
 
 		if (unlikely(sz + 1 >= maxlen))
 			goto buffer_overflow;
@@ -790,10 +894,47 @@ static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data
 	return 0;
 }
 
+static int parse_histogram(const char *h, unsigned *n_histogram_entries,
+			   unsigned long long **histogram_boundaries)
+{
+	const char *q;
+	unsigned n;
+	unsigned long long last;
+
+	*n_histogram_entries = 1;
+	for (q = h; *q; q++)
+		if (*q == ',')
+			(*n_histogram_entries)++;
+
+	*histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+	if (!*histogram_boundaries)
+		return -ENOMEM;
+
+	n = 0;
+	last = 0;
+	while (1) {
+		unsigned long long hi;
+		int s;
+		char ch;
+		s = sscanf(h, "%llu%c", &hi, &ch);
+		if (!s || (s == 2 && ch != ','))
+			return -EINVAL;
+		if (hi <= last)
+			return -EINVAL;
+		last = hi;
+		(*histogram_boundaries)[n] = hi;
+		if (s == 1)
+			return 0;
+		h = strchr(h, ',') + 1;
+		n++;
+	}
+}
+
 static int message_stats_create(struct mapped_device *md,
 				unsigned argc, char **argv,
 				char *result, unsigned maxlen)
 {
+	int r;
 	int id;
 	char dummy;
 	unsigned long long start, end, len, step;
@@ -801,6 +942,9 @@ static int message_stats_create(struct mapped_device *md,
 	const char *program_id, *aux_data;
 	unsigned stat_flags = 0;
 
+	unsigned n_histogram_entries = 0;
+	unsigned long long *histogram_boundaries = NULL;
+
 	struct dm_arg_set as, as_backup;
 	const char *a;
 	unsigned feature_args;
@@ -811,7 +955,7 @@ static int message_stats_create(struct mapped_device *md,
 	 */
 
 	if (argc < 3)
-		return -EINVAL;
+		goto ret_einval;
 
 	as.argc = argc;
 	as.argv = argv;
@@ -825,11 +969,11 @@ static int message_stats_create(struct mapped_device *md,
 			len = 1;
 	} else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
 		   start != (sector_t)start || len != (sector_t)len)
-		return -EINVAL;
+		goto ret_einval;
 
 	end = start + len;
 	if (start >= end)
-		return -EINVAL;
+		goto ret_einval;
 
 	a = dm_shift_arg(&as);
 	if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
@@ -842,7 +986,7 @@ static int message_stats_create(struct mapped_device *md,
 			step = 1;
 	} else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
 		   step != (sector_t)step || !step)
-		return -EINVAL;
+		goto ret_einval;
 
 	as_backup = as;
 	a = dm_shift_arg(&as);
@@ -850,11 +994,16 @@ static int message_stats_create(struct mapped_device *md,
 		while (feature_args--) {
 			a = dm_shift_arg(&as);
 			if (!a)
-				return -EINVAL;
+				goto ret_einval;
 			if (!strcasecmp(a, "precise_timestamps"))
 				stat_flags |= STAT_PRECISE_TIMESTAMPS;
-			else
-				return -EINVAL;
+			else if (!strncasecmp(a, "histogram:", 10)) {
+				if (n_histogram_entries)
+					goto ret_einval;
+				if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
+					goto ret;
+			} else
+				goto ret_einval;
 		}
 	} else {
 		as = as_backup;
@@ -872,7 +1021,7 @@ static int message_stats_create(struct mapped_device *md,
 		aux_data = a;
 
 	if (as.argc)
-		return -EINVAL;
+		goto ret_einval;
 
 	/*
 	 * If a buffer overflow happens after we created the region,
@@ -881,17 +1030,29 @@ static int message_stats_create(struct mapped_device *md,
 	 * leaked).  So we must detect buffer overflow in advance.
 	 */
 	snprintf(result, maxlen, "%d", INT_MAX);
-	if (dm_message_test_buffer_overflow(result, maxlen))
-		return 1;
+	if (dm_message_test_buffer_overflow(result, maxlen)) {
+		r = 1;
+		goto ret;
+	}
 
-	id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data,
+	id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
+			     n_histogram_entries, histogram_boundaries, program_id, aux_data,
 			     dm_internal_suspend_fast, dm_internal_resume_fast, md);
-	if (id < 0)
-		return id;
+	if (id < 0) {
+		r = id;
+		goto ret;
+	}
 
 	snprintf(result, maxlen, "%d", id);
 
-	return 1;
+	r = 1;
+	goto ret;
+
+ret_einval:
+	r = -EINVAL;
+ret:
+	kfree(histogram_boundaries);
+	return r;
 }
 
 static int message_stats_delete(struct mapped_device *md,
-- 
cgit v1.2.1


From e262f34741522e0d821642e5449c6eeb512723fc Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Tue, 9 Jun 2015 17:22:49 -0400
Subject: dm stats: add support for request-based DM devices

This makes it possible to use dm stats with DM multipath.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-stats.c |  5 -----
 drivers/md/dm.c       | 26 ++++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stats.c b/drivers/md/dm-stats.c
index faf1071ef631..8a8b48fa901a 100644
--- a/drivers/md/dm-stats.c
+++ b/drivers/md/dm-stats.c
@@ -1155,11 +1155,6 @@ int dm_stats_message(struct mapped_device *md, unsigned argc, char **argv,
 {
 	int r;
 
-	if (dm_request_based(md)) {
-		DMWARN("Statistics are only supported for bio-based devices");
-		return -EOPNOTSUPP;
-	}
-
 	/* All messages here must start with '@' */
 	if (!strcasecmp(argv[0], "@stats_create"))
 		r = message_stats_create(md, argc, argv, result, maxlen);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 767bce906588..90dc49e3c78f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -86,6 +86,9 @@ struct dm_rq_target_io {
 	struct kthread_work work;
 	int error;
 	union map_info info;
+	struct dm_stats_aux stats_aux;
+	unsigned long duration_jiffies;
+	unsigned n_sectors;
 };
 
 /*
@@ -995,6 +998,17 @@ static struct dm_rq_target_io *tio_from_request(struct request *rq)
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
 }
 
+static void rq_end_stats(struct mapped_device *md, struct request *orig)
+{
+	if (unlikely(dm_stats_used(&md->stats))) {
+		struct dm_rq_target_io *tio = tio_from_request(orig);
+		tio->duration_jiffies = jiffies - tio->duration_jiffies;
+		dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+				    tio->n_sectors, true, tio->duration_jiffies,
+				    &tio->stats_aux);
+	}
+}
+
 /*
  * Don't touch any member of the md after calling this function because
  * the md may be freed in dm_put() at the end of this function.
@@ -1078,6 +1092,7 @@ static void dm_end_request(struct request *clone, int error)
 	}
 
 	free_rq_clone(clone);
+	rq_end_stats(md, rq);
 	if (!rq->q->mq_ops)
 		blk_end_request_all(rq, error);
 	else
@@ -1120,6 +1135,7 @@ static void dm_requeue_original_request(struct mapped_device *md,
 
 	dm_unprep_request(rq);
 
+	rq_end_stats(md, rq);
 	if (!rq->q->mq_ops)
 		old_requeue_request(rq);
 	else {
@@ -1211,6 +1227,7 @@ static void dm_softirq_done(struct request *rq)
 	int rw;
 
 	if (!clone) {
+		rq_end_stats(tio->md, rq);
 		rw = rq_data_dir(rq);
 		if (!rq->q->mq_ops) {
 			blk_end_request_all(rq, tio->error);
@@ -1943,6 +1960,14 @@ static void dm_start_request(struct mapped_device *md, struct request *orig)
 		md->last_rq_start_time = ktime_get();
 	}
 
+	if (unlikely(dm_stats_used(&md->stats))) {
+		struct dm_rq_target_io *tio = tio_from_request(orig);
+		tio->duration_jiffies = jiffies;
+		tio->n_sectors = blk_rq_sectors(orig);
+		dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
+				    tio->n_sectors, false, 0, &tio->stats_aux);
+	}
+
 	/*
 	 * Hold the md reference here for the in-flight I/O.
 	 * We can't rely on the reference count by device opener,
@@ -2689,6 +2714,7 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
 		/* Direct call is fine since .queue_rq allows allocations */
 		if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
 			/* Undo dm_start_request() before requeuing */
+			rq_end_stats(md, rq);
 			rq_completed(md, rq_data_dir(rq), false);
 			return BLK_MQ_RQ_QUEUE_BUSY;
 		}
-- 
cgit v1.2.1


From 9bf39ab2adafd7cf8740859cb49e7b7952813a5d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 19 Jun 2015 10:29:13 +0200
Subject: vfs: add file_path() helper

Turn
	d_path(&file->f_path, ...);
into
	file_path(file, ...);

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/md/bitmap.c | 2 +-
 drivers/md/md.c     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 2bc56e2a3526..dda33d648354 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -834,7 +834,7 @@ static void bitmap_file_kick(struct bitmap *bitmap)
 		if (bitmap->storage.file) {
 			path = kmalloc(PAGE_SIZE, GFP_KERNEL);
 			if (path)
-				ptr = d_path(&bitmap->storage.file->f_path,
+				ptr = file_path(bitmap->storage.file,
 					     path, PAGE_SIZE);
 
 			printk(KERN_ALERT
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 593a02476c78..e67f3ac137bf 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5741,7 +5741,7 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
 	/* bitmap disabled, zero the first byte and copy out */
 	if (!mddev->bitmap_info.file)
 		file->pathname[0] = '\0';
-	else if ((ptr = d_path(&mddev->bitmap_info.file->f_path,
+	else if ((ptr = file_path(mddev->bitmap_info.file,
 			       file->pathname, sizeof(file->pathname))),
 		 IS_ERR(ptr))
 		err = PTR_ERR(ptr);
-- 
cgit v1.2.1


From 2726d56620ce71f40dd583d51391b86e1ab8cc57 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 19 Jun 2015 10:30:28 +0200
Subject: vfs: add seq_file_path() helper

Turn
	seq_path(..., &file->f_path, ...);
into
	seq_file_path(..., file, ...);

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/md/bitmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index dda33d648354..3813fdfee4be 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -1922,7 +1922,7 @@ void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
 		   chunk_kb ? "KB" : "B");
 	if (bitmap->storage.file) {
 		seq_printf(seq, ", file: ");
-		seq_path(seq, &bitmap->storage.file->f_path, " \t\n");
+		seq_file_path(seq, bitmap->storage.file, " \t\n");
 	}
 
 	seq_printf(seq, "\n");
-- 
cgit v1.2.1


From bd6919228d7e1867ae9e24ab27e3e4a366c87d21 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 25 Jun 2015 17:01:40 +1000
Subject: md: clear mddev->private when it has been freed.

If ->private is set when ->run is called, it is assumed to be
a 'config'  prepared as part of 'reshape'.

So it is important when we free that config, that we also clear ->private.
This is not often a problem as the mddev will normally be discarded
shortly after the config is freed.
However if an 'assemble' races with a final close, the assemble can use
the old mddev which has a stale ->private.  This leads to any of
various sorts of crashes.

So clear ->private after calling ->free().

Reported-by: Nate Clark <nate@neworld.us>
Cc: stable@vger.kernel.org (v4.0+)
Fixes: afa0f557cb15 ("md: rename ->stop to ->free")
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3d339e283cf6..939739f0f881 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5178,6 +5178,7 @@ int md_run(struct mddev *mddev)
 		mddev_detach(mddev);
 		if (mddev->private)
 			pers->free(mddev, mddev->private);
+		mddev->private = NULL;
 		module_put(pers->owner);
 		bitmap_destroy(mddev);
 		return err;
@@ -5313,6 +5314,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->changed = 0;
 	mddev->degraded = 0;
 	mddev->safemode = 0;
+	mddev->private = NULL;
 	mddev->merge_check_needed = 0;
 	mddev->bitmap_info.offset = 0;
 	mddev->bitmap_info.default_offset = 0;
@@ -5385,6 +5387,7 @@ static void __md_stop(struct mddev *mddev)
 	mddev->pers = NULL;
 	spin_unlock(&mddev->lock);
 	pers->free(mddev, mddev->private);
+	mddev->private = NULL;
 	if (pers->sync_request && mddev->to_remove == NULL)
 		mddev->to_remove = &md_redundancy_group;
 	module_put(pers->owner);
-- 
cgit v1.2.1


From 9a8c0fa861e4db60409b4dda254cef5e17e4d43c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 25 Jun 2015 17:06:40 +1000
Subject: md: unlock mddev_lock on an error path.

This error path returns while still holding the lock - bad.

Fixes: 6791875e2e53 ("md: make reconfig_mutex optional for writes to md sysfs files.")
Cc: stable@vger.kernel.org (v4.0+)
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 939739f0f881..5fcce7371ee9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4014,8 +4014,10 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 	else
 		rdev = md_import_device(dev, -1, -1);
 
-	if (IS_ERR(rdev))
+	if (IS_ERR(rdev)) {
+		mddev_unlock(mddev);
 		return PTR_ERR(rdev);
+	}
 	err = bind_rdev_to_array(rdev, mddev);
  out:
 	if (err)
-- 
cgit v1.2.1


From ab16bfc732c436658d13455f28b0b4a2608a7476 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Wed, 17 Jun 2015 12:31:46 +1000
Subject: md: clear Blocked flag on failed devices when array is read-only.

The Blocked flag indicates that a device has failed but that this
fact hasn't been recorded in the metadata yet.  Writes to such
devices cannot be allowed until the metadata has been updated.

On a read-only array, the Blocked flag will never be cleared.
This prevents the device being removed from the array.

If the metadata is being handled by the kernel
(i.e. !mddev->external), then we can be sure that if the array is
switched to writable, then a metadata update will happen and will
record the failure.  So we don't need the flag set.

If metadata is externally managed, it is up to the external manager
to clear the 'blocked' flag.

Reported-by: XiaoNi <xni@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5fcce7371ee9..202deb43b822 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8130,6 +8130,15 @@ void md_check_recovery(struct mddev *mddev)
 		int spares = 0;
 
 		if (mddev->ro) {
+			struct md_rdev *rdev;
+			if (!mddev->external && mddev->in_sync)
+				/* 'Blocked' flag not needed as failed devices
+				 * will be recorded if array switched to read/write.
+				 * Leaving it set will prevent the device
+				 * from being removed.
+				 */
+				rdev_for_each(rdev, mddev)
+					clear_bit(Blocked, &rdev->flags);
 			/* On a read-only array we can:
 			 * - remove failed devices
 			 * - add already-in_sync devices if the array itself
-- 
cgit v1.2.1


From 90a9befb20bd455b167b02d4018b5e882da76505 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 25 Jun 2015 15:02:36 -0700
Subject: drivers/md/md.c: use strreplace()

There's no point in starting over when we meet a '/'.  This also
eliminates a stack variable and a little .text.

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Acked-by: NeilBrown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/md.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4dbed4a67aaf..8d9f89b4519d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2024,7 +2024,6 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 {
 	char b[BDEVNAME_SIZE];
 	struct kobject *ko;
-	char *s;
 	int err;
 
 	/* prevent duplicates */
@@ -2070,8 +2069,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 		return -EBUSY;
 	}
 	bdevname(rdev->bdev,b);
-	while ( (s=strchr(b, '/')) != NULL)
-		*s = '!';
+	strreplace(b, '/', '!');
 
 	rdev->mddev = mddev;
 	printk(KERN_INFO "md: bind<%s>\n", b);
-- 
cgit v1.2.1


From 4e6e36c3714364b65f2bfea8c73691c663891726 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 26 Jun 2015 09:42:57 -0400
Subject: Revert "dm: do not allocate any mempools for blk-mq request-based DM"

This reverts commit cbc4e3c1350beb47beab8f34ad9be3d34a20c705.

Reported-by: Junichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c |  4 +--
 drivers/md/dm.c       | 69 +++++++++++++++++++++++----------------------------
 2 files changed, 33 insertions(+), 40 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 85e1d39e9a38..a5f94125ad01 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -964,8 +964,8 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 		return -EINVAL;
 	}
 
-	if (IS_ERR(t->mempools))
-		return PTR_ERR(t->mempools);
+	if (!t->mempools)
+		return -ENOMEM;
 
 	return 0;
 }
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 90dc49e3c78f..492181e16c69 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2349,52 +2349,39 @@ static void free_dev(struct mapped_device *md)
 	kfree(md);
 }
 
-static unsigned filter_md_type(unsigned type, struct mapped_device *md)
-{
-	if (type == DM_TYPE_BIO_BASED)
-		return type;
-
-	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
-}
-
 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 {
 	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
 
-	switch (filter_md_type(dm_table_get_type(t), md)) {
-	case DM_TYPE_BIO_BASED:
-		if (md->bs && md->io_pool) {
+	if (md->bs) {
+		/* The md already has necessary mempools. */
+		if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
 			/*
-			 * This bio-based md already has necessary mempools.
 			 * Reload bioset because front_pad may have changed
 			 * because a different table was loaded.
 			 */
 			bioset_free(md->bs);
 			md->bs = p->bs;
 			p->bs = NULL;
-			goto out;
 		}
-		break;
-	case DM_TYPE_REQUEST_BASED:
-		if (md->rq_pool && md->io_pool)
-			/*
-			 * This request-based md already has necessary mempools.
-			 */
-			goto out;
-		break;
-	case DM_TYPE_MQ_REQUEST_BASED:
-		BUG_ON(p); /* No mempools needed */
-		return;
+		/*
+		 * There's no need to reload with request-based dm
+		 * because the size of front_pad doesn't change.
+		 * Note for future: If you are to reload bioset,
+		 * prep-ed requests in the queue may refer
+		 * to bio from the old bioset, so you must walk
+		 * through the queue to unprep.
+		 */
+		goto out;
 	}
 
-	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
-
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
 	p->rq_pool = NULL;
 	md->bs = p->bs;
 	p->bs = NULL;
+
 out:
 	/* mempool bind completed, no longer need any mempools in the table */
 	dm_table_free_md_mempools(t);
@@ -2774,6 +2761,14 @@ out_tag_set:
 	return err;
 }
 
+static unsigned filter_md_type(unsigned type, struct mapped_device *md)
+{
+	if (type == DM_TYPE_BIO_BASED)
+		return type;
+
+	return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
+}
+
 /*
  * Setup the DM device's queue based on md's type
  */
@@ -3495,7 +3490,7 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 
 	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 
 	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
 		offsetof(struct dm_target_io, clone);
@@ -3514,26 +3509,24 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 	return pools;
 out:
 	dm_free_md_mempools(pools);
-	return ERR_PTR(-ENOMEM);
+	return NULL;
 }
 
 struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
 					    unsigned type)
 {
-	unsigned int pool_size;
+	unsigned int pool_size = dm_get_reserved_rq_based_ios();
 	struct dm_md_mempools *pools;
 
-	if (filter_md_type(type, md) == DM_TYPE_MQ_REQUEST_BASED)
-		return NULL; /* No mempools needed */
-
-	pool_size = dm_get_reserved_rq_based_ios();
 	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 
-	pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-	if (!pools->rq_pool)
-		goto out;
+	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
+		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+		if (!pools->rq_pool)
+			goto out;
+	}
 
 	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
 	if (!pools->io_pool)
@@ -3542,7 +3535,7 @@ struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
 	return pools;
 out:
 	dm_free_md_mempools(pools);
-	return ERR_PTR(-ENOMEM);
+	return NULL;
 }
 
 void dm_free_md_mempools(struct dm_md_mempools *pools)
-- 
cgit v1.2.1


From 78d8e58a086b214dddf1fd463e20a7e1d82d7866 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 26 Jun 2015 10:01:13 -0400
Subject: Revert "block, dm: don't copy bios for request clones"

This reverts commit 5f1b670d0bef508a5554d92525f5f6d00d640b38.

Justification for revert as reported in this dm-devel post:
https://www.redhat.com/archives/dm-devel/2015-June/msg00160.html

this change should not be pushed to mainline yet.

Firstly, Christoph has a newer version of the patch that fixes silent
data corruption problem:
  https://www.redhat.com/archives/dm-devel/2015-May/msg00229.html

And the new version still depends on LLDDs to always complete requests
to the end when error happens, while block API doesn't enforce such a
requirement. If the assumption is ever broken, the inconsistency between
request and bio (e.g. rq->__sector and rq->bio) will cause silent data
corruption:
  https://www.redhat.com/archives/dm-devel/2015-June/msg00022.html

Reported-by: Junichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-table.c |  25 +++-----
 drivers/md/dm.c       | 171 ++++++++++++++++++++++++++++++++++++++------------
 drivers/md/dm.h       |   5 +-
 3 files changed, 142 insertions(+), 59 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index a5f94125ad01..16ba55ad7089 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -942,28 +942,21 @@ static int dm_table_alloc_md_mempools(struct dm_table *t, struct mapped_device *
 {
 	unsigned type = dm_table_get_type(t);
 	unsigned per_bio_data_size = 0;
+	struct dm_target *tgt;
 	unsigned i;
 
-	switch (type) {
-	case DM_TYPE_BIO_BASED:
-		for (i = 0; i < t->num_targets; i++) {
-			struct dm_target *tgt = t->targets + i;
-
-			per_bio_data_size = max(per_bio_data_size,
-						tgt->per_bio_data_size);
-		}
-		t->mempools = dm_alloc_bio_mempools(t->integrity_supported,
-						    per_bio_data_size);
-		break;
-	case DM_TYPE_REQUEST_BASED:
-	case DM_TYPE_MQ_REQUEST_BASED:
-		t->mempools = dm_alloc_rq_mempools(md, type);
-		break;
-	default:
+	if (unlikely(type == DM_TYPE_NONE)) {
 		DMWARN("no table type is set, can't allocate mempools");
 		return -EINVAL;
 	}
 
+	if (type == DM_TYPE_BIO_BASED)
+		for (i = 0; i < t->num_targets; i++) {
+			tgt = t->targets + i;
+			per_bio_data_size = max(per_bio_data_size, tgt->per_bio_data_size);
+		}
+
+	t->mempools = dm_alloc_md_mempools(md, type, t->integrity_supported, per_bio_data_size);
 	if (!t->mempools)
 		return -ENOMEM;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 492181e16c69..9d942aef0f75 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -993,6 +993,57 @@ static void clone_endio(struct bio *bio, int error)
 	dec_pending(io, error);
 }
 
+/*
+ * Partial completion handling for request-based dm
+ */
+static void end_clone_bio(struct bio *clone, int error)
+{
+	struct dm_rq_clone_bio_info *info =
+		container_of(clone, struct dm_rq_clone_bio_info, clone);
+	struct dm_rq_target_io *tio = info->tio;
+	struct bio *bio = info->orig;
+	unsigned int nr_bytes = info->orig->bi_iter.bi_size;
+
+	bio_put(clone);
+
+	if (tio->error)
+		/*
+		 * An error has already been detected on the request.
+		 * Once error occurred, just let clone->end_io() handle
+		 * the remainder.
+		 */
+		return;
+	else if (error) {
+		/*
+		 * Don't notice the error to the upper layer yet.
+		 * The error handling decision is made by the target driver,
+		 * when the request is completed.
+		 */
+		tio->error = error;
+		return;
+	}
+
+	/*
+	 * I/O for the bio successfully completed.
+	 * Notice the data completion to the upper layer.
+	 */
+
+	/*
+	 * bios are processed from the head of the list.
+	 * So the completing bio should always be rq->bio.
+	 * If it's not, something wrong is happening.
+	 */
+	if (tio->orig->bio != bio)
+		DMERR("bio completion is going in the middle of the request");
+
+	/*
+	 * Update the original request.
+	 * Do not use blk_end_request() here, because it may complete
+	 * the original request before the clone, and break the ordering.
+	 */
+	blk_update_request(tio->orig, 0, nr_bytes);
+}
+
 static struct dm_rq_target_io *tio_from_request(struct request *rq)
 {
 	return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
@@ -1050,6 +1101,8 @@ static void free_rq_clone(struct request *clone)
 	struct dm_rq_target_io *tio = clone->end_io_data;
 	struct mapped_device *md = tio->md;
 
+	blk_rq_unprep_clone(clone);
+
 	if (md->type == DM_TYPE_MQ_REQUEST_BASED)
 		/* stacked on blk-mq queue(s) */
 		tio->ti->type->release_clone_rq(clone);
@@ -1784,13 +1837,39 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
 		dm_complete_request(rq, r);
 }
 
-static void setup_clone(struct request *clone, struct request *rq,
-		        struct dm_rq_target_io *tio)
+static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
+				 void *data)
 {
-	blk_rq_prep_clone(clone, rq);
+	struct dm_rq_target_io *tio = data;
+	struct dm_rq_clone_bio_info *info =
+		container_of(bio, struct dm_rq_clone_bio_info, clone);
+
+	info->orig = bio_orig;
+	info->tio = tio;
+	bio->bi_end_io = end_clone_bio;
+
+	return 0;
+}
+
+static int setup_clone(struct request *clone, struct request *rq,
+		       struct dm_rq_target_io *tio, gfp_t gfp_mask)
+{
+	int r;
+
+	r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
+			      dm_rq_bio_constructor, tio);
+	if (r)
+		return r;
+
+	clone->cmd = rq->cmd;
+	clone->cmd_len = rq->cmd_len;
+	clone->sense = rq->sense;
 	clone->end_io = end_clone_request;
 	clone->end_io_data = tio;
+
 	tio->clone = clone;
+
+	return 0;
 }
 
 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
@@ -1811,7 +1890,12 @@ static struct request *clone_rq(struct request *rq, struct mapped_device *md,
 		clone = tio->clone;
 
 	blk_rq_init(NULL, clone);
-	setup_clone(clone, rq, tio);
+	if (setup_clone(clone, rq, tio, gfp_mask)) {
+		/* -ENOMEM */
+		if (alloc_clone)
+			free_clone_request(md, clone);
+		return NULL;
+	}
 
 	return clone;
 }
@@ -1905,7 +1989,11 @@ static int map_request(struct dm_rq_target_io *tio, struct request *rq,
 		}
 		if (r != DM_MAPIO_REMAPPED)
 			return r;
-		setup_clone(clone, rq, tio);
+		if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
+			/* -ENOMEM */
+			ti->type->release_clone_rq(clone);
+			return DM_MAPIO_REQUEUE;
+		}
 	}
 
 	switch (r) {
@@ -2375,6 +2463,8 @@ static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
 		goto out;
 	}
 
+	BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
+
 	md->io_pool = p->io_pool;
 	p->io_pool = NULL;
 	md->rq_pool = p->rq_pool;
@@ -3481,23 +3571,48 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
-struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
-					     unsigned per_bio_data_size)
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size)
 {
-	struct dm_md_mempools *pools;
-	unsigned int pool_size = dm_get_reserved_bio_based_ios();
+	struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+	struct kmem_cache *cachep = NULL;
+	unsigned int pool_size = 0;
 	unsigned int front_pad;
 
-	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
 	if (!pools)
 		return NULL;
 
-	front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) +
-		offsetof(struct dm_target_io, clone);
+	type = filter_md_type(type, md);
 
-	pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
-	if (!pools->io_pool)
-		goto out;
+	switch (type) {
+	case DM_TYPE_BIO_BASED:
+		cachep = _io_cache;
+		pool_size = dm_get_reserved_bio_based_ios();
+		front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+		break;
+	case DM_TYPE_REQUEST_BASED:
+		cachep = _rq_tio_cache;
+		pool_size = dm_get_reserved_rq_based_ios();
+		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
+		if (!pools->rq_pool)
+			goto out;
+		/* fall through to setup remaining rq-based pools */
+	case DM_TYPE_MQ_REQUEST_BASED:
+		if (!pool_size)
+			pool_size = dm_get_reserved_rq_based_ios();
+		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
+		/* per_bio_data_size is not used. See __bind_mempools(). */
+		WARN_ON(per_bio_data_size != 0);
+		break;
+	default:
+		BUG();
+	}
+
+	if (cachep) {
+		pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
+		if (!pools->io_pool)
+			goto out;
+	}
 
 	pools->bs = bioset_create_nobvec(pool_size, front_pad);
 	if (!pools->bs)
@@ -3507,34 +3622,10 @@ struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
 		goto out;
 
 	return pools;
-out:
-	dm_free_md_mempools(pools);
-	return NULL;
-}
-
-struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md,
-					    unsigned type)
-{
-	unsigned int pool_size = dm_get_reserved_rq_based_ios();
-	struct dm_md_mempools *pools;
-
-	pools = kzalloc(sizeof(*pools), GFP_KERNEL);
-	if (!pools)
-		return NULL;
 
-	if (filter_md_type(type, md) == DM_TYPE_REQUEST_BASED) {
-		pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
-		if (!pools->rq_pool)
-			goto out;
-	}
-
-	pools->io_pool = mempool_create_slab_pool(pool_size, _rq_tio_cache);
-	if (!pools->io_pool)
-		goto out;
-
-	return pools;
 out:
 	dm_free_md_mempools(pools);
+
 	return NULL;
 }
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index e6e66d087b26..6123c2bf9150 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -222,9 +222,8 @@ void dm_kcopyd_exit(void);
 /*
  * Mempool operations
  */
-struct dm_md_mempools *dm_alloc_bio_mempools(unsigned integrity,
-					     unsigned per_bio_data_size);
-struct dm_md_mempools *dm_alloc_rq_mempools(struct mapped_device *md, unsigned type);
+struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
+					    unsigned integrity, unsigned per_bio_data_size);
 void dm_free_md_mempools(struct dm_md_mempools *pools);
 
 /*
-- 
cgit v1.2.1


From b5451e456840af027b794afc2c7c84c2a17f569b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Fri, 26 Jun 2015 10:07:04 -0400
Subject: dm cache policy smq: fix "default" version to be 1.4.0

Commit bccab6a0 ("dm cache: switch the "default" cache replacement
policy from mq to smq") should've incremented the "default" policy's
version number to 1.4.0 rather than reverting to version 1.0.0.

Reported-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-policy-smq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-policy-smq.c b/drivers/md/dm-cache-policy-smq.c
index 80f02d3330e2..b6f22651dd35 100644
--- a/drivers/md/dm-cache-policy-smq.c
+++ b/drivers/md/dm-cache-policy-smq.c
@@ -1750,7 +1750,7 @@ static struct dm_cache_policy_type smq_policy_type = {
 
 static struct dm_cache_policy_type default_policy_type = {
 	.name = "default",
-	.version = {1, 0, 0},
+	.version = {1, 4, 0},
 	.hint_size = 4,
 	.owner = THIS_MODULE,
 	.create = smq_create,
-- 
cgit v1.2.1


From 958b43384e41c129117284f48ba3fb9c11ebac75 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@kernel.org>
Date: Tue, 30 Jun 2015 14:59:30 -0700
Subject: bcache: use kvfree() in various places

Use kvfree() instead of open-coding it.

Signed-off-by: Pekka Enberg <penberg@kernel.org>
Cc: Kent Overstreet <kmo@daterainc.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/bcache/super.c | 10 ++--------
 drivers/md/bcache/util.h  | 10 ++--------
 2 files changed, 4 insertions(+), 16 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 4dd2bb7167f0..94980bfca434 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -760,14 +760,8 @@ static void bcache_device_free(struct bcache_device *d)
 	bio_split_pool_free(&d->bio_split_hook);
 	if (d->bio_split)
 		bioset_free(d->bio_split);
-	if (is_vmalloc_addr(d->full_dirty_stripes))
-		vfree(d->full_dirty_stripes);
-	else
-		kfree(d->full_dirty_stripes);
-	if (is_vmalloc_addr(d->stripe_sectors_dirty))
-		vfree(d->stripe_sectors_dirty);
-	else
-		kfree(d->stripe_sectors_dirty);
+	kvfree(d->full_dirty_stripes);
+	kvfree(d->stripe_sectors_dirty);
 
 	closure_debug_destroy(&d->cl);
 }
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index 98df7572b5f7..1d04c4859c70 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -52,10 +52,7 @@ struct closure;
 
 #define free_heap(heap)							\
 do {									\
-	if (is_vmalloc_addr((heap)->data))				\
-		vfree((heap)->data);					\
-	else								\
-		kfree((heap)->data);					\
+	kvfree((heap)->data);						\
 	(heap)->data = NULL;						\
 } while (0)
 
@@ -163,10 +160,7 @@ do {									\
 
 #define free_fifo(fifo)							\
 do {									\
-	if (is_vmalloc_addr((fifo)->data))				\
-		vfree((fifo)->data);					\
-	else								\
-		kfree((fifo)->data);					\
+	kvfree((fifo)->data);						\
 	(fifo)->data = NULL;						\
 } while (0)
 
-- 
cgit v1.2.1


From d1aa1ab33dcb0922e9088e37989a6d28d8702540 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Tue, 30 Jun 2015 14:59:54 -0700
Subject: MAINTAINERS: BCACHE: Kent Overstreet has changed email address

Kent's email address in MAINTAINERS seems to be invalid.
This was his last sign-off address, so use that if appropriate.

Fix the S: status entry while there.

Signed-off-by: Joe Perches <joe@perches.com>
Acked-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/md/bcache/journal.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index fe080ad0e558..ce64fc851251 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -157,7 +157,7 @@ int bch_journal_read(struct cache_set *c, struct list_head *list)
 
 	for_each_cache(ca, c, iter) {
 		struct journal_device *ja = &ca->journal;
-		unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
+		DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS);
 		unsigned i, l, r, m;
 		uint64_t seq;
 
-- 
cgit v1.2.1


From 4c7e309340ff85072e96f529582d159002c36734 Mon Sep 17 00:00:00 2001
From: Dennis Yang <shinrairis@gmail.com>
Date: Fri, 26 Jun 2015 15:25:48 +0100
Subject: dm btree remove: fix bug in redistribute3

redistribute3() shares entries out across 3 nodes.  Some entries were
being moved the wrong way, breaking the ordering.  This manifested as a
BUG() in dm-btree-remove.c:shift() when entries were removed from the
btree.

For additional context see:
https://www.redhat.com/archives/dm-devel/2015-May/msg00113.html

Signed-off-by: Dennis Yang <shinrairis@gmail.com>
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/persistent-data/dm-btree-remove.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index e04cfd2d60ef..9836c0ae897c 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -309,8 +309,8 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
 
 		if (s < 0 && nr_center < -s) {
 			/* not enough in central node */
-			shift(left, center, nr_center);
-			s = nr_center - target;
+			shift(left, center, -nr_center);
+			s += nr_center;
 			shift(left, right, s);
 			nr_right += s;
 		} else
@@ -323,7 +323,7 @@ static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
 		if (s > 0 && nr_center < s) {
 			/* not enough in central node */
 			shift(center, right, nr_center);
-			s = target - nr_center;
+			s -= nr_center;
 			shift(left, right, s);
 			nr_left -= s;
 		} else
-- 
cgit v1.2.1


From a822c83e47d97cdef38c4352e1ef62d9f46cfe98 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 3 Jul 2015 10:22:42 +0100
Subject: dm thin: allocate the cell_sort_array dynamically

Given the pool's cell_sort_array holds 8192 pointers it triggers an
order 5 allocation via kmalloc.  This order 5 allocation is prone to
failure as system memory gets more fragmented over time.

Fix this by allocating the cell_sort_array using vmalloc.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/dm-thin.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c33f61a4cc28..8f015d924a24 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -18,6 +18,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 #include <linux/sort.h>
 #include <linux/rbtree.h>
 
@@ -268,7 +269,7 @@ struct pool {
 	process_mapping_fn process_prepared_mapping;
 	process_mapping_fn process_prepared_discard;
 
-	struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE];
+	struct dm_bio_prison_cell **cell_sort_array;
 };
 
 static enum pool_mode get_pool_mode(struct pool *pool);
@@ -2777,6 +2778,7 @@ static void __pool_destroy(struct pool *pool)
 {
 	__pool_table_remove(pool);
 
+	vfree(pool->cell_sort_array);
 	if (dm_pool_metadata_close(pool->pmd) < 0)
 		DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
 
@@ -2889,6 +2891,13 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 		goto bad_mapping_pool;
 	}
 
+	pool->cell_sort_array = vmalloc(sizeof(*pool->cell_sort_array) * CELL_SORT_ARRAY_SIZE);
+	if (!pool->cell_sort_array) {
+		*error = "Error allocating cell sort array";
+		err_p = ERR_PTR(-ENOMEM);
+		goto bad_sort_array;
+	}
+
 	pool->ref_count = 1;
 	pool->last_commit_jiffies = jiffies;
 	pool->pool_md = pool_md;
@@ -2897,6 +2906,8 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 
 	return pool;
 
+bad_sort_array:
+	mempool_destroy(pool->mapping_pool);
 bad_mapping_pool:
 	dm_deferred_set_destroy(pool->all_io_ds);
 bad_all_io_ds:
-- 
cgit v1.2.1


From 1c7518794a3647eb345d59ee52844e8a40405198 Mon Sep 17 00:00:00 2001
From: Joe Thornber <ejt@redhat.com>
Date: Fri, 3 Jul 2015 14:51:32 +0100
Subject: dm btree: silence lockdep lock inversion in dm_btree_del()

Allocate memory using GFP_NOIO when deleting a btree.  dm_btree_del()
can be called via an ioctl and we don't want to recurse into the FS or
block layer.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
---
 drivers/md/persistent-data/dm-btree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index 200ac12a1d40..fdd3793e22f9 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -255,7 +255,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
 	int r;
 	struct del_stack *s;
 
-	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	s = kmalloc(sizeof(*s), GFP_NOIO);
 	if (!s)
 		return -ENOMEM;
 	s->info = info;
-- 
cgit v1.2.1


From 621739b00e16ca2d80411dc9b111cb15b91f3ba9 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 8 Jul 2015 16:08:24 -0400
Subject: Revert "dm: only run the queue on completion if congested or no
 requests pending"

This reverts commit 9a0e609e3fd8a95c96629b9fbde6b8c5b9a1456a.
(Resolved a conflict during revert due to commit bfebd1cdb4 that came
after)

This revert is motivated by a couple of failure reports on request-based
DM multipath testbeds:
1) Netapp reported that their multipath fault injection test under heavy
   IO load can stall longer than 300 seconds.
2) IBM reported elevated lock contention in their testbed (likely due to
   increased back pressure due to IO not being dispatched as quickly):
   https://www.redhat.com/archives/dm-devel/2015-July/msg00057.html

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org # 4.1+
---
 drivers/md/dm.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f331d888e7f5..de703778d39f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1067,13 +1067,10 @@ static void rq_end_stats(struct mapped_device *md, struct request *orig)
  */
 static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 {
-	int nr_requests_pending;
-
 	atomic_dec(&md->pending[rw]);
 
 	/* nudge anyone waiting on suspend queue */
-	nr_requests_pending = md_in_flight(md);
-	if (!nr_requests_pending)
+	if (!md_in_flight(md))
 		wake_up(&md->wait);
 
 	/*
@@ -1085,8 +1082,7 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
 	if (run_queue) {
 		if (md->queue->mq_ops)
 			blk_mq_run_hw_queues(md->queue, true);
-		else if (!nr_requests_pending ||
-			 (nr_requests_pending >= md->queue->nr_congestion_on))
+		else
 			blk_run_queue_async(md->queue);
 	}
 
-- 
cgit v1.2.1


From 77b5a08427e87514c33730afc18cd02c9475e2c3 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@fb.com>
Date: Fri, 6 Mar 2015 08:37:46 -0700
Subject: bcache: don't embed 'return' statements in closure macros

This is horribly confusing: it breaks the flow of the code without
that being apparent in the caller.

Signed-off-by: Jens Axboe <axboe@fb.com>
Acked-by: Christoph Hellwig <hch@lst.de>
---
 drivers/md/bcache/closure.h |  3 ---
 drivers/md/bcache/io.c      |  1 +
 drivers/md/bcache/journal.c |  2 ++
 drivers/md/bcache/request.c | 14 +++++++++++---
 4 files changed, 14 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index a08e3eeac3c5..79a6d63e8ed3 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -320,7 +320,6 @@ static inline void closure_wake_up(struct closure_waitlist *list)
 do {									\
 	set_closure_fn(_cl, _fn, _wq);					\
 	closure_sub(_cl, CLOSURE_RUNNING + 1);				\
-	return;								\
 } while (0)
 
 /**
@@ -349,7 +348,6 @@ do {									\
 do {									\
 	set_closure_fn(_cl, _fn, _wq);					\
 	closure_queue(_cl);						\
-	return;								\
 } while (0)
 
 /**
@@ -365,7 +363,6 @@ do {									\
 do {									\
 	set_closure_fn(_cl, _destructor, NULL);				\
 	closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1);	\
-	return;								\
 } while (0)
 
 /**
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index cb64e64a4789..bf6a9ca18403 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -105,6 +105,7 @@ void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p)
 	} while (n != bio);
 
 	continue_at(&s->cl, bch_bio_submit_split_done, NULL);
+	return;
 submit:
 	generic_make_request(bio);
 }
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index ce64fc851251..418607a6ba33 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -592,12 +592,14 @@ static void journal_write_unlocked(struct closure *cl)
 
 	if (!w->need_write) {
 		closure_return_with_destructor(cl, journal_write_unlock);
+		return;
 	} else if (journal_full(&c->journal)) {
 		journal_reclaim(c);
 		spin_unlock(&c->journal.lock);
 
 		btree_flush_write(c);
 		continue_at(cl, journal_write, system_wq);
+		return;
 	}
 
 	c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 4afb2d26b148..f292790997d7 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -88,8 +88,10 @@ static void bch_data_insert_keys(struct closure *cl)
 	if (journal_ref)
 		atomic_dec_bug(journal_ref);
 
-	if (!op->insert_data_done)
+	if (!op->insert_data_done) {
 		continue_at(cl, bch_data_insert_start, op->wq);
+		return;
+	}
 
 	bch_keylist_free(&op->insert_keys);
 	closure_return(cl);
@@ -216,8 +218,10 @@ static void bch_data_insert_start(struct closure *cl)
 		/* 1 for the device pointer and 1 for the chksum */
 		if (bch_keylist_realloc(&op->insert_keys,
 					3 + (op->csum ? 1 : 0),
-					op->c))
+					op->c)) {
 			continue_at(cl, bch_data_insert_keys, op->wq);
+			return;
+		}
 
 		k = op->insert_keys.top;
 		bkey_init(k);
@@ -255,6 +259,7 @@ static void bch_data_insert_start(struct closure *cl)
 
 	op->insert_data_done = true;
 	continue_at(cl, bch_data_insert_keys, op->wq);
+	return;
 err:
 	/* bch_alloc_sectors() blocks if s->writeback = true */
 	BUG_ON(op->writeback);
@@ -576,8 +581,10 @@ static void cache_lookup(struct closure *cl)
 	ret = bch_btree_map_keys(&s->op, s->iop.c,
 				 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
 				 cache_lookup_fn, MAP_END_KEY);
-	if (ret == -EAGAIN)
+	if (ret == -EAGAIN) {
 		continue_at(cl, cache_lookup, bcache_wq);
+		return;
+	}
 
 	closure_return(cl);
 }
@@ -1085,6 +1092,7 @@ static void flash_dev_make_request(struct request_queue *q, struct bio *bio)
 		continue_at_nobarrier(&s->cl,
 				      flash_dev_nodata,
 				      bcache_wq);
+		return;
 	} else if (rw) {
 		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
 					&KEY(d->id, bio->bi_iter.bi_sector, 0),
-- 
cgit v1.2.1


From b06075a98d595b761881fb2d7b8a557ea2f8b7ac Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Fri, 10 Jul 2015 17:21:43 -0400
Subject: dm: fix use after free crash due to incorrect cleanup sequence

Linux 4.2-rc1 commit 0f20972f7bf6 ("dm: factor out a common
cleanup_mapped_device()") moved common cleanup code to a separate
function.  Unfortunately, that commit incorrectly changed the order of
cleanup, so that it destroys the mapped_device's srcu structure
'io_barrier' before destroying its workqueue.

The function that is executed on the workqueue (dm_wq_work) uses the srcu
structure, so it may use that structure after it has been freed.  That
results in a crash in the LVM test suite's
mirror-vgreduce-removemissing.sh test.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Fixes: 0f20972f7bf6 ("dm: factor out a common cleanup_mapped_device()")
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index de703778d39f..ab37ae114e94 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2277,8 +2277,6 @@ static void dm_init_old_md_queue(struct mapped_device *md)
 
 static void cleanup_mapped_device(struct mapped_device *md)
 {
-	cleanup_srcu_struct(&md->io_barrier);
-
 	if (md->wq)
 		destroy_workqueue(md->wq);
 	if (md->kworker_task)
@@ -2290,6 +2288,8 @@ static void cleanup_mapped_device(struct mapped_device *md)
 	if (md->bs)
 		bioset_free(md->bs);
 
+	cleanup_srcu_struct(&md->io_barrier);
+
 	if (md->disk) {
 		spin_lock(&_minor_lock);
 		md->disk->private_data = NULL;
-- 
cgit v1.2.1


From bcc696fac11fe13e59dda5aaec6322a25b7c9a3a Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 15 Jul 2015 16:52:04 -0400
Subject: dm thin: stay in out-of-data-space mode once no_space_timeout expires

This fixes an issue where running out of data space would cause the
thin-pool's metadata to become read-only.  There was no reason to make
metadata read-only -- calling set_pool_mode() with PM_READ_ONLY was a
misguided way to error all queued and future write IOs.  We can
accomplish the same by degrading from PM_OUT_OF_DATA_SPACE to
PM_OUT_OF_DATA_SPACE with error_if_no_space enabled.

Otherwise, the use of PM_READ_ONLY could cause a race where commit() was
started before the PM_READ_ONLY transition but dm_pool_commit_metadata()
would go on to fail because the block manager had transitioned to
read-only.  The return of -EPERM from dm_pool_commit_metadata(), due to
attempting to commit while in read-only mode, caused the thin-pool to
set 'needs_check' because metadata_operation_failed() was called.  This
needless cascade of failures makes life for users more difficult than
needed.

Reported-by: Vivek Goyal <vgoyal@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 8f015d924a24..34e79531ea3f 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -2282,18 +2282,23 @@ static void do_waker(struct work_struct *ws)
 	queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
 }
 
+static void notify_of_pool_mode_change_to_oods(struct pool *pool);
+
 /*
  * We're holding onto IO to allow userland time to react.  After the
  * timeout either the pool will have been resized (and thus back in
- * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
+ * PM_WRITE mode), or we degrade to PM_OUT_OF_DATA_SPACE w/ error_if_no_space.
  */
 static void do_no_space_timeout(struct work_struct *ws)
 {
 	struct pool *pool = container_of(to_delayed_work(ws), struct pool,
 					 no_space_timeout);
 
-	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
-		set_pool_mode(pool, PM_READ_ONLY);
+	if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
+		pool->pf.error_if_no_space = true;
+		notify_of_pool_mode_change_to_oods(pool);
+		error_retry_list(pool);
+	}
 }
 
 /*----------------------------------------------------------------*/
@@ -2371,6 +2376,14 @@ static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
 	       dm_device_name(pool->pool_md), new_mode);
 }
 
+static void notify_of_pool_mode_change_to_oods(struct pool *pool)
+{
+	if (!pool->pf.error_if_no_space)
+		notify_of_pool_mode_change(pool, "out-of-data-space (queue IO)");
+	else
+		notify_of_pool_mode_change(pool, "out-of-data-space (error IO)");
+}
+
 static bool passdown_enabled(struct pool_c *pt)
 {
 	return pt->adjusted_pf.discard_passdown;
@@ -2455,7 +2468,7 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
 		 * frequently seeing this mode.
 		 */
 		if (old_mode != new_mode)
-			notify_of_pool_mode_change(pool, "out-of-data-space");
+			notify_of_pool_mode_change_to_oods(pool);
 		pool->process_bio = process_bio_read_only;
 		pool->process_discard = process_discard_bio;
 		pool->process_cell = process_cell_read_only;
-- 
cgit v1.2.1


From e4c78e210daea17f82f12037005df225e22189b9 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 15 Jul 2015 11:40:24 -0400
Subject: dm thin: display 'needs_check' in status if it is set

There is currently no way to see that the needs_check flag has been set
in the metadata.  Display 'needs_check' in the thin-pool status if it is
set in the thinp metadata.

Also, update thinp documentation.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-thin.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 34e79531ea3f..1c50c580215c 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -3738,6 +3738,7 @@ static void emit_flags(struct pool_features *pf, char *result,
  * Status line is:
  *    <transaction id> <used metadata sectors>/<total metadata sectors>
  *    <used data sectors>/<total data sectors> <held metadata root>
+ *    <pool mode> <discard config> <no space config> <needs_check>
  */
 static void pool_status(struct dm_target *ti, status_type_t type,
 			unsigned status_flags, char *result, unsigned maxlen)
@@ -3839,6 +3840,11 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 		else
 			DMEMIT("queue_if_no_space ");
 
+		if (dm_pool_metadata_needs_check(pool->pmd))
+			DMEMIT("needs_check ");
+		else
+			DMEMIT("- ");
+
 		break;
 
 	case STATUSTYPE_TABLE:
@@ -3942,7 +3948,7 @@ static struct target_type pool_target = {
 	.name = "thin-pool",
 	.features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
 		    DM_TARGET_IMMUTABLE,
-	.version = {1, 15, 0},
+	.version = {1, 16, 0},
 	.module = THIS_MODULE,
 	.ctr = pool_ctr,
 	.dtr = pool_dtr,
@@ -4329,7 +4335,7 @@ static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type thin_target = {
 	.name = "thin",
-	.version = {1, 15, 0},
+	.version = {1, 16, 0},
 	.module	= THIS_MODULE,
 	.ctr = thin_ctr,
 	.dtr = thin_dtr,
-- 
cgit v1.2.1


From 255eac20054e90ac7a52b3e179b61de1168a8fe6 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Wed, 15 Jul 2015 11:42:59 -0400
Subject: dm cache: display 'needs_check' in status if it is set

There is currently no way to see that the needs_check flag has been set
in the metadata.  Display 'needs_check' in the cache status if it is set
in the cache metadata.

Also, update cache documentation.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 1b4e1756b169..4afa34d7b8ad 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -3496,7 +3496,7 @@ static void cache_resume(struct dm_target *ti)
  * <#demotions> <#promotions> <#dirty>
  * <#features> <features>*
  * <#core args> <core args>
- * <policy name> <#policy args> <policy args>* <cache metadata mode>
+ * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
  */
 static void cache_status(struct dm_target *ti, status_type_t type,
 			 unsigned status_flags, char *result, unsigned maxlen)
@@ -3582,6 +3582,11 @@ static void cache_status(struct dm_target *ti, status_type_t type,
 		else
 			DMEMIT("rw ");
 
+		if (dm_cache_metadata_needs_check(cache->cmd))
+			DMEMIT("needs_check ");
+		else
+			DMEMIT("- ");
+
 		break;
 
 	case STATUSTYPE_TABLE:
@@ -3820,7 +3825,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
 
 static struct target_type cache_target = {
 	.name = "cache",
-	.version = {1, 7, 0},
+	.version = {1, 8, 0},
 	.module = THIS_MODULE,
 	.ctr = cache_ctr,
 	.dtr = cache_dtr,
-- 
cgit v1.2.1


From 386cb7cdeeef97e0bf082a8d6bbfc07a2ccce07b Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 16 Jul 2015 21:16:31 -0400
Subject: dm cache: do not wake_worker() in free_migration()

All methods that queue work call wake_worker() as you'd expect.
E.g. cell_defer, defer_bio, quiesce_migration (which is called by
writeback, promote, demote_then_promote, invalidate, discard, etc).
The additional wake_worker() call in free_migration() is therefore
unnecessary.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 4afa34d7b8ad..c8a160b37412 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -424,7 +424,6 @@ static void free_migration(struct dm_cache_migration *mg)
 		wake_up(&cache->migration_wait);
 
 	mempool_free(mg, cache->migration_pool);
-	wake_worker(cache);
 }
 
 static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
-- 
cgit v1.2.1


From e782eff591bca2d96bac30ab5d1cfa4ccd3b0f86 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 16 Jul 2015 21:26:10 -0400
Subject: dm cache: avoid preallocation if no work in
 writeback_some_dirty_blocks()

Refactor writeback_some_dirty_blocks() to avoid prealloc_data_structs()
if the policy doesn't have any dirty blocks ready for writeback.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index c8a160b37412..408dd276d6c9 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -2061,7 +2061,6 @@ static void process_deferred_writethrough_bios(struct cache *cache)
 
 static void writeback_some_dirty_blocks(struct cache *cache)
 {
-	int r = 0;
 	dm_oblock_t oblock;
 	dm_cblock_t cblock;
 	struct prealloc structs;
@@ -2071,15 +2070,11 @@ static void writeback_some_dirty_blocks(struct cache *cache)
 	memset(&structs, 0, sizeof(structs));
 
 	while (spare_migration_bandwidth(cache)) {
-		if (prealloc_data_structs(cache, &structs))
-			break;
-
-		r = policy_writeback_work(cache->policy, &oblock, &cblock, busy);
-		if (r)
-			break;
+		if (policy_writeback_work(cache->policy, &oblock, &cblock, busy))
+			break; /* no work to do */
 
-		r = get_cell(cache, oblock, &structs, &old_ocell);
-		if (r) {
+		if (prealloc_data_structs(cache, &structs) ||
+		    get_cell(cache, oblock, &structs, &old_ocell)) {
 			policy_set_dirty(cache->policy, oblock);
 			break;
 		}
-- 
cgit v1.2.1


From 665022d72f9b5762f21b5ea02fa0503d04802849 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 16 Jul 2015 21:48:55 -0400
Subject: dm cache: avoid calls to prealloc_free_structs() if possible

If no work was performed then prealloc_data_structs() wasn't ever called
so there isn't any need to call prealloc_free_structs().

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/md/dm-cache-target.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c
index 408dd276d6c9..b680da5d7b93 100644
--- a/drivers/md/dm-cache-target.c
+++ b/drivers/md/dm-cache-target.c
@@ -1946,6 +1946,7 @@ static int commit_if_needed(struct cache *cache)
 
 static void process_deferred_bios(struct cache *cache)
 {
+	bool prealloc_used = false;
 	unsigned long flags;
 	struct bio_list bios;
 	struct bio *bio;
@@ -1980,13 +1981,16 @@ static void process_deferred_bios(struct cache *cache)
 			process_discard_bio(cache, &structs, bio);
 		else
 			process_bio(cache, &structs, bio);
+		prealloc_used = true;
 	}
 
-	prealloc_free_structs(cache, &structs);
+	if (prealloc_used)
+		prealloc_free_structs(cache, &structs);
 }
 
 static void process_deferred_cells(struct cache *cache)
 {
+	bool prealloc_used = false;
 	unsigned long flags;
 	struct dm_bio_prison_cell *cell, *tmp;
 	struct list_head cells;
@@ -2014,9 +2018,11 @@ static void process_deferred_cells(struct cache *cache)
 		}
 
 		process_cell(cache, &structs, cell);
+		prealloc_used = true;
 	}
 
-	prealloc_free_structs(cache, &structs);
+	if (prealloc_used)
+		prealloc_free_structs(cache, &structs);
 }
 
 static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
@@ -2061,6 +2067,7 @@ static void process_deferred_writethrough_bios(struct cache *cache)
 
 static void writeback_some_dirty_blocks(struct cache *cache)
 {
+	bool prealloc_used = false;
 	dm_oblock_t oblock;
 	dm_cblock_t cblock;
 	struct prealloc structs;
@@ -2080,9 +2087,11 @@ static void writeback_some_dirty_blocks(struct cache *cache)
 		}
 
 		writeback(cache, &structs, oblock, cblock, old_ocell);
+		prealloc_used = true;
 	}
 
-	prealloc_free_structs(cache, &structs);
+	if (prealloc_used)
+		prealloc_free_structs(cache, &structs);
 }
 
 /*----------------------------------------------------------------
-- 
cgit v1.2.1