From d4478e92d6186ce37947a36994de407c27446266 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 25 Sep 2017 13:07:22 -0600 Subject: block/loop: make loop cgroup aware loop block device handles IO in a separate thread. The actual IO dispatched isn't cloned from the IO loop device received, so the dispatched IO loses the cgroup context. I'm ignoring buffer IO case now, which is quite complicated. Making the loop thread aware cgroup context doesn't really help. The loop device only writes to a single file. In current writeback cgroup implementation, the file can only belong to one cgroup. For direct IO case, we could workaround the issue in theory. For example, say we assign cgroup1 5M/s BW for loop device and cgroup2 10M/s. We can create a special cgroup for loop thread and assign at least 15M/s for the underlayer disk. In this way, we correctly throttle the two cgroups. But this is tricky to setup. This patch tries to address the issue. We record bio's css in loop command. When loop thread is handling the command, we then use the API provided in patch 1 to set the css for current task. The bio layer will use the css for new IO (from patch 3). Acked-by: Tejun Heo Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 13 +++++++++++++ drivers/block/loop.h | 1 + 2 files changed, 14 insertions(+) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 85de67334695..fd4eff5f5b76 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -476,6 +476,8 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) { struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); + if (cmd->css) + css_put(cmd->css); cmd->ret = ret; lo_rw_aio_do_completion(cmd); } @@ -535,6 +537,8 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, cmd->iocb.ki_filp = file; cmd->iocb.ki_complete = lo_rw_aio_complete; cmd->iocb.ki_flags = IOCB_DIRECT; + if (cmd->css) + kthread_associate_blkcg(cmd->css); if (rw == WRITE) ret = call_write_iter(file, &cmd->iocb, &iter); @@ -542,6 +546,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, ret = call_read_iter(file, &cmd->iocb, &iter); lo_rw_aio_do_completion(cmd); + kthread_associate_blkcg(NULL); if (ret != -EIOCBQUEUED) cmd->iocb.ki_complete(&cmd->iocb, ret, 0); @@ -1686,6 +1691,14 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, break; } + /* always use the first bio's css */ +#ifdef CONFIG_CGROUPS + if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) { + cmd->css = cmd->rq->bio->bi_css; + css_get(cmd->css); + } else +#endif + cmd->css = NULL; kthread_queue_work(&lo->worker, &cmd->work); return BLK_STS_OK; diff --git a/drivers/block/loop.h b/drivers/block/loop.h index 1f3956702993..0f45416e4fcf 100644 --- a/drivers/block/loop.h +++ b/drivers/block/loop.h @@ -72,6 +72,7 @@ struct loop_cmd { long ret; struct kiocb iocb; struct bio_vec *bvec; + struct cgroup_subsys_state *css; }; /* Support for loadable transfer modules */ -- cgit v1.2.1 From 9979d545c93653770984806e9bfcec1df53a9d3d Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Mon, 25 Sep 2017 19:10:07 +0200 Subject: block: cryptoloop - Fix build warning This patch fix the following build warning: drivers/block/cryptoloop.c:46:8: warning: variable 'cipher' set but not used [-Wunused-but-set-variable] Signed-off-by: Corentin Labbe Signed-off-by: Jens Axboe --- drivers/block/cryptoloop.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/cryptoloop.c b/drivers/block/cryptoloop.c index 74e03aa537ad..7033a4beda66 100644 --- a/drivers/block/cryptoloop.c +++ b/drivers/block/cryptoloop.c @@ -43,7 +43,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info) int cipher_len; int mode_len; char cms[LO_NAME_SIZE]; /* cipher-mode string */ - char *cipher; char *mode; char *cmsp = cms; /* c-m string pointer */ struct crypto_skcipher *tfm; @@ -56,7 +55,6 @@ cryptoloop_init(struct loop_device *lo, const struct loop_info64 *info) strncpy(cms, info->lo_crypt_name, LO_NAME_SIZE); cms[LO_NAME_SIZE - 1] = 0; - cipher = cmsp; cipher_len = strcspn(cmsp, "-"); mode = cmsp + cipher_len; -- cgit v1.2.1 From 0b508bc926bdced678febee2a2b8cdba0a19e481 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 26 Sep 2017 11:02:12 -0700 Subject: block: fix a build error The code is only for blkcg not for all cgroups Fixes: d4478e92d618 ("block/loop: make loop cgroup aware") Reported-by: kbuild test robot Signed-off-by: Shaohua Li Signed-off-by: Jens Axboe --- drivers/block/loop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index fd4eff5f5b76..bc8e61506968 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1692,7 +1692,7 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, } /* always use the first bio's css */ -#ifdef CONFIG_CGROUPS +#ifdef CONFIG_BLK_CGROUP if (cmd->use_aio && cmd->rq->bio && cmd->rq->bio->bi_css) { cmd->css = cmd->rq->bio->bi_css; css_get(cmd->css); -- cgit v1.2.1 From b3cffc3877a6c7edf17fd9c476f3e8d930bccec1 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Sat, 30 Sep 2017 09:49:21 +0800 Subject: null_blk: add "no_sched" module parameter add an option that disable io scheduler for null block device. Signed-off-by: weiping zhang Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 8042c26ea9e6..bf2c8ca3242a 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -154,6 +154,10 @@ enum { NULL_Q_MQ = 2, }; +static int g_no_sched; +module_param_named(no_sched, g_no_sched, int, S_IRUGO); +MODULE_PARM_DESC(no_sched, "No io scheduler"); + static int g_submit_queues = 1; module_param_named(submit_queues, g_submit_queues, int, S_IRUGO); MODULE_PARM_DESC(submit_queues, "Number of submission queues"); @@ -1754,6 +1758,8 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) set->numa_node = nullb ? nullb->dev->home_node : g_home_node; set->cmd_size = sizeof(struct nullb_cmd); set->flags = BLK_MQ_F_SHOULD_MERGE; + if (g_no_sched) + set->flags |= BLK_MQ_F_NO_SCHED; set->driver_data = NULL; if ((nullb && nullb->dev->blocking) || g_blocking) -- cgit v1.2.1 From 640ab98fb3629c0f8417b9b2532eca596495f3bb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 27 Sep 2017 05:40:16 -0600 Subject: buffer: have alloc_page_buffers() use __GFP_NOFAIL Instead of adding weird retry logic in that function, utilize __GFP_NOFAIL to ensure that the vm takes care of handling any potential retries appropriately. This means we don't have to call free_more_memory() from here. Reviewed-by: Nikolay Borisov Reviewed-by: Jan Kara Signed-off-by: Jens Axboe --- drivers/md/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index d2121637b4ab..4d8ed74efadf 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -368,7 +368,7 @@ static int read_page(struct file *file, unsigned long index, pr_debug("read bitmap file (%dB @ %llu)\n", (int)PAGE_SIZE, (unsigned long long)index << PAGE_SHIFT); - bh = alloc_page_buffers(page, 1<i_blkbits, 0); + bh = alloc_page_buffers(page, 1<i_blkbits, false); if (!bh) { ret = -ENOMEM; goto out; -- cgit v1.2.1 From 8ae4e4477d8f5cc7736816a0492f82934ca32ab7 Mon Sep 17 00:00:00 2001 From: Marc Olson Date: Wed, 6 Sep 2017 17:23:56 -0700 Subject: nvme: update timeout module parameter type The underlying blk_mq_tag_set, and request timeout parameters support an unsigned int. Extend the size of the nvme module parameters for io and admin commands to match. Signed-off-by: Marc Olson Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 8 ++++---- drivers/nvme/host/nvme.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index bb2aad078637..26c8913435b2 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -34,13 +34,13 @@ #define NVME_MINORS (1U << MINORBITS) -unsigned char admin_timeout = 60; -module_param(admin_timeout, byte, 0644); +unsigned int admin_timeout = 60; +module_param(admin_timeout, uint, 0644); MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); EXPORT_SYMBOL_GPL(admin_timeout); -unsigned char nvme_io_timeout = 30; -module_param_named(io_timeout, nvme_io_timeout, byte, 0644); +unsigned int nvme_io_timeout = 30; +module_param_named(io_timeout, nvme_io_timeout, uint, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); EXPORT_SYMBOL_GPL(nvme_io_timeout); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index d3f3c4447515..df787f39f4c1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -21,10 +21,10 @@ #include #include -extern unsigned char nvme_io_timeout; +extern unsigned int nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) -extern unsigned char admin_timeout; +extern unsigned int admin_timeout; #define ADMIN_TIMEOUT (admin_timeout * HZ) #define NVME_DEFAULT_KATO 5 -- cgit v1.2.1 From 786456325b9f05843ec722269a3ccd77c31b9c6d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 22 Sep 2017 18:04:43 -0700 Subject: nvme: use menu Kconfig interface Add a menu interface for NVME host and target support so that it is presented to users more like other Kconfig symbols. This makes the Device Driver menu less cluttered (easier to read) and keeps all of these symbols grouped together. Signed-off-by: Randy Dunlap Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/Kconfig b/drivers/nvme/Kconfig index b7c78a5b1f7a..04008e0bbe81 100644 --- a/drivers/nvme/Kconfig +++ b/drivers/nvme/Kconfig @@ -1,2 +1,6 @@ +menu "NVME Support" + source "drivers/nvme/host/Kconfig" source "drivers/nvme/target/Kconfig" + +endmenu -- cgit v1.2.1 From 3375e29c280894f8d5e65c6920630a42db069599 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 11 Sep 2017 16:14:50 -0700 Subject: nvmet: bump NVMET_NR_QUEUES to 128 Raise the max number of IO queues to 128. There are several hosts with more than 64 cpus/threads. Signed-off-by: James Smart Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/nvmet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 7b8e20adf760..e342f02845c1 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -314,7 +314,7 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, u32 nvmet_get_log_page_len(struct nvme_command *cmd); #define NVMET_QUEUE_SIZE 1024 -#define NVMET_NR_QUEUES 64 +#define NVMET_NR_QUEUES 128 #define NVMET_MAX_CMD NVMET_QUEUE_SIZE #define NVMET_KAS 10 #define NVMET_DISC_KATO 120 -- cgit v1.2.1 From d1f1071f8151355ec149117238e9c5aa6404ca9f Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 24 Sep 2017 16:15:55 +0300 Subject: nvme-fabrics: request transport module Help userspace to make sure transport module is loaded. Signed-off-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 555c976cc2ee..8bca36a46924 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -841,6 +841,9 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) if (ret) goto out_free_opts; + + request_module("nvme-%s", opts->transport); + /* * Check the generic options first as we need a valid transport for * the lookup below. Then clear the generic flags so that transport -- cgit v1.2.1 From eaefd5abf6b095bfc55eb745bdf7c42cf66790eb Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 10:38:42 -0700 Subject: nvme-fc: add uevent for auto-connect To support auto-connecting to FC-NVME devices upon their dynamic appearance, add a uevent that can kick off connection scripts. uevent is posted against the fc_udev device. patch set tested with the following rule to kick an nvme-cli connect-all for the FC initiator and FC target ports. This is just an example for testing and not intended for real life use. ACTION=="change", SUBSYSTEM=="fc", ENV{FC_EVENT}=="nvmediscovery", \ ENV{NVMEFC_HOST_TRADDR}=="*", ENV{NVMEFC_TRADDR}=="*", \ RUN+="/bin/sh -c '/usr/local/sbin/nvme connect-all --transport=fc --host-traddr=$env{NVMEFC_HOST_TRADDR} --traddr=$env{NVMEFC_TRADDR} >> /tmp/nvme_fc.log'" I will post proposed udev/systemd scripts for possible kernel support. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index af075e998944..f546eecb1f82 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -452,6 +452,36 @@ nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr) } EXPORT_SYMBOL_GPL(nvme_fc_unregister_localport); +/* + * TRADDR strings, per FC-NVME are fixed format: + * "nn-0x<16hexdigits>:pn-0x<16hexdigits>" - 43 characters + * udev event will only differ by prefix of what field is + * being specified: + * "NVMEFC_HOST_TRADDR=" or "NVMEFC_TRADDR=" - 19 max characters + * 19 + 43 + null_fudge = 64 characters + */ +#define FCNVME_TRADDR_LENGTH 64 + +static void +nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport, + struct nvme_fc_rport *rport) +{ + char hostaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_HOST_TRADDR=...*/ + char tgtaddr[FCNVME_TRADDR_LENGTH]; /* NVMEFC_TRADDR=...*/ + char *envp[4] = { "FC_EVENT=nvmediscovery", hostaddr, tgtaddr, NULL }; + + if (!(rport->remoteport.port_role & FC_PORT_ROLE_NVME_DISCOVERY)) + return; + + snprintf(hostaddr, sizeof(hostaddr), + "NVMEFC_HOST_TRADDR=nn-0x%016llx:pn-0x%016llx", + lport->localport.node_name, lport->localport.port_name); + snprintf(tgtaddr, sizeof(tgtaddr), + "NVMEFC_TRADDR=nn-0x%016llx:pn-0x%016llx", + rport->remoteport.node_name, rport->remoteport.port_name); + kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp); +} + /** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME @@ -516,6 +546,8 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, list_add_tail(&newrec->endp_list, &lport->endp_list); spin_unlock_irqrestore(&nvme_fc_lock, flags); + nvme_fc_signal_discovery_scan(lport, newrec); + *portptr = &newrec->remoteport; return 0; @@ -634,6 +666,23 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) } EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); +/** + * nvme_fc_rescan_remoteport - transport entry point called by an + * LLDD to request a nvme device rescan. + * @remoteport: pointer to the (registered) remote port that is to be + * rescanned. + * + * Returns: N/A + */ +void +nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(remoteport); + + nvme_fc_signal_discovery_scan(rport->lport, rport); +} +EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); + /* *********************** FC-NVME DMA Handling **************************** */ -- cgit v1.2.1 From 5f5685569ae8fccb0344373d823f2e4c59bb3d8e Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 14 Sep 2017 10:38:41 -0700 Subject: nvme-fc: create fc class and transport device Added a new fc class and a device node for udev events under it. I expect the fc class will eventually be the location where the FC SCSI and FC NVME merge in the future. Therefore names are kept somewhat generic. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f546eecb1f82..2dc9932e4818 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -213,6 +213,13 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt); +/* + * These items are short-term. They will eventually be moved into + * a generic FC class. See comments in module init. + */ +static struct class *fc_class; +static struct device *fc_udev_device; + /* *********************** FC-NVME Port Management ************************ */ @@ -3046,7 +3053,50 @@ static struct nvmf_transport_ops nvme_fc_transport = { static int __init nvme_fc_init_module(void) { - return nvmf_register_transport(&nvme_fc_transport); + int ret; + + /* + * NOTE: + * It is expected that in the future the kernel will combine + * the FC-isms that are currently under scsi and now being + * added to by NVME into a new standalone FC class. The SCSI + * and NVME protocols and their devices would be under this + * new FC class. + * + * As we need something to post FC-specific udev events to, + * specifically for nvme probe events, start by creating the + * new device class. When the new standalone FC class is + * put in place, this code will move to a more generic + * location for the class. + */ + fc_class = class_create(THIS_MODULE, "fc"); + if (IS_ERR(fc_class)) { + pr_err("couldn't register class fc\n"); + return PTR_ERR(fc_class); + } + + /* + * Create a device for the FC-centric udev events + */ + fc_udev_device = device_create(fc_class, NULL, MKDEV(0, 0), NULL, + "fc_udev_device"); + if (IS_ERR(fc_udev_device)) { + pr_err("couldn't create fc_udev device!\n"); + ret = PTR_ERR(fc_udev_device); + goto out_destroy_class; + } + + ret = nvmf_register_transport(&nvme_fc_transport); + if (ret) + goto out_destroy_device; + + return 0; + +out_destroy_device: + device_destroy(fc_class, MKDEV(0, 0)); +out_destroy_class: + class_destroy(fc_class); + return ret; } static void __exit nvme_fc_exit_module(void) @@ -3059,6 +3109,9 @@ static void __exit nvme_fc_exit_module(void) ida_destroy(&nvme_fc_local_port_cnt); ida_destroy(&nvme_fc_ctrl_cnt); + + device_destroy(fc_class, MKDEV(0, 0)); + class_destroy(fc_class); } module_init(nvme_fc_init_module); -- cgit v1.2.1 From 469d0ef06debe29ec60350b9976a520c4ad5a1e8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 26 Sep 2017 21:50:45 -0700 Subject: nvme-fc: move remote port get/put/free location move nvme_fc_rport_get/put and rport free to higher in the file to avoid adding prototypes to resolve references in upcoming code additions Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 78 +++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2dc9932e4818..b8e0822127c1 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -489,6 +489,45 @@ nvme_fc_signal_discovery_scan(struct nvme_fc_lport *lport, kobject_uevent_env(&fc_udev_device->kobj, KOBJ_CHANGE, envp); } +static void +nvme_fc_free_rport(struct kref *ref) +{ + struct nvme_fc_rport *rport = + container_of(ref, struct nvme_fc_rport, ref); + struct nvme_fc_lport *lport = + localport_to_lport(rport->remoteport.localport); + unsigned long flags; + + WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED); + WARN_ON(!list_empty(&rport->ctrl_list)); + + /* remove from lport list */ + spin_lock_irqsave(&nvme_fc_lock, flags); + list_del(&rport->endp_list); + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + /* let the LLDD know we've finished tearing it down */ + lport->ops->remoteport_delete(&rport->remoteport); + + ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); + + kfree(rport); + + nvme_fc_lport_put(lport); +} + +static void +nvme_fc_rport_put(struct nvme_fc_rport *rport) +{ + kref_put(&rport->ref, nvme_fc_free_rport); +} + +static int +nvme_fc_rport_get(struct nvme_fc_rport *rport) +{ + return kref_get_unless_zero(&rport->ref); +} + /** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME @@ -568,45 +607,6 @@ out_reghost_failed: } EXPORT_SYMBOL_GPL(nvme_fc_register_remoteport); -static void -nvme_fc_free_rport(struct kref *ref) -{ - struct nvme_fc_rport *rport = - container_of(ref, struct nvme_fc_rport, ref); - struct nvme_fc_lport *lport = - localport_to_lport(rport->remoteport.localport); - unsigned long flags; - - WARN_ON(rport->remoteport.port_state != FC_OBJSTATE_DELETED); - WARN_ON(!list_empty(&rport->ctrl_list)); - - /* remove from lport list */ - spin_lock_irqsave(&nvme_fc_lock, flags); - list_del(&rport->endp_list); - spin_unlock_irqrestore(&nvme_fc_lock, flags); - - /* let the LLDD know we've finished tearing it down */ - lport->ops->remoteport_delete(&rport->remoteport); - - ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); - - kfree(rport); - - nvme_fc_lport_put(lport); -} - -static void -nvme_fc_rport_put(struct nvme_fc_rport *rport) -{ - kref_put(&rport->ref, nvme_fc_free_rport); -} - -static int -nvme_fc_rport_get(struct nvme_fc_rport *rport) -{ - return kref_get_unless_zero(&rport->ref); -} - static int nvme_fc_abort_lsops(struct nvme_fc_rport *rport) { -- cgit v1.2.1 From 5fdee2127faa77c9c91862ad5e001dfab7013e92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 21:22:52 +0200 Subject: block: remove QUEUE_FLAG_STACKABLE We already have a queue_is_rq_based helper to check if a request_queue is request based, so we can remove the flag for it. Acked-by: Mike Snitzer Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/md/dm-rq.c | 2 +- drivers/md/dm-table.c | 15 +-------------- drivers/md/dm.c | 11 ----------- 3 files changed, 2 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index eadfcfd106ff..9d32f25489c2 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -56,7 +56,7 @@ static unsigned dm_get_blk_mq_queue_depth(void) int dm_request_based(struct mapped_device *md) { - return blk_queue_stackable(md->queue); + return queue_is_rq_based(md->queue); } static void dm_old_start_queue(struct request_queue *q) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index ef7b8f201f73..75281828f2cb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1000,7 +1000,7 @@ verify_rq_based: list_for_each_entry(dd, devices, list) { struct request_queue *q = bdev_get_queue(dd->dm_dev->bdev); - if (!blk_queue_stackable(q)) { + if (!queue_is_rq_based(q)) { DMERR("table load rejected: including" " non-request-stackable devices"); return -EINVAL; @@ -1847,19 +1847,6 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, */ if (blk_queue_add_random(q) && dm_table_all_devices_attribute(t, device_is_not_random)) queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q); - - /* - * QUEUE_FLAG_STACKABLE must be set after all queue settings are - * visible to other CPUs because, once the flag is set, incoming bios - * are processed by request-based dm, which refers to the queue - * settings. - * Until the flag set, bios are passed to bio-based dm and queued to - * md->deferred where queue settings are not needed yet. - * Those bios are passed to request-based dm at the resume time. - */ - smp_mb(); - if (dm_table_request_based(t)) - queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); } unsigned int dm_table_get_num_targets(struct dm_table *t) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 6e54145969c5..8d07ad61221c 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1612,17 +1612,6 @@ static void dm_wq_work(struct work_struct *work); void dm_init_md_queue(struct mapped_device *md) { - /* - * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device may not have been decided yet. - * The type is decided at the first table loading time. - * To prevent problematic device stacking, clear the queue flag - * for request stacking support until then. - * - * This queue is new, so no concurrency on the queue_flags. - */ - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); - /* * Initialize data that will only be used by a non-blk-mq DM queue * - must do so here (in alloc_dev callchain) before queue is used -- cgit v1.2.1 From 47bc227deedbcf3ac214a2d922c28dfa5e403f09 Mon Sep 17 00:00:00 2001 From: Christos Gkekas Date: Thu, 12 Oct 2017 23:15:54 +0100 Subject: mtip32xx: Clean up unused variables Remove variables that are set but never used. Signed-off-by: Christos Gkekas Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 4a3cfc7940de..b8af7352a18f 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -887,12 +887,9 @@ static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) static bool mtip_pause_ncq(struct mtip_port *port, struct host_to_dev_fis *fis) { - struct host_to_dev_fis *reply; unsigned long task_file_data; - reply = port->rxfis + RX_FIS_D2H_REG; task_file_data = readl(port->mmio+PORT_TFDATA); - if ((task_file_data & 1)) return false; @@ -1020,7 +1017,6 @@ static int mtip_exec_internal_command(struct mtip_port *port, .opts = opts }; int rv = 0; - unsigned long start; /* Make sure the buffer is 8 byte aligned. This is asic specific. */ if (buffer & 0x00000007) { @@ -1057,7 +1053,6 @@ static int mtip_exec_internal_command(struct mtip_port *port, /* Copy the command to the command table */ memcpy(int_cmd->command, fis, fis_len*4); - start = jiffies; rq->timeout = timeout; /* insert request and run queue */ @@ -3015,7 +3010,6 @@ static int mtip_hw_init(struct driver_data *dd) { int i; int rv; - unsigned int num_command_slots; unsigned long timeout, timetaken; dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; @@ -3025,7 +3019,6 @@ static int mtip_hw_init(struct driver_data *dd) rv = -EIO; goto out1; } - num_command_slots = dd->slot_groups * 32; hba_setup(dd); -- cgit v1.2.1 From 900148296b78c61aa8c443dc594c0da968c3be53 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:50 +0200 Subject: lightnvm: prevent target type module removal when in use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If target type module e.g. pblk here is unloaded (rmmod) while module is in use (after creating target) system crashes. We fix this by using module API refcnt. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 4 ++++ drivers/lightnvm/pblk-init.c | 1 + 2 files changed, 5 insertions(+) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ddae430b6eae..60e163be5a89 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -316,6 +317,8 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) list_add_tail(&t->list, &dev->targets); mutex_unlock(&dev->mlock); + __module_get(tt->owner); + return 0; err_sysfs: if (tt->exit) @@ -351,6 +354,7 @@ static void __nvm_remove_target(struct nvm_target *t) nvm_remove_tgt_dev(t->dev, 1); put_disk(tdisk); + module_put(t->type->owner); list_del(&t->list); kfree(t); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 1b0f61233c21..6df65d14a2c5 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -1044,6 +1044,7 @@ static struct nvm_tgt_type tt_pblk = { .sysfs_init = pblk_sysfs_init, .sysfs_exit = pblk_sysfs_exit, + .owner = THIS_MODULE, }; static int __init pblk_module_init(void) -- cgit v1.2.1 From bb6aa6f08268bbce4e0185b18cab9e04505d6695 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:51 +0200 Subject: lightnvm: prevent bd removal if busy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a virtual block device is formatted and mounted after creating with "nvme lnvm create... -t pblk", a removal from "nvm lnvm remove" would result in this: 446416.309757] bdi-block not registered [446416.309773] ------------[ cut here ]------------ [446416.309780] WARNING: CPU: 3 PID: 4319 at fs/fs-writeback.c:2159 __mark_inode_dirty+0x268/0x340 Ideally removal should return -EBUSY as block device is mounted after formatting. This patch tries to address this checking if whole device or any partition of it already mounted or not before removal. Whole device is checked using "bd_super" member of block device. This member is always set once block device has been mounted using a filesystem. Another member "bd_part_count" takes care of checking any if any partitions are under use. "bd_part_count" is only updated under locks when partitions are opened or closed (first open and last release). This at least does take care sending -EBUSY if removal is being attempted while whole block device or any partition is mounted. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 60e163be5a89..c490711cf0f4 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -373,6 +373,7 @@ static void __nvm_remove_target(struct nvm_target *t) static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) { struct nvm_target *t; + struct block_device *bdev; mutex_lock(&dev->mlock); t = nvm_find_target(dev, remove->tgtname); @@ -380,6 +381,19 @@ static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) mutex_unlock(&dev->mlock); return 1; } + bdev = bdget_disk(t->disk, 0); + if (!bdev) { + pr_err("nvm: removal failed, allocating bd failed\n"); + mutex_unlock(&dev->mlock); + return -ENOMEM; + } + if (bdev->bd_super || bdev->bd_part_count) { + pr_err("nvm: removal failed, block device busy\n"); + bdput(bdev); + mutex_unlock(&dev->mlock); + return -EBUSY; + } + bdput(bdev); __nvm_remove_target(t); mutex_unlock(&dev->mlock); -- cgit v1.2.1 From 88d31ea2676696ad0802a361c8b824f0762fa34c Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:52 +0200 Subject: lightnvm: protect target type list with correct locks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nvm_tgt_types list was protected by wrong lock for NVM_INFO ioctl call and can race with addition or removal of target types. Also unregistering target type was not protected correctly. Fixes: 5cd907853 ("lightnvm: remove nested lock conflict with mm") Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index c490711cf0f4..ee2b6d771990 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -589,9 +589,9 @@ void nvm_unregister_tgt_type(struct nvm_tgt_type *tt) if (!tt) return; - down_write(&nvm_lock); + down_write(&nvm_tgtt_lock); list_del(&tt->list); - up_write(&nvm_lock); + up_write(&nvm_tgtt_lock); } EXPORT_SYMBOL(nvm_unregister_tgt_type); @@ -1195,7 +1195,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) info->version[1] = NVM_VERSION_MINOR; info->version[2] = NVM_VERSION_PATCH; - down_write(&nvm_lock); + down_write(&nvm_tgtt_lock); list_for_each_entry(tt, &nvm_tgt_types, list) { struct nvm_ioctl_info_tgt *tgt = &info->tgts[tgt_iter]; @@ -1208,7 +1208,7 @@ static long nvm_ioctl_info(struct file *file, void __user *arg) } info->tgtsize = tgt_iter; - up_write(&nvm_lock); + up_write(&nvm_tgtt_lock); if (copy_to_user(arg, info, sizeof(struct nvm_ioctl_info))) { kfree(info); -- cgit v1.2.1 From a96d50fa0c8d3e399c49a0a90ddbbaabf8a46bb3 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:53 +0200 Subject: lightnvm: remove already calculated nr_chnls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove repeated calculation for number of channels while creating a target device. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index ee2b6d771990..798964f511cd 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -139,7 +139,6 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev, int prev_nr_luns; int i, j; - nr_chnls = nr_luns / dev->geo.luns_per_chnl; nr_chnls = (nr_chnls_mod == 0) ? nr_chnls : nr_chnls + 1; dev_map = kmalloc(sizeof(struct nvm_dev_map), GFP_KERNEL); -- cgit v1.2.1 From c9d84b350f9b253872fc24f4dcfea166c884ee15 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:54 +0200 Subject: lightnvm: pblk: fix error path in pblk_lines_alloc_metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use appropriate memory free calls based on allocation type used and also fix number of times free is called if kmalloc fails. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 6df65d14a2c5..05665a7e648c 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -630,7 +630,10 @@ static int pblk_lines_alloc_metadata(struct pblk *pblk) fail_free_emeta: while (--i >= 0) { - vfree(l_mg->eline_meta[i]->buf); + if (l_mg->emeta_alloc_type == PBLK_VMALLOC_META) + vfree(l_mg->eline_meta[i]->buf); + else + kfree(l_mg->eline_meta[i]->buf); kfree(l_mg->eline_meta[i]); } -- cgit v1.2.1 From 32c662c58a9b9d0c99e713a14ca323a9a91c73a0 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:55 +0200 Subject: lightnvm: include NVM Express driver if OCSSD is selected for build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Because NVM needs BLK_DEV_NVME, select it automatically if we mark NVM in config file before building kernel. Also append PCI to depends as select doesn't automatically add dependencies. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index ead61a93cb4e..2a953efec4e1 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -4,7 +4,8 @@ menuconfig NVM bool "Open-Channel SSD target support" - depends on BLOCK && HAS_DMA + depends on BLOCK && HAS_DMA && PCI + select BLK_DEV_NVME help Say Y here to get to enable Open-channel SSDs. -- cgit v1.2.1 From e57903fd972a398b7140d0bc055714e13a0e58c5 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:56 +0200 Subject: lightnvm: pblk: protect line bitmap while submitting meta io MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It seems pblk_dealloc_page would race against pblk_alloc_pages for line bitmap for sector allocation.The chances are very low but might as well protect the bitmap properly. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 81501644fb15..b53bb00a9918 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -486,12 +486,14 @@ void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) u64 addr; int i; + spin_lock(&line->lock); addr = find_next_zero_bit(line->map_bitmap, pblk->lm.sec_per_line, line->cur_sec); line->cur_sec = addr - nr_secs; for (i = 0; i < nr_secs; i++, line->cur_sec--) WARN_ON(!test_and_clear_bit(line->cur_sec, line->map_bitmap)); + spin_unlock(&line->lock); } u64 __pblk_alloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs) -- cgit v1.2.1 From 4e76af53e132bff9e2b94f018457fadabf5ab419 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:57 +0200 Subject: lightnvm: pblk: fix message if L2P MAP is in device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This usually happens if we are developing with qemu and ll2pmode has default value. Improve description. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 05665a7e648c..8c85779e9635 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -914,7 +914,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, int ret; if (dev->identity.dom & NVM_RSP_L2P) { - pr_err("pblk: device-side L2P table not supported. (%x)\n", + pr_err("pblk: host-side L2P table not supported. (%x)\n", dev->identity.dom); return ERR_PTR(-EINVAL); } -- cgit v1.2.1 From c5493845b7b303315118fb4ab96654bf7cb897f0 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:58 +0200 Subject: lightnvm: pblk: improve error message if down_timeout fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two pr_err messages are useless as they don't differentiate error code. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index b53bb00a9918..027c42bb1ab9 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1691,16 +1691,8 @@ static void __pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, #endif ret = down_timeout(&rlun->wr_sem, msecs_to_jiffies(30000)); - if (ret) { - switch (ret) { - case -ETIME: - pr_err("pblk: lun semaphore timed out\n"); - break; - case -EINTR: - pr_err("pblk: lun semaphore timed out\n"); - break; - } - } + if (ret == -ETIME || ret == -EINTR) + pr_err("pblk: taking lun semaphore timed out: err %d\n", -ret); } void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas) -- cgit v1.2.1 From c79819bc0877e4cbed8013b1abc9697e8805b21b Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:45:59 +0200 Subject: lightnvm: pblk: print incompatible line version correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correct it by converting little endian to cpu endian and also define a macro for line version so that maintenance is easy. Signed-off-by: Rakesh Pandit Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 +- drivers/lightnvm/pblk-recovery.c | 4 ++-- drivers/lightnvm/pblk.h | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 027c42bb1ab9..8536d38ef97e 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -978,7 +978,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line, memcpy(smeta_buf->header.uuid, pblk->instance_uuid, 16); smeta_buf->header.id = cpu_to_le32(line->id); smeta_buf->header.type = cpu_to_le16(line->type); - smeta_buf->header.version = cpu_to_le16(1); + smeta_buf->header.version = SMETA_VERSION; /* Start metadata */ smeta_buf->seq_nr = cpu_to_le64(line->seq_nr); diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index cb556e06673e..caf124279575 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -900,9 +900,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) if (le32_to_cpu(smeta_buf->header.identifier) != PBLK_MAGIC) continue; - if (le16_to_cpu(smeta_buf->header.version) != 1) { + if (smeta_buf->header.version != SMETA_VERSION) { pr_err("pblk: found incompatible line version %u\n", - smeta_buf->header.version); + le16_to_cpu(smeta_buf->header.version)); return ERR_PTR(-EINVAL); } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 67e623bd5c2d..9ece409993fe 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -310,6 +310,7 @@ enum { }; #define PBLK_MAGIC 0x70626c6b /*pblk*/ +#define SMETA_VERSION cpu_to_le16(1) struct line_header { __le32 crc; -- cgit v1.2.1 From 32825ebb06fafeff463ed23e9d0dea459ebd30fe Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:00 +0200 Subject: lightnvm: pblk: reuse pblk_gc_should_kick MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a trivial change which reuses pblk_gc_should_kick instead of repeating it again in pblk_rl_free_lines_inc. Signed-off-by: Rakesh Pandit Made it apply to the common case. Reviewed-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 2 -- drivers/lightnvm/pblk-rl.c | 33 +++++++++------------------------ drivers/lightnvm/pblk.h | 1 - 3 files changed, 9 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 8536d38ef97e..a68c6ae536e5 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1591,8 +1591,6 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) spin_unlock(&line->lock); spin_unlock(&l_mg->gc_lock); - - pblk_gc_should_kick(pblk); } void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line) diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 2e6a5361baf0..9565c3bc4d0b 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -96,9 +96,11 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) * * Only the total number of free blocks is used to configure the rate limiter. */ -static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) +static void pblk_rl_update_rates(struct pblk_rl *rl) { + struct pblk *pblk = container_of(rl, struct pblk, rl); unsigned long free_blocks = pblk_rl_nr_free_blks(rl); + int max = rl->rb_budget; if (free_blocks >= rl->high) { rl->rb_user_max = max; @@ -124,23 +126,18 @@ static int pblk_rl_update_rates(struct pblk_rl *rl, unsigned long max) rl->rb_state = PBLK_RL_LOW; } - return rl->rb_state; + if (rl->rb_state == (PBLK_RL_MID | PBLK_RL_LOW)) + pblk_gc_should_start(pblk); + else + pblk_gc_should_stop(pblk); } void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line) { - struct pblk *pblk = container_of(rl, struct pblk, rl); int blk_in_line = atomic_read(&line->blk_in_line); - int ret; atomic_add(blk_in_line, &rl->free_blocks); - /* Rates will not change that often - no need to lock update */ - ret = pblk_rl_update_rates(rl, rl->rb_budget); - - if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) - pblk_gc_should_start(pblk); - else - pblk_gc_should_stop(pblk); + pblk_rl_update_rates(rl); } void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) @@ -148,19 +145,7 @@ void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line) int blk_in_line = atomic_read(&line->blk_in_line); atomic_sub(blk_in_line, &rl->free_blocks); -} - -void pblk_gc_should_kick(struct pblk *pblk) -{ - struct pblk_rl *rl = &pblk->rl; - int ret; - - /* Rates will not change that often - no need to lock update */ - ret = pblk_rl_update_rates(rl, rl->rb_budget); - if (ret == (PBLK_RL_MID | PBLK_RL_LOW)) - pblk_gc_should_start(pblk); - else - pblk_gc_should_stop(pblk); + pblk_rl_update_rates(rl); } int pblk_rl_high_thrs(struct pblk_rl *rl) diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 9ece409993fe..3a07c5b61a0c 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -824,7 +824,6 @@ int pblk_gc_init(struct pblk *pblk); void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); -void pblk_gc_should_kick(struct pblk *pblk); void pblk_gc_kick(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); -- cgit v1.2.1 From a1121176ff757e3c073490a69608ea0b18a00ec1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:01 +0200 Subject: lightnvm: pblk: initialize debug stat counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initialize the stat counter for garbage collected reads. Fixes: a4bd217b43268 ("lightnvm: physical block device (pblk) target") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 8c85779e9635..83445115a922 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -947,6 +947,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, atomic_long_set(&pblk->recov_writes, 0); atomic_long_set(&pblk->recov_writes, 0); atomic_long_set(&pblk->recov_gc_writes, 0); + atomic_long_set(&pblk->recov_gc_reads, 0); #endif atomic_long_set(&pblk->read_failed, 0); -- cgit v1.2.1 From 7d327a9ed6c4dca341ebf99012e0a6b80a3050e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:02 +0200 Subject: lightnvm: pblk: use right flag for GC allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The data buffer for the GC path allocates virtual memory through vmalloc. When this change was introduced, a flag signaling kmalloc'ed memory was wrongly introduced. Use the right flag when creating a bio from this buffer. Fixes: de54e703a422 ("lightnvm: pblk: use vmalloc for GC data buffer") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d682e89e6493..ee8efb55b330 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -499,7 +499,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, data_len = (*secs_to_gc) * geo->sec_size; bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len, - PBLK_KMALLOC_META, GFP_KERNEL); + PBLK_VMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); goto err_free_dma; @@ -519,7 +519,7 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, if (ret) { bio_endio(bio); pr_err("pblk: GC read request failed\n"); - goto err_free_dma; + goto err_free_bio; } if (!wait_for_completion_io_timeout(&wait, @@ -541,10 +541,13 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, atomic_long_sub(*secs_to_gc, &pblk->inflight_reads); #endif + bio_put(bio); out: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return NVM_IO_OK; +err_free_bio: + bio_put(bio); err_free_dma: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return NVM_IO_ERR; -- cgit v1.2.1 From cd8ddbf7a5e206fe6995ab0aee245d597dd6a7f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:03 +0200 Subject: lightnvm: pblk: free padded entries in write buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a REQ_FLUSH reaches pblk, the bio cannot be directly completed. Instead, data on the write buffer is flushed and the bio is completed on the completion pah. This might require some sectors to be padded in order to guarantee a successful write. This patch fixes a memory leak on the padded pages. A consequence of this bad free was that internal bios not containing data (only a flush) were not being completed. Fixes: a4bd217b4326 ("lightnvm: physical block device (pblk) target") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 1 - drivers/lightnvm/pblk-write.c | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index a68c6ae536e5..9299a5a75a18 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -190,7 +190,6 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, WARN_ON(off + nr_pages != bio->bi_vcnt); - bio_advance(bio, off * PBLK_EXPOSED_PAGE_SIZE); for (i = off; i < nr_pages + off; i++) { bv = bio->bi_io_vec[i]; mempool_free(bv.bv_page, pblk->page_pool); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 3ad9e56d2473..d89ac573f8d8 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -33,6 +33,10 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, bio_endio(original_bio); } + if (c_ctx->nr_padded) + pblk_bio_free_pages(pblk, rqd->bio, c_ctx->nr_valid, + c_ctx->nr_padded); + #ifdef CONFIG_NVM_DEBUG atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes); #endif @@ -521,7 +525,8 @@ static void pblk_free_write_rqd(struct pblk *pblk, struct nvm_rq *rqd) struct bio *bio = rqd->bio; if (c_ctx->nr_padded) - pblk_bio_free_pages(pblk, bio, rqd->nr_ppas, c_ctx->nr_padded); + pblk_bio_free_pages(pblk, bio, c_ctx->nr_valid, + c_ctx->nr_padded); } static int pblk_submit_write(struct pblk *pblk) -- cgit v1.2.1 From e0e12a707f02fcde1b77a2417d9fb0ae1ce3b003 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:04 +0200 Subject: lightnvm: pblk: fix write I/O sync stat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix stat counter to collect the right number of I/Os being synced on the completion path. Fixes: 0880a9aa2d91f ("lightnvm: pblk: delete redundant buffer pointer") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index d89ac573f8d8..d82ca8bd8390 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -38,7 +38,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, c_ctx->nr_padded); #ifdef CONFIG_NVM_DEBUG - atomic_long_add(c_ctx->nr_valid, &pblk->sync_writes); + atomic_long_add(rqd->nr_ppas, &pblk->sync_writes); #endif ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); -- cgit v1.2.1 From da67e68fb9d37fb9072b20cc75d4337a73bc01b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:05 +0200 Subject: lightnvm: pblk: avoid deadlock on low LUN config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On low LUN configurations, make sure not to send bios that are bigger than the buffer size. Fixes: a4bd217b4326 ("lightnvm: physical block device (pblk) target") Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 +- drivers/lightnvm/pblk-rl.c | 6 ++++++ drivers/lightnvm/pblk.h | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 83445115a922..eee4eeb47d07 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -46,7 +46,7 @@ static int pblk_rw_io(struct request_queue *q, struct pblk *pblk, * user I/Os. Unless stalled, the rate limiter leaves at least 256KB * available for user I/O. */ - if (unlikely(pblk_get_secs(bio) >= pblk_rl_sysfs_rate_show(&pblk->rl))) + if (pblk_get_secs(bio) > pblk_rl_max_io(&pblk->rl)) blk_queue_split(q, &bio); return pblk_write_to_cache(pblk, bio, PBLK_IOTYPE_USER); diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 9565c3bc4d0b..0896439a91b0 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -163,6 +163,11 @@ int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) return rl->rb_user_max; } +int pblk_rl_max_io(struct pblk_rl *rl) +{ + return rl->rb_max_io; +} + static void pblk_rl_u_timer(unsigned long data) { struct pblk_rl *rl = (struct pblk_rl *)data; @@ -199,6 +204,7 @@ void pblk_rl_init(struct pblk_rl *rl, int budget) /* To start with, all buffer is available to user I/O writers */ rl->rb_budget = budget; rl->rb_user_max = budget; + rl->rb_max_io = budget >> 1; rl->rb_gc_max = 0; rl->rb_state = PBLK_RL_HIGH; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 3a07c5b61a0c..b592e5194b0f 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -267,6 +267,7 @@ struct pblk_rl { int rb_gc_max; /* Max buffer entries available for GC I/O */ int rb_gc_rsv; /* Reserved buffer entries for GC I/O */ int rb_state; /* Rate-limiter current state */ + int rb_max_io; /* Maximum size for an I/O giving the config */ atomic_t rb_user_cnt; /* User I/O buffer counter */ atomic_t rb_gc_cnt; /* GC I/O buffer counter */ @@ -844,6 +845,7 @@ int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); +int pblk_rl_max_io(struct pblk_rl *rl); void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left); -- cgit v1.2.1 From bd432417681a224d9fa4a9d43be7d4edc82135b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:06 +0200 Subject: lightnvm: pblk: fix min size for page mempool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk uses an internal page mempool for allocating pages on internal bios. The main two users of this memory pool are partial reads (reads with some sectors in cache and some on media) and padded writes, which need to add dummy pages to an existing bio already containing valid data (and with a large enough bioset allocated). In both cases, the maximum number of pages per bio is defined by the maximum number of physical sectors supported by the underlying device. This patch fixes a bad mempool allocation, where the min_nr of elements on the pool was fixed (to 16), which is lower than the maximum number of sectors supported by NVMe (as of the time for this patch). Instead, use the maximum number of allowed sectors reported by the device. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 6 +++--- drivers/lightnvm/pblk-init.c | 15 ++++++++------- drivers/lightnvm/pblk-read.c | 2 +- drivers/lightnvm/pblk.h | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 9299a5a75a18..f5fbb9a46784 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -192,7 +192,7 @@ void pblk_bio_free_pages(struct pblk *pblk, struct bio *bio, int off, for (i = off; i < nr_pages + off; i++) { bv = bio->bi_io_vec[i]; - mempool_free(bv.bv_page, pblk->page_pool); + mempool_free(bv.bv_page, pblk->page_bio_pool); } } @@ -204,14 +204,14 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, int i, ret; for (i = 0; i < nr_pages; i++) { - page = mempool_alloc(pblk->page_pool, flags); + page = mempool_alloc(pblk->page_bio_pool, flags); if (!page) goto err; ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); if (ret != PBLK_EXPOSED_PAGE_SIZE) { pr_err("pblk: could not add page to bio\n"); - mempool_free(page, pblk->page_pool); + mempool_free(page, pblk->page_bio_pool); goto err; } } diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index eee4eeb47d07..7b1f29c71338 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -132,7 +132,6 @@ static int pblk_rwb_init(struct pblk *pblk) } /* Minimum pages needed within a lun */ -#define PAGE_POOL_SIZE 16 #define ADDR_POOL_SIZE 64 static int pblk_set_ppaf(struct pblk *pblk) @@ -247,14 +246,16 @@ static int pblk_core_init(struct pblk *pblk) if (pblk_init_global_caches(pblk)) return -ENOMEM; - pblk->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0); - if (!pblk->page_pool) + /* internal bios can be at most the sectors signaled by the device. */ + pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev), + 0); + if (!pblk->page_bio_pool) return -ENOMEM; pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE, pblk_blk_ws_cache); if (!pblk->line_ws_pool) - goto free_page_pool; + goto free_page_bio_pool; pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); if (!pblk->rec_pool) @@ -309,8 +310,8 @@ free_rec_pool: mempool_destroy(pblk->rec_pool); free_blk_ws_pool: mempool_destroy(pblk->line_ws_pool); -free_page_pool: - mempool_destroy(pblk->page_pool); +free_page_bio_pool: + mempool_destroy(pblk->page_bio_pool); return -ENOMEM; } @@ -322,7 +323,7 @@ static void pblk_core_free(struct pblk *pblk) if (pblk->bb_wq) destroy_workqueue(pblk->bb_wq); - mempool_destroy(pblk->page_pool); + mempool_destroy(pblk->page_bio_pool); mempool_destroy(pblk->line_ws_pool); mempool_destroy(pblk->rec_pool); mempool_destroy(pblk->g_rq_pool); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index ee8efb55b330..402c732f0970 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -238,7 +238,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, kunmap_atomic(src_p); kunmap_atomic(dst_p); - mempool_free(src_bv.bv_page, pblk->page_pool); + mempool_free(src_bv.bv_page, pblk->page_bio_pool); hole = find_next_zero_bit(read_bitmap, nr_secs, hole + 1); } while (hole < nr_secs); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index b592e5194b0f..229f6020ad8a 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -620,7 +620,7 @@ struct pblk { struct list_head compl_list; - mempool_t *page_pool; + mempool_t *page_bio_pool; mempool_t *line_ws_pool; mempool_t *rec_pool; mempool_t *g_rq_pool; -- cgit v1.2.1 From b84ae4a8b883b96b95fff0e3979ff2c65bbf96b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:07 +0200 Subject: lightnvm: pblk: simplify work_queue mempool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In pblk, we have a mempool to allocate a generic structure that we pass along workqueues. This is heavily used in the GC path in order to have enough inflight reads and fully utilize the GC bandwidth. However, the current GC path copies data to the host memory and puts it back into the write buffer. This requires a vmalloc allocation for the data and a memory copy. Thus, guaranteeing the allocation by using a mempool for the structure in itself does not give us much. Until we implement support for vector copy to avoid moving data through the host, just allocate the workqueue structure using kmalloc. This allows us to have a much smaller mempool. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 13 +++++++------ drivers/lightnvm/pblk-gc.c | 32 ++++++++++++++++---------------- drivers/lightnvm/pblk-init.c | 32 ++++++++++++++++---------------- drivers/lightnvm/pblk-write.c | 4 ++-- drivers/lightnvm/pblk.h | 11 ++++++----- 5 files changed, 47 insertions(+), 45 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index f5fbb9a46784..b92532211866 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -33,7 +33,8 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, pr_err("pblk: attempted to erase bb: line:%d, pos:%d\n", line->id, pos); - pblk_line_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, pblk->bb_wq); + pblk_gen_run_ws(pblk, NULL, ppa, pblk_line_mark_bb, + GFP_ATOMIC, pblk->bb_wq); } static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd) @@ -1623,7 +1624,7 @@ void pblk_line_close_ws(struct work_struct *work) struct pblk_line *line = line_ws->line; pblk_line_close(pblk, line); - mempool_free(line_ws, pblk->line_ws_pool); + mempool_free(line_ws, pblk->gen_ws_pool); } void pblk_line_mark_bb(struct work_struct *work) @@ -1648,16 +1649,16 @@ void pblk_line_mark_bb(struct work_struct *work) } kfree(ppa); - mempool_free(line_ws, pblk->line_ws_pool); + mempool_free(line_ws, pblk->gen_ws_pool); } -void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), +void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, + void (*work)(struct work_struct *), gfp_t gfp_mask, struct workqueue_struct *wq) { struct pblk_line_ws *line_ws; - line_ws = mempool_alloc(pblk->line_ws_pool, GFP_ATOMIC); + line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask); if (!line_ws) return; diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 6090d28f7995..f163829ecca8 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -136,12 +136,12 @@ static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) static void pblk_gc_line_ws(struct work_struct *work) { - struct pblk_line_ws *line_rq_ws = container_of(work, + struct pblk_line_ws *gc_rq_ws = container_of(work, struct pblk_line_ws, ws); - struct pblk *pblk = line_rq_ws->pblk; + struct pblk *pblk = gc_rq_ws->pblk; struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line = line_rq_ws->line; - struct pblk_gc_rq *gc_rq = line_rq_ws->priv; + struct pblk_line *line = gc_rq_ws->line; + struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; up(&gc->gc_sem); @@ -151,7 +151,7 @@ static void pblk_gc_line_ws(struct work_struct *work) gc_rq->nr_secs); } - mempool_free(line_rq_ws, pblk->line_ws_pool); + kfree(gc_rq_ws); } static void pblk_gc_line_prepare_ws(struct work_struct *work) @@ -164,7 +164,7 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) struct pblk_line_meta *lm = &pblk->lm; struct pblk_gc *gc = &pblk->gc; struct line_emeta *emeta_buf; - struct pblk_line_ws *line_rq_ws; + struct pblk_line_ws *gc_rq_ws; struct pblk_gc_rq *gc_rq; __le64 *lba_list; int sec_left, nr_secs, bit; @@ -223,19 +223,19 @@ next_rq: gc_rq->nr_secs = nr_secs; gc_rq->line = line; - line_rq_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); - if (!line_rq_ws) + gc_rq_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); + if (!gc_rq_ws) goto fail_free_gc_rq; - line_rq_ws->pblk = pblk; - line_rq_ws->line = line; - line_rq_ws->priv = gc_rq; + gc_rq_ws->pblk = pblk; + gc_rq_ws->line = line; + gc_rq_ws->priv = gc_rq; down(&gc->gc_sem); kref_get(&line->ref); - INIT_WORK(&line_rq_ws->ws, pblk_gc_line_ws); - queue_work(gc->gc_line_reader_wq, &line_rq_ws->ws); + INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); + queue_work(gc->gc_line_reader_wq, &gc_rq_ws->ws); sec_left -= nr_secs; if (sec_left > 0) @@ -243,7 +243,7 @@ next_rq: out: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); - mempool_free(line_ws, pblk->line_ws_pool); + kfree(line_ws); kref_put(&line->ref, pblk_line_put); atomic_dec(&gc->inflight_gc); @@ -256,7 +256,7 @@ fail_free_emeta: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); pblk_put_line_back(pblk, line); kref_put(&line->ref, pblk_line_put); - mempool_free(line_ws, pblk->line_ws_pool); + kfree(line_ws); atomic_dec(&gc->inflight_gc); pr_err("pblk: Failed to GC line %d\n", line->id); @@ -269,7 +269,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) pr_debug("pblk: line '%d' being reclaimed for GC\n", line->id); - line_ws = mempool_alloc(pblk->line_ws_pool, GFP_KERNEL); + line_ws = kmalloc(sizeof(struct pblk_line_ws), GFP_KERNEL); if (!line_ws) return -ENOMEM; diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 7b1f29c71338..340552253580 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -20,7 +20,7 @@ #include "pblk.h" -static struct kmem_cache *pblk_blk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, +static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, *pblk_w_rq_cache, *pblk_line_meta_cache; static DECLARE_RWSEM(pblk_lock); struct bio_set *pblk_bio_set; @@ -184,9 +184,9 @@ static int pblk_init_global_caches(struct pblk *pblk) char cache_name[PBLK_CACHE_NAME_LEN]; down_write(&pblk_lock); - pblk_blk_ws_cache = kmem_cache_create("pblk_blk_ws", + pblk_ws_cache = kmem_cache_create("pblk_blk_ws", sizeof(struct pblk_line_ws), 0, 0, NULL); - if (!pblk_blk_ws_cache) { + if (!pblk_ws_cache) { up_write(&pblk_lock); return -ENOMEM; } @@ -194,7 +194,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_rec_cache = kmem_cache_create("pblk_rec", sizeof(struct pblk_rec_ctx), 0, 0, NULL); if (!pblk_rec_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); up_write(&pblk_lock); return -ENOMEM; } @@ -202,7 +202,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_g_rq_cache = kmem_cache_create("pblk_g_rq", pblk_g_rq_size, 0, 0, NULL); if (!pblk_g_rq_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); up_write(&pblk_lock); return -ENOMEM; @@ -211,7 +211,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_w_rq_cache = kmem_cache_create("pblk_w_rq", pblk_w_rq_size, 0, 0, NULL); if (!pblk_w_rq_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); up_write(&pblk_lock); @@ -223,7 +223,7 @@ static int pblk_init_global_caches(struct pblk *pblk) pblk_line_meta_cache = kmem_cache_create(cache_name, pblk->lm.sec_bitmap_len, 0, 0, NULL); if (!pblk_line_meta_cache) { - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); @@ -246,20 +246,20 @@ static int pblk_core_init(struct pblk *pblk) if (pblk_init_global_caches(pblk)) return -ENOMEM; - /* internal bios can be at most the sectors signaled by the device. */ + /* Internal bios can be at most the sectors signaled by the device. */ pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev), 0); if (!pblk->page_bio_pool) return -ENOMEM; - pblk->line_ws_pool = mempool_create_slab_pool(PBLK_WS_POOL_SIZE, - pblk_blk_ws_cache); - if (!pblk->line_ws_pool) + pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE, + pblk_ws_cache); + if (!pblk->gen_ws_pool) goto free_page_bio_pool; pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache); if (!pblk->rec_pool) - goto free_blk_ws_pool; + goto free_gen_ws_pool; pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE, pblk_g_rq_cache); @@ -308,8 +308,8 @@ free_g_rq_pool: mempool_destroy(pblk->g_rq_pool); free_rec_pool: mempool_destroy(pblk->rec_pool); -free_blk_ws_pool: - mempool_destroy(pblk->line_ws_pool); +free_gen_ws_pool: + mempool_destroy(pblk->gen_ws_pool); free_page_bio_pool: mempool_destroy(pblk->page_bio_pool); return -ENOMEM; @@ -324,13 +324,13 @@ static void pblk_core_free(struct pblk *pblk) destroy_workqueue(pblk->bb_wq); mempool_destroy(pblk->page_bio_pool); - mempool_destroy(pblk->line_ws_pool); + mempool_destroy(pblk->gen_ws_pool); mempool_destroy(pblk->rec_pool); mempool_destroy(pblk->g_rq_pool); mempool_destroy(pblk->w_rq_pool); mempool_destroy(pblk->line_meta_pool); - kmem_cache_destroy(pblk_blk_ws_cache); + kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index d82ca8bd8390..c73b17bca06b 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -198,8 +198,8 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); if (sync == emeta->nr_entries) - pblk_line_run_ws(pblk, line, NULL, pblk_line_close_ws, - pblk->close_wq); + pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, + GFP_ATOMIC, pblk->close_wq); bio_put(rqd->bio); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 229f6020ad8a..efaa781abb06 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -40,7 +40,6 @@ #define PBLK_MAX_REQ_ADDRS (64) #define PBLK_MAX_REQ_ADDRS_PW (6) -#define PBLK_WS_POOL_SIZE (128) #define PBLK_META_POOL_SIZE (128) #define PBLK_READ_REQ_POOL_SIZE (1024) @@ -61,6 +60,8 @@ #define ERASE 2 /* READ = 0, WRITE = 1 */ +#define PBLK_GEN_WS_POOL_SIZE (2) + enum { /* IO Types */ PBLK_IOTYPE_USER = 1 << 0, @@ -621,7 +622,7 @@ struct pblk { struct list_head compl_list; mempool_t *page_bio_pool; - mempool_t *line_ws_pool; + mempool_t *gen_ws_pool; mempool_t *rec_pool; mempool_t *g_rq_pool; mempool_t *w_rq_pool; @@ -725,9 +726,9 @@ void pblk_line_close_meta_sync(struct pblk *pblk); void pblk_line_close_ws(struct work_struct *work); void pblk_pipeline_stop(struct pblk *pblk); void pblk_line_mark_bb(struct work_struct *work); -void pblk_line_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, - void (*work)(struct work_struct *), - struct workqueue_struct *wq); +void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, + void (*work)(struct work_struct *), gfp_t gfp_mask, + struct workqueue_struct *wq); u64 pblk_line_smeta_start(struct pblk *pblk, struct pblk_line *line); int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line); int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, -- cgit v1.2.1 From 0d880398cb6254ab3e110e2a8a659da65a56ffee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:08 +0200 Subject: lightnvm: pblk: decouple read/erase mempools MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since read and erase paths offer different guarantees for inflight I/Os, separate the mempools to set the right min_nr for each on creation. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 8 ++++---- drivers/lightnvm/pblk-init.c | 22 +++++++++++++++------- drivers/lightnvm/pblk.h | 5 +++-- 3 files changed, 22 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index b92532211866..0c22e5ccdfdd 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -64,7 +64,7 @@ static void pblk_end_io_erase(struct nvm_rq *rqd) struct pblk *pblk = rqd->private; __pblk_end_io_erase(pblk, rqd); - mempool_free(rqd, pblk->g_rq_pool); + mempool_free(rqd, pblk->e_rq_pool); } void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, @@ -161,7 +161,7 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; } else { - pool = pblk->g_rq_pool; + pool = pblk->r_rq_pool; rq_size = pblk_g_rq_size; } @@ -178,7 +178,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) if (rw == WRITE) pool = pblk->w_rq_pool; else - pool = pblk->g_rq_pool; + pool = pblk->r_rq_pool; mempool_free(rqd, pool); } @@ -1479,7 +1479,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) struct nvm_rq *rqd; int err; - rqd = mempool_alloc(pblk->g_rq_pool, GFP_KERNEL); + rqd = mempool_alloc(pblk->e_rq_pool, GFP_KERNEL); memset(rqd, 0, pblk_g_rq_size); pblk_setup_e_rq(pblk, rqd, ppa); diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 340552253580..2f8d3f9ffbaf 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -261,15 +261,20 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->rec_pool) goto free_gen_ws_pool; - pblk->g_rq_pool = mempool_create_slab_pool(PBLK_READ_REQ_POOL_SIZE, + pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns, pblk_g_rq_cache); - if (!pblk->g_rq_pool) + if (!pblk->r_rq_pool) goto free_rec_pool; - pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns * 2, + pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns, + pblk_g_rq_cache); + if (!pblk->e_rq_pool) + goto free_r_rq_pool; + + pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns, pblk_w_rq_cache); if (!pblk->w_rq_pool) - goto free_g_rq_pool; + goto free_e_rq_pool; pblk->line_meta_pool = mempool_create_slab_pool(PBLK_META_POOL_SIZE, @@ -304,8 +309,10 @@ free_line_meta_pool: mempool_destroy(pblk->line_meta_pool); free_w_rq_pool: mempool_destroy(pblk->w_rq_pool); -free_g_rq_pool: - mempool_destroy(pblk->g_rq_pool); +free_e_rq_pool: + mempool_destroy(pblk->e_rq_pool); +free_r_rq_pool: + mempool_destroy(pblk->r_rq_pool); free_rec_pool: mempool_destroy(pblk->rec_pool); free_gen_ws_pool: @@ -326,7 +333,8 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->page_bio_pool); mempool_destroy(pblk->gen_ws_pool); mempool_destroy(pblk->rec_pool); - mempool_destroy(pblk->g_rq_pool); + mempool_destroy(pblk->r_rq_pool); + mempool_destroy(pblk->e_rq_pool); mempool_destroy(pblk->w_rq_pool); mempool_destroy(pblk->line_meta_pool); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index efaa781abb06..419e1b7328e4 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -41,7 +41,6 @@ #define PBLK_MAX_REQ_ADDRS_PW (6) #define PBLK_META_POOL_SIZE (128) -#define PBLK_READ_REQ_POOL_SIZE (1024) #define PBLK_NR_CLOSE_JOBS (4) @@ -60,6 +59,7 @@ #define ERASE 2 /* READ = 0, WRITE = 1 */ +/* Static pool sizes */ #define PBLK_GEN_WS_POOL_SIZE (2) enum { @@ -624,8 +624,9 @@ struct pblk { mempool_t *page_bio_pool; mempool_t *gen_ws_pool; mempool_t *rec_pool; - mempool_t *g_rq_pool; + mempool_t *r_rq_pool; mempool_t *w_rq_pool; + mempool_t *e_rq_pool; mempool_t *line_meta_pool; struct workqueue_struct *close_wq; -- cgit v1.2.1 From e72ec1d31bcb6dffe325418c6d96f2fcab7c2654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:09 +0200 Subject: lightnvm: pblk: do not use a mempool for line bitmaps MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk holds two sector bitmaps: one to keep track of the mapped sectors while the line is active and another one to keep track of the invalid sectors. The latter is kept during the whole live of the line, until it is recycled. Since we cannot guarantee forward progress for the mempool in this case, get rid of the mempool and simply allocate memory through kmalloc. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 26 ++++++++++---------------- drivers/lightnvm/pblk-init.c | 29 ++--------------------------- drivers/lightnvm/pblk-recovery.c | 2 +- drivers/lightnvm/pblk-write.c | 4 +--- drivers/lightnvm/pblk.h | 3 --- 5 files changed, 14 insertions(+), 50 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0c22e5ccdfdd..215aadb84c6e 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1095,25 +1095,21 @@ static int pblk_line_prepare(struct pblk *pblk, struct pblk_line *line) struct pblk_line_meta *lm = &pblk->lm; int blk_in_line = atomic_read(&line->blk_in_line); - line->map_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); + line->map_bitmap = kzalloc(lm->sec_bitmap_len, GFP_ATOMIC); if (!line->map_bitmap) return -ENOMEM; - memset(line->map_bitmap, 0, lm->sec_bitmap_len); - /* invalid_bitmap is special since it is used when line is closed. No - * need to zeroized; it will be initialized using bb info form - * map_bitmap - */ - line->invalid_bitmap = mempool_alloc(pblk->line_meta_pool, GFP_ATOMIC); + /* will be initialized using bb info from map_bitmap */ + line->invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_ATOMIC); if (!line->invalid_bitmap) { - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); return -ENOMEM; } spin_lock(&line->lock); if (line->state != PBLK_LINESTATE_FREE) { - mempool_free(line->invalid_bitmap, pblk->line_meta_pool); - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); + kfree(line->invalid_bitmap); spin_unlock(&line->lock); WARN(1, "pblk: corrupted line %d, state %d\n", line->id, line->state); @@ -1165,7 +1161,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line) void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line) { - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); line->map_bitmap = NULL; line->smeta = NULL; line->emeta = NULL; @@ -1440,10 +1436,8 @@ retry_setup: void pblk_line_free(struct pblk *pblk, struct pblk_line *line) { - if (line->map_bitmap) - mempool_free(line->map_bitmap, pblk->line_meta_pool); - if (line->invalid_bitmap) - mempool_free(line->invalid_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); + kfree(line->invalid_bitmap); *line->vsc = cpu_to_le32(EMPTY_ENTRY); @@ -1584,7 +1578,7 @@ void pblk_line_close(struct pblk *pblk, struct pblk_line *line) list_add_tail(&line->list, move_list); - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); line->map_bitmap = NULL; line->smeta = NULL; line->emeta = NULL; diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 2f8d3f9ffbaf..4d719782f65b 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -21,7 +21,7 @@ #include "pblk.h" static struct kmem_cache *pblk_ws_cache, *pblk_rec_cache, *pblk_g_rq_cache, - *pblk_w_rq_cache, *pblk_line_meta_cache; + *pblk_w_rq_cache; static DECLARE_RWSEM(pblk_lock); struct bio_set *pblk_bio_set; @@ -181,8 +181,6 @@ static int pblk_set_ppaf(struct pblk *pblk) static int pblk_init_global_caches(struct pblk *pblk) { - char cache_name[PBLK_CACHE_NAME_LEN]; - down_write(&pblk_lock); pblk_ws_cache = kmem_cache_create("pblk_blk_ws", sizeof(struct pblk_line_ws), 0, 0, NULL); @@ -217,19 +215,6 @@ static int pblk_init_global_caches(struct pblk *pblk) up_write(&pblk_lock); return -ENOMEM; } - - snprintf(cache_name, sizeof(cache_name), "pblk_line_m_%s", - pblk->disk->disk_name); - pblk_line_meta_cache = kmem_cache_create(cache_name, - pblk->lm.sec_bitmap_len, 0, 0, NULL); - if (!pblk_line_meta_cache) { - kmem_cache_destroy(pblk_ws_cache); - kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_g_rq_cache); - kmem_cache_destroy(pblk_w_rq_cache); - up_write(&pblk_lock); - return -ENOMEM; - } up_write(&pblk_lock); return 0; @@ -276,16 +261,10 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->w_rq_pool) goto free_e_rq_pool; - pblk->line_meta_pool = - mempool_create_slab_pool(PBLK_META_POOL_SIZE, - pblk_line_meta_cache); - if (!pblk->line_meta_pool) - goto free_w_rq_pool; - pblk->close_wq = alloc_workqueue("pblk-close-wq", WQ_MEM_RECLAIM | WQ_UNBOUND, PBLK_NR_CLOSE_JOBS); if (!pblk->close_wq) - goto free_line_meta_pool; + goto free_w_rq_pool; pblk->bb_wq = alloc_workqueue("pblk-bb-wq", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); @@ -305,8 +284,6 @@ free_bb_wq: destroy_workqueue(pblk->bb_wq); free_close_wq: destroy_workqueue(pblk->close_wq); -free_line_meta_pool: - mempool_destroy(pblk->line_meta_pool); free_w_rq_pool: mempool_destroy(pblk->w_rq_pool); free_e_rq_pool: @@ -336,13 +313,11 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->r_rq_pool); mempool_destroy(pblk->e_rq_pool); mempool_destroy(pblk->w_rq_pool); - mempool_destroy(pblk->line_meta_pool); kmem_cache_destroy(pblk_ws_cache); kmem_cache_destroy(pblk_rec_cache); kmem_cache_destroy(pblk_g_rq_cache); kmem_cache_destroy(pblk_w_rq_cache); - kmem_cache_destroy(pblk_line_meta_cache); } static void pblk_luns_free(struct pblk *pblk) diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index caf124279575..de5270712be7 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -987,7 +987,7 @@ next: list_move_tail(&line->list, move_list); spin_unlock(&l_mg->gc_lock); - mempool_free(line->map_bitmap, pblk->line_meta_pool); + kfree(line->map_bitmap); line->map_bitmap = NULL; line->smeta = NULL; line->emeta = NULL; diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index c73b17bca06b..26c2b8345149 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -411,8 +411,6 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) if (emeta->mem >= lm->emeta_len[0]) { spin_lock(&l_mg->close_lock); list_del(&meta_line->list); - WARN(!bitmap_full(meta_line->map_bitmap, lm->sec_per_line), - "pblk: corrupt meta line %d\n", meta_line->id); spin_unlock(&l_mg->close_lock); } @@ -456,7 +454,7 @@ retry: return 0; } meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); - if (bitmap_full(meta_line->map_bitmap, lm->sec_per_line)) + if (meta_line->emeta->mem >= lm->emeta_len[0]) goto retry; spin_unlock(&l_mg->close_lock); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 419e1b7328e4..60edcda0fc7f 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -40,8 +40,6 @@ #define PBLK_MAX_REQ_ADDRS (64) #define PBLK_MAX_REQ_ADDRS_PW (6) -#define PBLK_META_POOL_SIZE (128) - #define PBLK_NR_CLOSE_JOBS (4) #define PBLK_CACHE_NAME_LEN (DISK_NAME_LEN + 16) @@ -627,7 +625,6 @@ struct pblk { mempool_t *r_rq_pool; mempool_t *w_rq_pool; mempool_t *e_rq_pool; - mempool_t *line_meta_pool; struct workqueue_struct *close_wq; struct workqueue_struct *bb_wq; -- cgit v1.2.1 From 2942f50fa389a62865572452dce6214a8aed69dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:10 +0200 Subject: lightnvm: pblk: remove checks on mempool alloc. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of the mempool audit on pblk, remove unnecessary mempool allocation checks on mempools. Reported-by: Jens Axboe Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 4 ---- drivers/lightnvm/pblk-read.c | 8 -------- drivers/lightnvm/pblk-recovery.c | 35 +++++++---------------------------- drivers/lightnvm/pblk-write.c | 24 +++++------------------- 4 files changed, 12 insertions(+), 59 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 215aadb84c6e..0da58869006b 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -206,8 +206,6 @@ int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, for (i = 0; i < nr_pages; i++) { page = mempool_alloc(pblk->page_bio_pool, flags); - if (!page) - goto err; ret = bio_add_pc_page(q, bio, page, PBLK_EXPOSED_PAGE_SIZE, 0); if (ret != PBLK_EXPOSED_PAGE_SIZE) { @@ -1653,8 +1651,6 @@ void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, struct pblk_line_ws *line_ws; line_ws = mempool_alloc(pblk->gen_ws_pool, gfp_mask); - if (!line_ws) - return; line_ws->pblk = pblk; line_ws->line = line; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 402c732f0970..d2b6e2a7d7d5 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -168,10 +168,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, DECLARE_COMPLETION_ONSTACK(wait); new_bio = bio_alloc(GFP_KERNEL, nr_holes); - if (!new_bio) { - pr_err("pblk: could not alloc read bio\n"); - return NVM_IO_ERR; - } if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) goto err; @@ -321,10 +317,6 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) bitmap_zero(&read_bitmap, nr_secs); rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) { - pr_err_ratelimited("pblk: not able to alloc rqd"); - return NVM_IO_ERR; - } rqd->opcode = NVM_OP_PREAD; rqd->bio = bio; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index de5270712be7..6b6b4183b41e 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -34,10 +34,6 @@ void pblk_submit_rec(struct work_struct *work) max_secs); bio = bio_alloc(GFP_KERNEL, nr_rec_secs); - if (!bio) { - pr_err("pblk: not able to create recovery bio\n"); - return; - } bio->bi_iter.bi_sector = 0; bio_set_op_attrs(bio, REQ_OP_WRITE, 0); @@ -85,11 +81,6 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; rec_rqd = pblk_alloc_rqd(pblk, WRITE); - if (IS_ERR(rec_rqd)) { - pr_err("pblk: could not create recovery req.\n"); - return -ENOMEM; - } - rec_ctx = nvm_rq_to_pdu(rec_rqd); /* Copy completion bitmap, but exclude the first X completed entries */ @@ -404,22 +395,18 @@ next_pad_rq: ppa_list = (void *)(meta_list) + pblk_dma_meta_size; dma_ppa_list = dma_meta_list + pblk_dma_meta_size; - rqd = pblk_alloc_rqd(pblk, WRITE); - if (IS_ERR(rqd)) { - ret = PTR_ERR(rqd); - goto fail_free_meta; - } - bio = pblk_bio_map_addr(pblk, data, rq_ppas, rq_len, PBLK_VMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { ret = PTR_ERR(bio); - goto fail_free_rqd; + goto fail_free_meta; } bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + rqd = pblk_alloc_rqd(pblk, WRITE); + rqd->bio = bio; rqd->opcode = NVM_OP_PWRITE; rqd->flags = pblk_set_progr_mode(pblk, WRITE); @@ -490,8 +477,6 @@ free_rq: fail_free_bio: bio_put(bio); -fail_free_rqd: - pblk_free_rqd(pblk, rqd, WRITE); fail_free_meta: nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); fail_free_pad: @@ -785,15 +770,9 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) dma_addr_t dma_ppa_list, dma_meta_list; int done, ret = 0; - rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) - return PTR_ERR(rqd); - meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &dma_meta_list); - if (!meta_list) { - ret = -ENOMEM; - goto free_rqd; - } + if (!meta_list) + return -ENOMEM; ppa_list = (void *)(meta_list) + pblk_dma_meta_size; dma_ppa_list = dma_meta_list + pblk_dma_meta_size; @@ -804,6 +783,8 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) goto free_meta_list; } + rqd = pblk_alloc_rqd(pblk, READ); + p.ppa_list = ppa_list; p.meta_list = meta_list; p.rqd = rqd; @@ -832,8 +813,6 @@ out: kfree(data); free_meta_list: nvm_dev_dma_free(dev->parent, meta_list, dma_meta_list); -free_rqd: - pblk_free_rqd(pblk, rqd, READ); return ret; } diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 26c2b8345149..0fb8f26a6311 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -111,10 +111,7 @@ static void pblk_end_w_fail(struct pblk *pblk, struct nvm_rq *rqd) ppa_list = &rqd->ppa_addr; recovery = mempool_alloc(pblk->rec_pool, GFP_ATOMIC); - if (!recovery) { - pr_err("pblk: could not allocate recovery context\n"); - return; - } + INIT_LIST_HEAD(&recovery->failed); bit = -1; @@ -375,10 +372,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) int ret; rqd = pblk_alloc_rqd(pblk, READ); - if (IS_ERR(rqd)) { - pr_err("pblk: cannot allocate write req.\n"); - return PTR_ERR(rqd); - } + m_ctx = nvm_rq_to_pdu(rqd); m_ctx->private = meta_line; @@ -546,19 +540,12 @@ static int pblk_submit_write(struct pblk *pblk) if (!secs_to_flush && secs_avail < pblk->min_write_pgs) return 1; - rqd = pblk_alloc_rqd(pblk, WRITE); - if (IS_ERR(rqd)) { - pr_err("pblk: cannot allocate write req.\n"); - return 1; - } - bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs); - if (!bio) { - pr_err("pblk: cannot allocate write bio\n"); - goto fail_free_rqd; - } + bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + rqd = pblk_alloc_rqd(pblk, WRITE); rqd->bio = bio; secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); @@ -589,7 +576,6 @@ fail_free_bio: pblk_free_write_rqd(pblk, rqd); fail_put_bio: bio_put(bio); -fail_free_rqd: pblk_free_rqd(pblk, rqd, WRITE); return 1; -- cgit v1.2.1 From 3627896a4b12ea6bb9e0ff77724a24f53726db2d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:11 +0200 Subject: lightnvm: pblk: use constant for GC max inflight MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use a constant to set the maximum number of inflight GC requests allowed. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 4 ++-- drivers/lightnvm/pblk.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index f163829ecca8..c21b2077432a 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -93,7 +93,7 @@ static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) retry: spin_lock(&gc->w_lock); - if (gc->w_entries >= PBLK_GC_W_QD) { + if (gc->w_entries >= PBLK_GC_RQ_QD) { spin_unlock(&gc->w_lock); pblk_gc_writer_kick(&pblk->gc); usleep_range(128, 256); @@ -602,7 +602,7 @@ int pblk_gc_init(struct pblk *pblk) spin_lock_init(&gc->w_lock); spin_lock_init(&gc->r_lock); - sema_init(&gc->gc_sem, 128); + sema_init(&gc->gc_sem, PBLK_GC_RQ_QD); INIT_LIST_HEAD(&gc->w_list); INIT_LIST_HEAD(&gc->r_list); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 60edcda0fc7f..baa6a633990f 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -816,7 +816,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, * pblk gc */ #define PBLK_GC_MAX_READERS 8 /* Max number of outstanding GC reader jobs */ -#define PBLK_GC_W_QD 128 /* Queue depth for inflight GC write I/Os */ +#define PBLK_GC_RQ_QD 128 /* Queue depth for inflight GC requests */ #define PBLK_GC_L_QD 4 /* Queue depth for inflight GC lines */ #define PBLK_GC_RSV_LINE 1 /* Reserved lines for GC */ -- cgit v1.2.1 From 9f6cb13bb40bd9067498e908a3272aba998c0309 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:12 +0200 Subject: lightnvm: pblk: normalize ppa namings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Normalize the way we name ppa variables to improve code readability. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 48 +++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0da58869006b..b6d7c6660149 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1746,7 +1746,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) { - struct ppa_addr l2p_ppa; + struct ppa_addr ppa_l2p; /* logic error: lba out-of-bounds. Ignore update */ if (!(lba < pblk->rl.nr_secs)) { @@ -1755,10 +1755,10 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) } spin_lock(&pblk->trans_lock); - l2p_ppa = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); - if (!pblk_addr_in_cache(l2p_ppa) && !pblk_ppa_empty(l2p_ppa)) - pblk_map_invalidate(pblk, l2p_ppa); + if (!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)) + pblk_map_invalidate(pblk, ppa_l2p); pblk_trans_map_set(pblk, lba, ppa); spin_unlock(&pblk->trans_lock); @@ -1775,16 +1775,16 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) pblk_update_map(pblk, lba, ppa); } -int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, +int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, struct pblk_line *gc_line) { - struct ppa_addr l2p_ppa; + struct ppa_addr ppa_l2p; int ret = 1; #ifdef CONFIG_NVM_DEBUG /* Callers must ensure that the ppa points to a cache address */ - BUG_ON(!pblk_addr_in_cache(ppa)); - BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa))); + BUG_ON(!pblk_addr_in_cache(ppa_new)); + BUG_ON(pblk_rb_pos_oob(&pblk->rwb, pblk_addr_to_cacheline(ppa_new))); #endif /* logic error: lba out-of-bounds. Ignore update */ @@ -1794,36 +1794,38 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, } spin_lock(&pblk->trans_lock); - l2p_ppa = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); /* Prevent updated entries to be overwritten by GC */ - if (pblk_addr_in_cache(l2p_ppa) || pblk_ppa_empty(l2p_ppa) || - pblk_tgt_ppa_to_line(l2p_ppa) != gc_line->id) { + if (pblk_addr_in_cache(ppa_l2p) || pblk_ppa_empty(ppa_l2p) || + pblk_tgt_ppa_to_line(ppa_l2p) != gc_line->id) { + ret = 0; goto out; } - pblk_trans_map_set(pblk, lba, ppa); + pblk_trans_map_set(pblk, lba, ppa_new); out: spin_unlock(&pblk->trans_lock); return ret; } -void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, - struct ppa_addr entry_line) +void pblk_update_map_dev(struct pblk *pblk, sector_t lba, + struct ppa_addr ppa_mapped, struct ppa_addr ppa_cache) { - struct ppa_addr l2p_line; + struct ppa_addr ppa_l2p; #ifdef CONFIG_NVM_DEBUG /* Callers must ensure that the ppa points to a device address */ - BUG_ON(pblk_addr_in_cache(ppa)); + BUG_ON(pblk_addr_in_cache(ppa_mapped)); #endif /* Invalidate and discard padded entries */ if (lba == ADDR_EMPTY) { #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->padded_wb); #endif - pblk_map_invalidate(pblk, ppa); + if (!pblk_ppa_empty(ppa_mapped)) + pblk_map_invalidate(pblk, ppa_mapped); return; } @@ -1834,22 +1836,22 @@ void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, } spin_lock(&pblk->trans_lock); - l2p_line = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); /* Do not update L2P if the cacheline has been updated. In this case, * the mapped ppa must be invalidated */ - if (l2p_line.ppa != entry_line.ppa) { - if (!pblk_ppa_empty(ppa)) - pblk_map_invalidate(pblk, ppa); + if (!pblk_ppa_comp(ppa_l2p, ppa_cache)) { + if (!pblk_ppa_empty(ppa_mapped)) + pblk_map_invalidate(pblk, ppa_mapped); goto out; } #ifdef CONFIG_NVM_DEBUG - WARN_ON(!pblk_addr_in_cache(l2p_line) && !pblk_ppa_empty(l2p_line)); + WARN_ON(!pblk_addr_in_cache(ppa_l2p) && !pblk_ppa_empty(ppa_l2p)); #endif - pblk_trans_map_set(pblk, lba, ppa); + pblk_trans_map_set(pblk, lba, ppa_mapped); out: spin_unlock(&pblk->trans_lock); } -- cgit v1.2.1 From 84454e6de56bb5c8629c41ed09aaf5750ff56f5f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:13 +0200 Subject: lightnvm: pblk: refactor read lba sanity check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor lba sanity check on read path to avoid code duplication. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d2b6e2a7d7d5..eaaf9d55ba97 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -39,21 +39,14 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, } static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned long *read_bitmap) + sector_t blba, unsigned long *read_bitmap) { struct bio *bio = rqd->bio; struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; - sector_t blba = pblk_get_lba(bio); int nr_secs = rqd->nr_ppas; bool advanced_bio = false; int i, j = 0; - /* logic error: lba out-of-bounds. Ignore read request */ - if (blba + nr_secs >= pblk->rl.nr_secs) { - WARN(1, "pblk: read lbas out of bounds\n"); - return; - } - pblk_lookup_l2p_seq(pblk, ppas, blba, nr_secs); for (i = 0; i < nr_secs; i++) { @@ -259,17 +252,10 @@ err: } static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, - unsigned long *read_bitmap) + sector_t lba, unsigned long *read_bitmap) { struct bio *bio = rqd->bio; struct ppa_addr ppa; - sector_t lba = pblk_get_lba(bio); - - /* logic error: lba out-of-bounds. Ignore read request */ - if (lba >= pblk->rl.nr_secs) { - WARN(1, "pblk: read lba out of bounds\n"); - return; - } pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); @@ -305,14 +291,19 @@ retry: int pblk_submit_read(struct pblk *pblk, struct bio *bio) { struct nvm_tgt_dev *dev = pblk->dev; + sector_t blba = pblk_get_lba(bio); unsigned int nr_secs = pblk_get_secs(bio); struct nvm_rq *rqd; unsigned long read_bitmap; /* Max 64 ppas per request */ unsigned int bio_init_idx; int ret = NVM_IO_ERR; - if (nr_secs > PBLK_MAX_REQ_ADDRS) + /* logic error: lba out-of-bounds. Ignore read request */ + if (blba >= pblk->rl.nr_secs || nr_secs > PBLK_MAX_REQ_ADDRS) { + WARN(1, "pblk: read lba out of bounds (lba:%llu, nr:%d)\n", + (unsigned long long)blba, nr_secs); return NVM_IO_ERR; + } bitmap_zero(&read_bitmap, nr_secs); @@ -340,9 +331,9 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd->ppa_list = rqd->meta_list + pblk_dma_meta_size; rqd->dma_ppa_list = rqd->dma_meta_list + pblk_dma_meta_size; - pblk_read_ppalist_rq(pblk, rqd, &read_bitmap); + pblk_read_ppalist_rq(pblk, rqd, blba, &read_bitmap); } else { - pblk_read_rq(pblk, rqd, &read_bitmap); + pblk_read_rq(pblk, rqd, blba, &read_bitmap); } bio_get(bio); -- cgit v1.2.1 From d340121eb770de3b02bfc73c5f2b00f5345090c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:14 +0200 Subject: lightnvm: pblk: simplify data validity check on GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a line is selected for recycling by the garbage collector (GC), the line state changes and the invalid bitmap is frozen, preventing invalidations from happening. Throughout the GC, the L2P map is checked to verify that not data being recycled has been updated. The last check is done before the new map is being stored on the L2P table. Though this algorithm works, it requires a number of corner cases to be checked each time the L2P table is being updated. This complicates readability and is error prone in case that the recycling algorithm is modified. Instead, this patch makes the invalid bitmap accessible even when the line is being recycled. When recycled data is being remapped, it is enough to check the invalid bitmap for the line before updating the L2P table. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-cache.c | 20 +++++------ drivers/lightnvm/pblk-core.c | 29 +++++++--------- drivers/lightnvm/pblk-gc.c | 58 +++++++++++++++++-------------- drivers/lightnvm/pblk-rb.c | 6 ++-- drivers/lightnvm/pblk-read.c | 79 +++++++++++++++++++++++-------------------- drivers/lightnvm/pblk.h | 23 ++++--------- 6 files changed, 109 insertions(+), 106 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 024a8fc93069..1d6b8e3585f1 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -73,12 +73,11 @@ out: * On GC the incoming lbas are not necessarily sequential. Also, some of the * lbas might not be valid entries, which are marked as empty by the GC thread */ -int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, - unsigned int nr_entries, unsigned int nr_rec_entries, - struct pblk_line *gc_line, unsigned long flags) +int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq) { struct pblk_w_ctx w_ctx; unsigned int bpos, pos; + void *data = gc_rq->data; int i, valid_entries; /* Update the write buffer head (mem) with the entries that we can @@ -86,28 +85,29 @@ int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, * rollback from here on. */ retry: - if (!pblk_rb_may_write_gc(&pblk->rwb, nr_rec_entries, &bpos)) { + if (!pblk_rb_may_write_gc(&pblk->rwb, gc_rq->secs_to_gc, &bpos)) { io_schedule(); goto retry; } - w_ctx.flags = flags; + w_ctx.flags = PBLK_IOTYPE_GC; pblk_ppa_set_empty(&w_ctx.ppa); - for (i = 0, valid_entries = 0; i < nr_entries; i++) { - if (lba_list[i] == ADDR_EMPTY) + for (i = 0, valid_entries = 0; i < gc_rq->nr_secs; i++) { + if (gc_rq->lba_list[i] == ADDR_EMPTY) continue; - w_ctx.lba = lba_list[i]; + w_ctx.lba = gc_rq->lba_list[i]; pos = pblk_rb_wrap_pos(&pblk->rwb, bpos + valid_entries); - pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_line, pos); + pblk_rb_write_entry_gc(&pblk->rwb, data, w_ctx, gc_rq->line, + gc_rq->paddr_list[i], pos); data += PBLK_EXPOSED_PAGE_SIZE; valid_entries++; } - WARN_ONCE(nr_rec_entries != valid_entries, + WARN_ONCE(gc_rq->secs_to_gc != valid_entries, "pblk: inconsistent GC write\n"); #ifdef CONFIG_NVM_DEBUG diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index b6d7c6660149..6dd4866e579c 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -78,11 +78,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, * that newer updates are not overwritten. */ spin_lock(&line->lock); - if (line->state == PBLK_LINESTATE_GC || - line->state == PBLK_LINESTATE_FREE) { - spin_unlock(&line->lock); - return; - } + WARN_ON(line->state == PBLK_LINESTATE_FREE); if (test_and_set_bit(paddr, line->invalid_bitmap)) { WARN_ONCE(1, "pblk: double invalidate\n"); @@ -99,8 +95,7 @@ void __pblk_map_invalidate(struct pblk *pblk, struct pblk_line *line, spin_lock(&l_mg->gc_lock); spin_lock(&line->lock); /* Prevent moving a line that has just been chosen for GC */ - if (line->state == PBLK_LINESTATE_GC || - line->state == PBLK_LINESTATE_FREE) { + if (line->state == PBLK_LINESTATE_GC) { spin_unlock(&line->lock); spin_unlock(&l_mg->gc_lock); return; @@ -1766,6 +1761,7 @@ void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) { + #ifdef CONFIG_NVM_DEBUG /* Callers must ensure that the ppa points to a cache address */ BUG_ON(!pblk_addr_in_cache(ppa)); @@ -1776,9 +1772,9 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) } int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, - struct pblk_line *gc_line) + struct pblk_line *gc_line, u64 paddr_gc) { - struct ppa_addr ppa_l2p; + struct ppa_addr ppa_l2p, ppa_gc; int ret = 1; #ifdef CONFIG_NVM_DEBUG @@ -1795,10 +1791,13 @@ int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa_new, spin_lock(&pblk->trans_lock); ppa_l2p = pblk_trans_map_get(pblk, lba); + ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, gc_line->id); - /* Prevent updated entries to be overwritten by GC */ - if (pblk_addr_in_cache(ppa_l2p) || pblk_ppa_empty(ppa_l2p) || - pblk_tgt_ppa_to_line(ppa_l2p) != gc_line->id) { + if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) { + spin_lock(&gc_line->lock); + WARN(!test_bit(paddr_gc, gc_line->invalid_bitmap), + "pblk: corrupted GC update"); + spin_unlock(&gc_line->lock); ret = 0; goto out; @@ -1870,15 +1869,13 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, u64 *lba_list, int nr_secs) { - sector_t lba; + u64 lba; int i; spin_lock(&pblk->trans_lock); for (i = 0; i < nr_secs; i++) { lba = lba_list[i]; - if (lba == ADDR_EMPTY) { - ppas[i].ppa = ADDR_EMPTY; - } else { + if (lba != ADDR_EMPTY) { /* logic error: lba out-of-bounds. Ignore update */ if (!(lba < pblk->rl.nr_secs)) { WARN(1, "pblk: corrupted L2P map request\n"); diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index c21b2077432a..7ad0cfe58a21 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -20,7 +20,8 @@ static void pblk_gc_free_gc_rq(struct pblk_gc_rq *gc_rq) { - vfree(gc_rq->data); + if (gc_rq->data) + vfree(gc_rq->data); kfree(gc_rq); } @@ -41,10 +42,7 @@ static int pblk_gc_write(struct pblk *pblk) spin_unlock(&gc->w_lock); list_for_each_entry_safe(gc_rq, tgc_rq, &w_list, list) { - pblk_write_gc_to_cache(pblk, gc_rq->data, gc_rq->lba_list, - gc_rq->nr_secs, gc_rq->secs_to_gc, - gc_rq->line, PBLK_IOTYPE_GC); - + pblk_write_gc_to_cache(pblk, gc_rq); list_del(&gc_rq->list); kref_put(&gc_rq->line->ref, pblk_line_put); pblk_gc_free_gc_rq(gc_rq); @@ -69,27 +67,23 @@ static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) struct pblk_gc *gc = &pblk->gc; struct pblk_line *line = gc_rq->line; void *data; - unsigned int secs_to_gc; int ret = 0; data = vmalloc(gc_rq->nr_secs * geo->sec_size); if (!data) { ret = -ENOMEM; - goto out; + goto fail; } - /* Read from GC victim block */ - if (pblk_submit_read_gc(pblk, gc_rq->lba_list, data, gc_rq->nr_secs, - &secs_to_gc, line)) { - ret = -EFAULT; - goto free_data; - } + gc_rq->data = data; - if (!secs_to_gc) - goto free_rq; + /* Read from GC victim block */ + ret = pblk_submit_read_gc(pblk, gc_rq); + if (ret) + goto fail; - gc_rq->data = data; - gc_rq->secs_to_gc = secs_to_gc; + if (!gc_rq->secs_to_gc) + goto fail; retry: spin_lock(&gc->w_lock); @@ -107,11 +101,8 @@ retry: return 0; -free_rq: - kfree(gc_rq); -free_data: - vfree(data); -out: +fail: + pblk_gc_free_gc_rq(gc_rq); kref_put(&line->ref, pblk_line_put); return ret; } @@ -167,14 +158,21 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) struct pblk_line_ws *gc_rq_ws; struct pblk_gc_rq *gc_rq; __le64 *lba_list; + unsigned long *invalid_bitmap; int sec_left, nr_secs, bit; int ret; + invalid_bitmap = kmalloc(lm->sec_bitmap_len, GFP_KERNEL); + if (!invalid_bitmap) { + pr_err("pblk: could not allocate GC invalid bitmap\n"); + goto fail_free_ws; + } + emeta_buf = pblk_malloc(lm->emeta_len[0], l_mg->emeta_alloc_type, GFP_KERNEL); if (!emeta_buf) { pr_err("pblk: cannot use GC emeta\n"); - return; + goto fail_free_bitmap; } ret = pblk_line_read_emeta(pblk, line, emeta_buf); @@ -193,7 +191,11 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work) goto fail_free_emeta; } + spin_lock(&line->lock); + bitmap_copy(invalid_bitmap, line->invalid_bitmap, lm->sec_per_line); sec_left = pblk_line_vsc(line); + spin_unlock(&line->lock); + if (sec_left < 0) { pr_err("pblk: corrupted GC line (%d)\n", line->id); goto fail_free_emeta; @@ -207,11 +209,12 @@ next_rq: nr_secs = 0; do { - bit = find_next_zero_bit(line->invalid_bitmap, lm->sec_per_line, + bit = find_next_zero_bit(invalid_bitmap, lm->sec_per_line, bit + 1); if (bit > line->emeta_ssec) break; + gc_rq->paddr_list[nr_secs] = bit; gc_rq->lba_list[nr_secs++] = le64_to_cpu(lba_list[bit]); } while (nr_secs < pblk->max_write_pgs); @@ -244,6 +247,7 @@ next_rq: out: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); kfree(line_ws); + kfree(invalid_bitmap); kref_put(&line->ref, pblk_line_put); atomic_dec(&gc->inflight_gc); @@ -254,9 +258,13 @@ fail_free_gc_rq: kfree(gc_rq); fail_free_emeta: pblk_mfree(emeta_buf, l_mg->emeta_alloc_type); +fail_free_bitmap: + kfree(invalid_bitmap); +fail_free_ws: + kfree(line_ws); + pblk_put_line_back(pblk, line); kref_put(&line->ref, pblk_line_put); - kfree(line_ws); atomic_dec(&gc->inflight_gc); pr_err("pblk: Failed to GC line %d\n", line->id); diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 9bc32578a766..74c768ce09ef 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -325,8 +325,8 @@ void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, } void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, - unsigned int ring_pos) + struct pblk_w_ctx w_ctx, struct pblk_line *line, + u64 paddr, unsigned int ring_pos) { struct pblk *pblk = container_of(rb, struct pblk, rwb); struct pblk_rb_entry *entry; @@ -341,7 +341,7 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, __pblk_rb_write_entry(rb, data, w_ctx, entry); - if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, gc_line)) + if (!pblk_update_map_gc(pblk, w_ctx.lba, entry->cacheline, line, paddr)) entry->w_ctx.lba = ADDR_EMPTY; flags = w_ctx.flags | PBLK_WRITTEN_DATA; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index eaaf9d55ba97..c28d6509312e 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -388,34 +388,40 @@ fail_rqd_free: static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_line *line, u64 *lba_list, - unsigned int nr_secs) + u64 *paddr_list_gc, unsigned int nr_secs) { - struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; + struct ppa_addr ppa_list_l2p[PBLK_MAX_REQ_ADDRS]; + struct ppa_addr ppa_gc; int valid_secs = 0; int i; - pblk_lookup_l2p_rand(pblk, ppas, lba_list, nr_secs); + pblk_lookup_l2p_rand(pblk, ppa_list_l2p, lba_list, nr_secs); for (i = 0; i < nr_secs; i++) { - if (pblk_addr_in_cache(ppas[i]) || ppas[i].g.blk != line->id || - pblk_ppa_empty(ppas[i])) { - lba_list[i] = ADDR_EMPTY; + if (lba_list[i] == ADDR_EMPTY) + continue; + + ppa_gc = addr_to_gen_ppa(pblk, paddr_list_gc[i], line->id); + if (!pblk_ppa_comp(ppa_list_l2p[i], ppa_gc)) { + paddr_list_gc[i] = lba_list[i] = ADDR_EMPTY; continue; } - rqd->ppa_list[valid_secs++] = ppas[i]; + rqd->ppa_list[valid_secs++] = ppa_list_l2p[i]; } #ifdef CONFIG_NVM_DEBUG atomic_long_add(valid_secs, &pblk->inflight_reads); #endif + return valid_secs; } static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_line *line, sector_t lba) + struct pblk_line *line, sector_t lba, + u64 paddr_gc) { - struct ppa_addr ppa; + struct ppa_addr ppa_l2p, ppa_gc; int valid_secs = 0; if (lba == ADDR_EMPTY) @@ -428,15 +434,14 @@ static int read_rq_gc(struct pblk *pblk, struct nvm_rq *rqd, } spin_lock(&pblk->trans_lock); - ppa = pblk_trans_map_get(pblk, lba); + ppa_l2p = pblk_trans_map_get(pblk, lba); spin_unlock(&pblk->trans_lock); - /* Ignore updated values until the moment */ - if (pblk_addr_in_cache(ppa) || ppa.g.blk != line->id || - pblk_ppa_empty(ppa)) + ppa_gc = addr_to_gen_ppa(pblk, paddr_gc, line->id); + if (!pblk_ppa_comp(ppa_l2p, ppa_gc)) goto out; - rqd->ppa_addr = ppa; + rqd->ppa_addr = ppa_l2p; valid_secs = 1; #ifdef CONFIG_NVM_DEBUG @@ -447,15 +452,14 @@ out: return valid_secs; } -int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, - unsigned int nr_secs, unsigned int *secs_to_gc, - struct pblk_line *line) +int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; struct bio *bio; struct nvm_rq rqd; - int ret, data_len; + int data_len; + int ret = NVM_IO_OK; DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); @@ -463,25 +467,29 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, rqd.meta_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL, &rqd.dma_meta_list); if (!rqd.meta_list) - return NVM_IO_ERR; + return -ENOMEM; - if (nr_secs > 1) { + if (gc_rq->nr_secs > 1) { rqd.ppa_list = rqd.meta_list + pblk_dma_meta_size; rqd.dma_ppa_list = rqd.dma_meta_list + pblk_dma_meta_size; - *secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, line, lba_list, - nr_secs); - if (*secs_to_gc == 1) + gc_rq->secs_to_gc = read_ppalist_rq_gc(pblk, &rqd, gc_rq->line, + gc_rq->lba_list, + gc_rq->paddr_list, + gc_rq->nr_secs); + if (gc_rq->secs_to_gc == 1) rqd.ppa_addr = rqd.ppa_list[0]; } else { - *secs_to_gc = read_rq_gc(pblk, &rqd, line, lba_list[0]); + gc_rq->secs_to_gc = read_rq_gc(pblk, &rqd, gc_rq->line, + gc_rq->lba_list[0], + gc_rq->paddr_list[0]); } - if (!(*secs_to_gc)) + if (!(gc_rq->secs_to_gc)) goto out; - data_len = (*secs_to_gc) * geo->sec_size; - bio = pblk_bio_map_addr(pblk, data, *secs_to_gc, data_len, + data_len = (gc_rq->secs_to_gc) * geo->sec_size; + bio = pblk_bio_map_addr(pblk, gc_rq->data, gc_rq->secs_to_gc, data_len, PBLK_VMALLOC_META, GFP_KERNEL); if (IS_ERR(bio)) { pr_err("pblk: could not allocate GC bio (%lu)\n", PTR_ERR(bio)); @@ -494,13 +502,12 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, rqd.opcode = NVM_OP_PREAD; rqd.end_io = pblk_end_io_sync; rqd.private = &wait; - rqd.nr_ppas = *secs_to_gc; + rqd.nr_ppas = gc_rq->secs_to_gc; rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd.bio = bio; - ret = pblk_submit_read_io(pblk, &rqd); - if (ret) { - bio_endio(bio); + if (pblk_submit_read_io(pblk, &rqd)) { + ret = -EIO; pr_err("pblk: GC read request failed\n"); goto err_free_bio; } @@ -519,19 +526,19 @@ int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, } #ifdef CONFIG_NVM_DEBUG - atomic_long_add(*secs_to_gc, &pblk->sync_reads); - atomic_long_add(*secs_to_gc, &pblk->recov_gc_reads); - atomic_long_sub(*secs_to_gc, &pblk->inflight_reads); + atomic_long_add(gc_rq->secs_to_gc, &pblk->sync_reads); + atomic_long_add(gc_rq->secs_to_gc, &pblk->recov_gc_reads); + atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); #endif bio_put(bio); out: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); - return NVM_IO_OK; + return ret; err_free_bio: bio_put(bio); err_free_dma: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); - return NVM_IO_ERR; + return ret; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index baa6a633990f..876b50f97234 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -206,6 +206,7 @@ struct pblk_lun { struct pblk_gc_rq { struct pblk_line *line; void *data; + u64 paddr_list[PBLK_MAX_REQ_ADDRS]; u64 lba_list[PBLK_MAX_REQ_ADDRS]; int nr_secs; int secs_to_gc; @@ -658,8 +659,8 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int nr_entries, void pblk_rb_write_entry_user(struct pblk_rb *rb, void *data, struct pblk_w_ctx w_ctx, unsigned int pos); void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data, - struct pblk_w_ctx w_ctx, struct pblk_line *gc_line, - unsigned int pos); + struct pblk_w_ctx w_ctx, struct pblk_line *line, + u64 paddr, unsigned int pos); struct pblk_w_ctx *pblk_rb_w_ctx(struct pblk_rb *rb, unsigned int pos); void pblk_rb_flush(struct pblk_rb *rb); @@ -761,7 +762,7 @@ void pblk_update_map_cache(struct pblk *pblk, sector_t lba, void pblk_update_map_dev(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, struct ppa_addr entry_line); int pblk_update_map_gc(struct pblk *pblk, sector_t lba, struct ppa_addr ppa, - struct pblk_line *gc_line); + struct pblk_line *gc_line, u64 paddr); void pblk_lookup_l2p_rand(struct pblk *pblk, struct ppa_addr *ppas, u64 *lba_list, int nr_secs); void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, @@ -772,9 +773,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, */ int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags); -int pblk_write_gc_to_cache(struct pblk *pblk, void *data, u64 *lba_list, - unsigned int nr_entries, unsigned int nr_rec_entries, - struct pblk_line *gc_line, unsigned long flags); +int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq); /* * pblk map @@ -798,9 +797,7 @@ void pblk_write_should_kick(struct pblk *pblk); */ extern struct bio_set *pblk_bio_set; int pblk_submit_read(struct pblk *pblk, struct bio *bio); -int pblk_submit_read_gc(struct pblk *pblk, u64 *lba_list, void *data, - unsigned int nr_secs, unsigned int *secs_to_gc, - struct pblk_line *line); +int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq); /* * pblk recovery */ @@ -893,13 +890,7 @@ static inline void *emeta_to_vsc(struct pblk *pblk, struct line_emeta *emeta) static inline int pblk_line_vsc(struct pblk_line *line) { - int vsc; - - spin_lock(&line->lock); - vsc = le32_to_cpu(*line->vsc); - spin_unlock(&line->lock); - - return vsc; + return le32_to_cpu(*line->vsc); } #define NVM_MEM_PAGE_WRITE (8) -- cgit v1.2.1 From 2a19b10d423c6dc47449e905ed3a8eabb49c48a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:15 +0200 Subject: lightnvm: pblk: refactor read path on GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify the part of the garbage collector where data is read from the line being recycled and moved into an internal queue before being copied to the memory buffer. This allows to get rid of a dedicated function, which introduces an unnecessary dependency on the code. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 94 +++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 55 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 7ad0cfe58a21..7b103bce58bf 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -56,57 +56,6 @@ static void pblk_gc_writer_kick(struct pblk_gc *gc) wake_up_process(gc->gc_writer_ts); } -/* - * Responsible for managing all memory related to a gc request. Also in case of - * failure - */ -static int pblk_gc_move_valid_secs(struct pblk *pblk, struct pblk_gc_rq *gc_rq) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_gc *gc = &pblk->gc; - struct pblk_line *line = gc_rq->line; - void *data; - int ret = 0; - - data = vmalloc(gc_rq->nr_secs * geo->sec_size); - if (!data) { - ret = -ENOMEM; - goto fail; - } - - gc_rq->data = data; - - /* Read from GC victim block */ - ret = pblk_submit_read_gc(pblk, gc_rq); - if (ret) - goto fail; - - if (!gc_rq->secs_to_gc) - goto fail; - -retry: - spin_lock(&gc->w_lock); - if (gc->w_entries >= PBLK_GC_RQ_QD) { - spin_unlock(&gc->w_lock); - pblk_gc_writer_kick(&pblk->gc); - usleep_range(128, 256); - goto retry; - } - gc->w_entries++; - list_add_tail(&gc_rq->list, &gc->w_list); - spin_unlock(&gc->w_lock); - - pblk_gc_writer_kick(&pblk->gc); - - return 0; - -fail: - pblk_gc_free_gc_rq(gc_rq); - kref_put(&line->ref, pblk_line_put); - return ret; -} - static void pblk_put_line_back(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -130,18 +79,53 @@ static void pblk_gc_line_ws(struct work_struct *work) struct pblk_line_ws *gc_rq_ws = container_of(work, struct pblk_line_ws, ws); struct pblk *pblk = gc_rq_ws->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; struct pblk_gc *gc = &pblk->gc; struct pblk_line *line = gc_rq_ws->line; struct pblk_gc_rq *gc_rq = gc_rq_ws->priv; + int ret; up(&gc->gc_sem); - if (pblk_gc_move_valid_secs(pblk, gc_rq)) { - pr_err("pblk: could not GC all sectors: line:%d (%d/%d)\n", - line->id, *line->vsc, - gc_rq->nr_secs); + gc_rq->data = vmalloc(gc_rq->nr_secs * geo->sec_size); + if (!gc_rq->data) { + pr_err("pblk: could not GC line:%d (%d/%d)\n", + line->id, *line->vsc, gc_rq->nr_secs); + goto out; + } + + /* Read from GC victim block */ + ret = pblk_submit_read_gc(pblk, gc_rq); + if (ret) { + pr_err("pblk: failed GC read in line:%d (err:%d)\n", + line->id, ret); + goto out; } + if (!gc_rq->secs_to_gc) + goto out; + +retry: + spin_lock(&gc->w_lock); + if (gc->w_entries >= PBLK_GC_RQ_QD) { + spin_unlock(&gc->w_lock); + pblk_gc_writer_kick(&pblk->gc); + usleep_range(128, 256); + goto retry; + } + gc->w_entries++; + list_add_tail(&gc_rq->list, &gc->w_list); + spin_unlock(&gc->w_lock); + + pblk_gc_writer_kick(&pblk->gc); + + kfree(gc_rq_ws); + return; + +out: + pblk_gc_free_gc_rq(gc_rq); + kref_put(&line->ref, pblk_line_put); kfree(gc_rq_ws); } -- cgit v1.2.1 From 55e836d401601e7903b36db015ce899dc11085ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:16 +0200 Subject: lightnvm: pblk: put bio on bio completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Simplify put bio by doing it on bio end_io instead of manually putting it on the completion path. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 10 +++++++--- drivers/lightnvm/pblk-read.c | 1 - drivers/lightnvm/pblk-recovery.c | 1 - drivers/lightnvm/pblk-write.c | 8 +------- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 6dd4866e579c..3d27d24baa0b 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -417,6 +417,11 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) return nvm_submit_io(dev, rqd); } +static void pblk_bio_map_addr_endio(struct bio *bio) +{ + bio_put(bio); +} + struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, int alloc_type, gfp_t gfp_mask) @@ -453,6 +458,8 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, kaddr += PAGE_SIZE; } + + bio->bi_end_io = pblk_bio_map_addr_endio; out: return bio; } @@ -671,9 +678,6 @@ next_rq: atomic_dec(&pblk->inflight_io); reinit_completion(&wait); - if (likely(pblk->l_mg.emeta_alloc_type == PBLK_VMALLOC_META)) - bio_put(bio); - if (rqd.error) { if (dir == WRITE) pblk_log_write_err(pblk, &rqd); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index c28d6509312e..e7141b1aaded 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -531,7 +531,6 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) atomic_long_sub(gc_rq->secs_to_gc, &pblk->inflight_reads); #endif - bio_put(bio); out: nvm_dev_dma_free(dev->parent, rqd.meta_list, rqd.dma_meta_list); return ret; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 6b6b4183b41e..e59270e60b58 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -333,7 +333,6 @@ static void pblk_end_io_recov(struct nvm_rq *rqd) pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); - bio_put(rqd->bio); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, WRITE); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 0fb8f26a6311..0c0481cf9f5d 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -188,17 +188,12 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) pblk_log_write_err(pblk, rqd); pr_err("pblk: metadata I/O failed. Line %d\n", line->id); } -#ifdef CONFIG_NVM_DEBUG - else - WARN_ONCE(rqd->bio->bi_status, "pblk: corrupted write error\n"); -#endif sync = atomic_add_return(rqd->nr_ppas, &emeta->sync); if (sync == emeta->nr_entries) pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, GFP_ATOMIC, pblk->close_wq); - bio_put(rqd->bio); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, READ); @@ -427,8 +422,7 @@ fail_rollback: nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); fail_free_bio: - if (likely(l_mg->emeta_alloc_type == PBLK_VMALLOC_META)) - bio_put(bio); + bio_put(bio); fail_free_rqd: pblk_free_rqd(pblk, rqd, READ); return ret; -- cgit v1.2.1 From 6ca2f71f3e3d94d188000b420ce0529b07f3ce95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:17 +0200 Subject: lightnvm: pblk: simplify path on REQ_PREFLUSH MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On REQ_PREFLUSH, directly tag the I/O context flags to signal a flush in the write to cache path, instead of finding the correct entry context and imposing a memory barrier. This simplifies the code and might potentially prevent race conditions when adding functionality to the write path. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-cache.c | 4 +++- drivers/lightnvm/pblk-rb.c | 8 +------- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c index 1d6b8e3585f1..0d227ef7d1b9 100644 --- a/drivers/lightnvm/pblk-cache.c +++ b/drivers/lightnvm/pblk-cache.c @@ -43,8 +43,10 @@ retry: if (unlikely(!bio_has_data(bio))) goto out; - w_ctx.flags = flags; pblk_ppa_set_empty(&w_ctx.ppa); + w_ctx.flags = flags; + if (bio->bi_opf & REQ_PREFLUSH) + w_ctx.flags |= PBLK_FLUSH_ENTRY; for (i = 0; i < nr_entries; i++) { void *data = bio_data(bio); diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 74c768ce09ef..05e6b2e9221d 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -355,7 +355,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, { struct pblk_rb_entry *entry; unsigned int subm, sync_point; - int flags; subm = READ_ONCE(rb->subm); @@ -369,12 +368,6 @@ static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio, sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1); entry = &rb->entries[sync_point]; - flags = READ_ONCE(entry->w_ctx.flags); - flags |= PBLK_FLUSH_ENTRY; - - /* Release flags on context. Protect from writes */ - smp_store_release(&entry->w_ctx.flags, flags); - /* Protect syncs */ smp_store_release(&rb->sync_point, sync_point); @@ -454,6 +447,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries, /* Protect from read count */ smp_store_release(&rb->mem, mem); + return 1; } -- cgit v1.2.1 From 875d94f3a4838f2243334e5ce55ac8153f9bbf5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:18 +0200 Subject: lightnvm: pblk: allocate bio size more accurately MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wait until we know the exact number of ppas to be sent to the device, before allocating the bio. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rb.c | 5 +++-- drivers/lightnvm/pblk-write.c | 20 ++++++++++---------- drivers/lightnvm/pblk.h | 4 ++-- 3 files changed, 15 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 05e6b2e9221d..1173e2380137 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -552,12 +552,13 @@ out: * persist data on the write buffer to the media. */ unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - struct bio *bio, unsigned int pos, - unsigned int nr_entries, unsigned int count) + unsigned int pos, unsigned int nr_entries, + unsigned int count) { struct pblk *pblk = container_of(rb, struct pblk, rwb); struct request_queue *q = pblk->dev->q; struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); + struct bio *bio = rqd->bio; struct pblk_rb_entry *entry; struct page *page; unsigned int pad = 0, to_read = nr_entries; diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 0c0481cf9f5d..140a26edd1d3 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -534,24 +534,24 @@ static int pblk_submit_write(struct pblk *pblk) if (!secs_to_flush && secs_avail < pblk->min_write_pgs) return 1; - bio = bio_alloc(GFP_KERNEL, pblk->max_write_pgs); - - bio->bi_iter.bi_sector = 0; /* internal bio */ - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - - rqd = pblk_alloc_rqd(pblk, WRITE); - rqd->bio = bio; - secs_to_sync = pblk_calc_secs_to_sync(pblk, secs_avail, secs_to_flush); if (secs_to_sync > pblk->max_write_pgs) { pr_err("pblk: bad buffer sync calculation\n"); - goto fail_put_bio; + return 1; } secs_to_com = (secs_to_sync > secs_avail) ? secs_avail : secs_to_sync; pos = pblk_rb_read_commit(&pblk->rwb, secs_to_com); - if (pblk_rb_read_to_bio(&pblk->rwb, rqd, bio, pos, secs_to_sync, + bio = bio_alloc(GFP_KERNEL, secs_to_sync); + + bio->bi_iter.bi_sector = 0; /* internal bio */ + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + rqd = pblk_alloc_rqd(pblk, WRITE); + rqd->bio = bio; + + if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, secs_avail)) { pr_err("pblk: corrupted write bio\n"); goto fail_put_bio; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 876b50f97234..9f162057d497 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -666,8 +666,8 @@ void pblk_rb_flush(struct pblk_rb *rb); void pblk_rb_sync_l2p(struct pblk_rb *rb); unsigned int pblk_rb_read_to_bio(struct pblk_rb *rb, struct nvm_rq *rqd, - struct bio *bio, unsigned int pos, - unsigned int nr_entries, unsigned int count); + unsigned int pos, unsigned int nr_entries, + unsigned int count); unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio, struct list_head *list, unsigned int max); -- cgit v1.2.1 From e2cddf2082e700218b898b1c899f6a1c2130074a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:19 +0200 Subject: lightnvm: pblk: improve naming for internal req. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each request type sent to the LightNVM subsystem requires different metadata. Until now, we have tailored this metadata based on write, read and erase commands. However, pblk uses different metadata for internal writes that do not hit the write buffer. Instead of abusing the metadata for reads, create a new request type - internal write to improve code readability. In the process, create internal values for each I/O type instead of abusing the READ/WRITE macros, as suggested by Christoph. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 32 ++++++++++++++++---------------- drivers/lightnvm/pblk-read.c | 6 +++--- drivers/lightnvm/pblk-recovery.c | 12 ++++++------ drivers/lightnvm/pblk-write.c | 16 ++++++++-------- drivers/lightnvm/pblk.h | 11 ++++++++--- 5 files changed, 41 insertions(+), 36 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 3d27d24baa0b..a492964abea8 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -152,7 +152,7 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) struct nvm_rq *rqd; int rq_size; - if (rw == WRITE) { + if (rw == PBLK_WRITE) { pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; } else { @@ -170,7 +170,7 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) { mempool_t *pool; - if (rw == WRITE) + if (rw == PBLK_WRITE) pool = pblk->w_rq_pool; else pool = pblk->r_rq_pool; @@ -569,10 +569,10 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, int ret; DECLARE_COMPLETION_ONSTACK(wait); - if (dir == WRITE) { + if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; cmd_op = NVM_OP_PWRITE; - } else if (dir == READ) { + } else if (dir == PBLK_READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; } else @@ -612,10 +612,10 @@ next_rq: rqd.end_io = pblk_end_io_sync; rqd.private = &wait; - if (dir == WRITE) { + if (dir == PBLK_WRITE) { struct pblk_sec_meta *meta_list = rqd.meta_list; - rqd.flags = pblk_set_progr_mode(pblk, WRITE); + rqd.flags = pblk_set_progr_mode(pblk, PBLK_WRITE); for (i = 0; i < rqd.nr_ppas; ) { spin_lock(&line->lock); paddr = __pblk_alloc_page(pblk, line, min); @@ -679,7 +679,7 @@ next_rq: reinit_completion(&wait); if (rqd.error) { - if (dir == WRITE) + if (dir == PBLK_WRITE) pblk_log_write_err(pblk, &rqd); else pblk_log_read_err(pblk, &rqd); @@ -722,12 +722,12 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, int flags; DECLARE_COMPLETION_ONSTACK(wait); - if (dir == WRITE) { + if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; cmd_op = NVM_OP_PWRITE; - flags = pblk_set_progr_mode(pblk, WRITE); + flags = pblk_set_progr_mode(pblk, PBLK_WRITE); lba_list = emeta_to_lbas(pblk, line->emeta->buf); - } else if (dir == READ) { + } else if (dir == PBLK_READ) { bio_op = REQ_OP_READ; cmd_op = NVM_OP_PREAD; flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -765,7 +765,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, rqd.ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); - if (dir == WRITE) { + if (dir == PBLK_WRITE) { __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); meta_list[i].lba = lba_list[paddr] = addr_empty; @@ -791,7 +791,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, atomic_dec(&pblk->inflight_io); if (rqd.error) { - if (dir == WRITE) + if (dir == PBLK_WRITE) pblk_log_write_err(pblk, &rqd); else pblk_log_read_err(pblk, &rqd); @@ -807,14 +807,14 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line) { u64 bpaddr = pblk_line_smeta_start(pblk, line); - return pblk_line_submit_smeta_io(pblk, line, bpaddr, READ); + return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ); } int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, void *emeta_buf) { return pblk_line_submit_emeta_io(pblk, line, emeta_buf, - line->emeta_ssec, READ); + line->emeta_ssec, PBLK_READ); } static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, @@ -823,7 +823,7 @@ static void pblk_setup_e_rq(struct pblk *pblk, struct nvm_rq *rqd, rqd->opcode = NVM_OP_ERASE; rqd->ppa_addr = ppa; rqd->nr_ppas = 1; - rqd->flags = pblk_set_progr_mode(pblk, ERASE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_ERASE); rqd->bio = NULL; } @@ -1045,7 +1045,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, line->smeta_ssec = off; line->cur_sec = off + lm->smeta_sec; - if (init && pblk_line_submit_smeta_io(pblk, line, off, WRITE)) { + if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) { pr_debug("pblk: line smeta I/O failed. Retry\n"); return 1; } diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index e7141b1aaded..4b1722fbe5a0 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -142,7 +142,7 @@ static void pblk_end_io_read(struct nvm_rq *rqd) atomic_long_sub(rqd->nr_ppas, &pblk->inflight_reads); #endif - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_READ); atomic_dec(&pblk->inflight_io); } @@ -307,7 +307,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) bitmap_zero(&read_bitmap, nr_secs); - rqd = pblk_alloc_rqd(pblk, READ); + rqd = pblk_alloc_rqd(pblk, PBLK_READ); rqd->opcode = NVM_OP_PREAD; rqd->bio = bio; @@ -382,7 +382,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) return NVM_IO_OK; fail_rqd_free: - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_READ); return ret; } diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index e59270e60b58..19f2fb1a9e4b 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -67,7 +67,7 @@ void pblk_submit_rec(struct work_struct *work) err: bio_put(bio); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); } int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, @@ -80,7 +80,7 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx, struct pblk_c_ctx *rec_ctx; int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded; - rec_rqd = pblk_alloc_rqd(pblk, WRITE); + rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); rec_ctx = nvm_rq_to_pdu(rec_rqd); /* Copy completion bitmap, but exclude the first X completed entries */ @@ -334,7 +334,7 @@ static void pblk_end_io_recov(struct nvm_rq *rqd) pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); kref_put(&pad_rq->ref, pblk_recov_complete); @@ -404,11 +404,11 @@ next_pad_rq: bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd = pblk_alloc_rqd(pblk, WRITE); + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); rqd->bio = bio; rqd->opcode = NVM_OP_PWRITE; - rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); rqd->meta_list = meta_list; rqd->nr_ppas = rq_ppas; rqd->ppa_list = ppa_list; @@ -782,7 +782,7 @@ static int pblk_recov_l2p_from_oob(struct pblk *pblk, struct pblk_line *line) goto free_meta_list; } - rqd = pblk_alloc_rqd(pblk, READ); + rqd = pblk_alloc_rqd(pblk, PBLK_READ); p.ppa_list = ppa_list; p.meta_list = meta_list; diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index 140a26edd1d3..c1b8b83e149d 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -46,7 +46,7 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); bio_put(rqd->bio); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); return ret; } @@ -195,7 +195,7 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) GFP_ATOMIC, pblk->close_wq); nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); } @@ -209,7 +209,7 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, /* Setup write request */ rqd->opcode = NVM_OP_PWRITE; rqd->nr_ppas = nr_secs; - rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); rqd->private = pblk; rqd->end_io = end_io; @@ -275,7 +275,7 @@ int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap, c_ctx->nr_valid, 0); rqd->ppa_status = (u64)0; - rqd->flags = pblk_set_progr_mode(pblk, WRITE); + rqd->flags = pblk_set_progr_mode(pblk, PBLK_WRITE); return ret; } @@ -366,7 +366,7 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) int i, j; int ret; - rqd = pblk_alloc_rqd(pblk, READ); + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE_INT); m_ctx = nvm_rq_to_pdu(rqd); m_ctx->private = meta_line; @@ -424,7 +424,7 @@ fail_rollback: fail_free_bio: bio_put(bio); fail_free_rqd: - pblk_free_rqd(pblk, rqd, READ); + pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); return ret; } @@ -548,7 +548,7 @@ static int pblk_submit_write(struct pblk *pblk) bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - rqd = pblk_alloc_rqd(pblk, WRITE); + rqd = pblk_alloc_rqd(pblk, PBLK_WRITE); rqd->bio = bio; if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync, @@ -570,7 +570,7 @@ fail_free_bio: pblk_free_write_rqd(pblk, rqd); fail_put_bio: bio_put(bio); - pblk_free_rqd(pblk, rqd, WRITE); + pblk_free_rqd(pblk, rqd, PBLK_WRITE); return 1; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 9f162057d497..d01e003d3d74 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -55,11 +55,16 @@ for ((i) = 0, rlun = &(pblk)->luns[0]; \ (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)]) -#define ERASE 2 /* READ = 0, WRITE = 1 */ - /* Static pool sizes */ #define PBLK_GEN_WS_POOL_SIZE (2) +enum { + PBLK_READ = READ, + PBLK_WRITE = WRITE,/* Write from write buffer */ + PBLK_WRITE_INT, /* Internal write - no write buffer */ + PBLK_ERASE, +}; + enum { /* IO Types */ PBLK_IOTYPE_USER = 1 << 0, @@ -1132,7 +1137,7 @@ static inline int pblk_set_progr_mode(struct pblk *pblk, int type) flags = geo->plane_mode >> 1; - if (type == WRITE) + if (type == PBLK_WRITE) flags |= NVM_IO_SCRAMBLE_ENABLE; return flags; -- cgit v1.2.1 From 67bf26a3220e3bd403a62a9289aa1d065d3db82c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:20 +0200 Subject: lightnvm: pblk: refactor rqd alloc/free MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactor the rqd allocation and free functions so that all I/O types can use these helper functions. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 40 ++++++++++++++++++++++++++++++---------- drivers/lightnvm/pblk-read.c | 2 -- drivers/lightnvm/pblk-recovery.c | 2 -- drivers/lightnvm/pblk-write.c | 7 ------- drivers/lightnvm/pblk.h | 4 ++-- 5 files changed, 32 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index a492964abea8..0de3875211b1 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -146,18 +146,26 @@ static void pblk_invalidate_range(struct pblk *pblk, sector_t slba, spin_unlock(&pblk->trans_lock); } -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) +/* Caller must guarantee that the request is a valid type */ +struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type) { mempool_t *pool; struct nvm_rq *rqd; int rq_size; - if (rw == PBLK_WRITE) { + switch (type) { + case PBLK_WRITE: + case PBLK_WRITE_INT: pool = pblk->w_rq_pool; rq_size = pblk_w_rq_size; - } else { + break; + case PBLK_READ: pool = pblk->r_rq_pool; rq_size = pblk_g_rq_size; + break; + default: + pool = pblk->e_rq_pool; + rq_size = pblk_g_rq_size; } rqd = mempool_alloc(pool, GFP_KERNEL); @@ -166,15 +174,30 @@ struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw) return rqd; } -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw) +/* Typically used on completion path. Cannot guarantee request consistency */ +void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type) { + struct nvm_tgt_dev *dev = pblk->dev; mempool_t *pool; - if (rw == PBLK_WRITE) + switch (type) { + case PBLK_WRITE: + kfree(((struct pblk_c_ctx *)nvm_rq_to_pdu(rqd))->lun_bitmap); + case PBLK_WRITE_INT: pool = pblk->w_rq_pool; - else + break; + case PBLK_READ: pool = pblk->r_rq_pool; + break; + case PBLK_ERASE: + pool = pblk->e_rq_pool; + break; + default: + pr_err("pblk: trying to free unknown rqd type\n"); + return; + } + nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); mempool_free(rqd, pool); } @@ -1470,8 +1493,7 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) struct nvm_rq *rqd; int err; - rqd = mempool_alloc(pblk->e_rq_pool, GFP_KERNEL); - memset(rqd, 0, pblk_g_rq_size); + rqd = pblk_alloc_rqd(pblk, PBLK_ERASE); pblk_setup_e_rq(pblk, rqd, ppa); @@ -1739,8 +1761,6 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, rlun = &pblk->luns[bit]; up(&rlun->wr_sem); } - - kfree(lun_bitmap); } void pblk_update_map(struct pblk *pblk, sector_t lba, struct ppa_addr ppa) diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 4b1722fbe5a0..d7c90c303540 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -124,8 +124,6 @@ static void pblk_end_io_read(struct nvm_rq *rqd) WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); #endif - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - bio_put(bio); if (r_ctx->private) { struct bio *orig_bio = r_ctx->private; diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 19f2fb1a9e4b..686bc17f080f 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -329,11 +329,9 @@ static void pblk_end_io_recov(struct nvm_rq *rqd) { struct pblk_pad_rq *pad_rq = rqd->private; struct pblk *pblk = pad_rq->pblk; - struct nvm_tgt_dev *dev = pblk->dev; pblk_up_page(pblk, rqd->ppa_list, rqd->nr_ppas); - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index c1b8b83e149d..f2e846fe9242 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -20,7 +20,6 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx) { - struct nvm_tgt_dev *dev = pblk->dev; struct bio *original_bio; unsigned long ret; int i; @@ -43,8 +42,6 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd, ret = pblk_rb_sync_advance(&pblk->rwb, c_ctx->nr_valid); - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); - bio_put(rqd->bio); pblk_free_rqd(pblk, rqd, PBLK_WRITE); @@ -176,7 +173,6 @@ static void pblk_end_io_write(struct nvm_rq *rqd) static void pblk_end_io_write_meta(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; - struct nvm_tgt_dev *dev = pblk->dev; struct pblk_g_ctx *m_ctx = nvm_rq_to_pdu(rqd); struct pblk_line *line = m_ctx->private; struct pblk_emeta *emeta = line->emeta; @@ -194,7 +190,6 @@ static void pblk_end_io_write_meta(struct nvm_rq *rqd) pblk_gen_run_ws(pblk, line, NULL, pblk_line_close_ws, GFP_ATOMIC, pblk->close_wq); - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); pblk_free_rqd(pblk, rqd, PBLK_WRITE_INT); atomic_dec(&pblk->inflight_io); @@ -419,8 +414,6 @@ fail_rollback: pblk_dealloc_page(pblk, meta_line, rq_ppas); list_add(&meta_line->list, &meta_line->list); spin_unlock(&l_mg->close_lock); - - nvm_dev_dma_free(dev->parent, rqd->meta_list, rqd->dma_meta_list); fail_free_bio: bio_put(bio); fail_free_rqd: diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index d01e003d3d74..12a20f800c26 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -699,11 +699,11 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf); /* * pblk core */ -struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int rw); +struct nvm_rq *pblk_alloc_rqd(struct pblk *pblk, int type); +void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type); void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx); -void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int rw); void pblk_wait_for_meta(struct pblk *pblk); struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba); void pblk_discard(struct pblk *pblk, struct bio *bio); -- cgit v1.2.1 From 26532ee52b77185b095d29b54c83386f737a74ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:21 +0200 Subject: lightnvm: pblk: use rqd->end_io for completion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For consistency with the rest of pblk, use rqd->end_io to point to the function taking care of ending the request on the completion path. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 7 ------- drivers/lightnvm/pblk-read.c | 5 ++--- 2 files changed, 2 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0de3875211b1..08d166ac4f3c 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -261,13 +261,6 @@ void pblk_write_should_kick(struct pblk *pblk) pblk_write_kick(pblk); } -void pblk_end_bio_sync(struct bio *bio) -{ - struct completion *waiting = bio->bi_private; - - complete(waiting); -} - void pblk_end_io_sync(struct nvm_rq *rqd) { struct completion *waiting = rqd->private; diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index d7c90c303540..0299fc08291d 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -170,13 +170,12 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, new_bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(new_bio, REQ_OP_READ, 0); - new_bio->bi_private = &wait; - new_bio->bi_end_io = pblk_end_bio_sync; rqd->bio = new_bio; rqd->nr_ppas = nr_holes; rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); - rqd->end_io = NULL; + rqd->end_io = pblk_end_io_sync; + rqd->private = &wait; if (unlikely(nr_secs > 1 && nr_holes == 1)) { ppa_ptr = rqd->ppa_list; -- cgit v1.2.1 From a4809fee4e774fdf3296cc69c22ce6e6acef36b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:22 +0200 Subject: lightnvm: pblk: check lba sanity on read path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As part of pblk's recovery scheme, we store the lba mapped to each physical sector on the device's out-of-bound (OOB) area. On the read path, we can use this information to validate that the data being delivered to the upper layers corresponds to the lba being requested. The cost of this check is an extra copy on the DMA region on the device and an extra comparison in the host, given that (i) the OOB area is being read together with the data in the media, and (ii) the DMA region allocated for the ppa list can be reused for the metadata stored on the OOB area. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 51 +++++++++++++++++++++++++++++++++++++++++--- drivers/lightnvm/pblk.h | 4 +++- 2 files changed, 51 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 0299fc08291d..a465d9980df4 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -41,6 +41,7 @@ static int pblk_read_from_cache(struct pblk *pblk, struct bio *bio, static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, sector_t blba, unsigned long *read_bitmap) { + struct pblk_sec_meta *meta_list = rqd->meta_list; struct bio *bio = rqd->bio; struct ppa_addr ppas[PBLK_MAX_REQ_ADDRS]; int nr_secs = rqd->nr_ppas; @@ -56,6 +57,7 @@ static void pblk_read_ppalist_rq(struct pblk *pblk, struct nvm_rq *rqd, retry: if (pblk_ppa_empty(p)) { WARN_ON(test_and_set_bit(i, read_bitmap)); + meta_list[i].lba = cpu_to_le64(ADDR_EMPTY); if (unlikely(!advanced_bio)) { bio_advance(bio, (i) * PBLK_EXPOSED_PAGE_SIZE); @@ -75,6 +77,7 @@ retry: goto retry; } WARN_ON(test_and_set_bit(i, read_bitmap)); + meta_list[i].lba = cpu_to_le64(lba); advanced_bio = true; #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->cache_reads); @@ -110,10 +113,26 @@ static int pblk_submit_read_io(struct pblk *pblk, struct nvm_rq *rqd) return NVM_IO_OK; } +static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd, + sector_t blba) +{ + struct pblk_sec_meta *meta_list = rqd->meta_list; + int nr_lbas = rqd->nr_ppas; + int i; + + for (i = 0; i < nr_lbas; i++) { + u64 lba = le64_to_cpu(meta_list[i].lba); + + if (lba == ADDR_EMPTY) + continue; + + WARN(lba != blba + i, "pblk: corrupted read LBA\n"); + } +} + static void pblk_end_io_read(struct nvm_rq *rqd) { struct pblk *pblk = rqd->private; - struct nvm_tgt_dev *dev = pblk->dev; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct bio *bio = rqd->bio; @@ -124,6 +143,8 @@ static void pblk_end_io_read(struct nvm_rq *rqd) WARN_ONCE(bio->bi_status, "pblk: corrupted read error\n"); #endif + pblk_read_check(pblk, rqd, r_ctx->lba); + bio_put(bio); if (r_ctx->private) { struct bio *orig_bio = r_ctx->private; @@ -149,15 +170,21 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, unsigned long *read_bitmap) { struct bio *new_bio, *bio = rqd->bio; + struct pblk_sec_meta *meta_list = rqd->meta_list; struct bio_vec src_bv, dst_bv; void *ppa_ptr = NULL; void *src_p, *dst_p; dma_addr_t dma_ppa_list = 0; + __le64 *lba_list_mem, *lba_list_media; int nr_secs = rqd->nr_ppas; int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); int i, ret, hole; DECLARE_COMPLETION_ONSTACK(wait); + /* Re-use allocated memory for intermediate lbas */ + lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); + lba_list_media = (((void *)rqd->ppa_list) + 2 * pblk_dma_ppa_size); + new_bio = bio_alloc(GFP_KERNEL, nr_holes); if (pblk_bio_add_pages(pblk, new_bio, GFP_KERNEL, nr_holes)) @@ -168,6 +195,9 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, goto err; } + for (i = 0; i < nr_secs; i++) + lba_list_mem[i] = meta_list[i].lba; + new_bio->bi_iter.bi_sector = 0; /* internal bio */ bio_set_op_attrs(new_bio, REQ_OP_READ, 0); @@ -207,10 +237,17 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->dma_ppa_list = dma_ppa_list; } + for (i = 0; i < nr_secs; i++) { + lba_list_media[i] = meta_list[i].lba; + meta_list[i].lba = lba_list_mem[i]; + } + /* Fill the holes in the original bio */ i = 0; hole = find_first_zero_bit(read_bitmap, nr_secs); do { + meta_list[hole].lba = lba_list_media[i]; + src_bv = new_bio->bi_io_vec[i++]; dst_bv = bio->bi_io_vec[bio_init_idx + hole]; @@ -251,6 +288,7 @@ err: static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, sector_t lba, unsigned long *read_bitmap) { + struct pblk_sec_meta *meta_list = rqd->meta_list; struct bio *bio = rqd->bio; struct ppa_addr ppa; @@ -263,6 +301,7 @@ static void pblk_read_rq(struct pblk *pblk, struct nvm_rq *rqd, retry: if (pblk_ppa_empty(ppa)) { WARN_ON(test_and_set_bit(0, read_bitmap)); + meta_list[0].lba = cpu_to_le64(ADDR_EMPTY); return; } @@ -274,6 +313,9 @@ retry: pblk_lookup_l2p_seq(pblk, &ppa, lba, 1); goto retry; } + + meta_list[0].lba = cpu_to_le64(lba); + WARN_ON(test_and_set_bit(0, read_bitmap)); #ifdef CONFIG_NVM_DEBUG atomic_long_inc(&pblk->cache_reads); @@ -290,9 +332,10 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) struct nvm_tgt_dev *dev = pblk->dev; sector_t blba = pblk_get_lba(bio); unsigned int nr_secs = pblk_get_secs(bio); + struct pblk_g_ctx *r_ctx; struct nvm_rq *rqd; - unsigned long read_bitmap; /* Max 64 ppas per request */ unsigned int bio_init_idx; + unsigned long read_bitmap; /* Max 64 ppas per request */ int ret = NVM_IO_ERR; /* logic error: lba out-of-bounds. Ignore read request */ @@ -312,6 +355,9 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) rqd->private = pblk; rqd->end_io = pblk_end_io_read; + r_ctx = nvm_rq_to_pdu(rqd); + r_ctx->lba = blba; + /* Save the index for this bio's start. This is needed in case * we need to fill a partial read. */ @@ -344,7 +390,6 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) /* All sectors are to be read from the device */ if (bitmap_empty(&read_bitmap, rqd->nr_ppas)) { struct bio *int_bio = NULL; - struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); /* Clone read bio to deal with read errors internally */ int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 12a20f800c26..4a51e6d4d036 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -99,6 +99,7 @@ enum { }; #define pblk_dma_meta_size (sizeof(struct pblk_sec_meta) * PBLK_MAX_REQ_ADDRS) +#define pblk_dma_ppa_size (sizeof(u64) * PBLK_MAX_REQ_ADDRS) /* write buffer completion context */ struct pblk_c_ctx { @@ -110,9 +111,10 @@ struct pblk_c_ctx { unsigned int nr_padded; }; -/* generic context */ +/* read context */ struct pblk_g_ctx { void *private; + u64 lba; }; /* Pad context */ -- cgit v1.2.1 From 7bd4d370db6090004a06deb526f0f01fa99a3f9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:23 +0200 Subject: lightnvm: pblk: guarantee line integrity on reads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a line is recycled during garbage collection, reads can still be issued to the line. If the line is freed in the middle of this process, data corruption might occur. This patch guarantees that lines are not freed in the middle of reads that target them (lines). Specifically, we use the existing line reference to decide when a line is eligible for being freed after the recycle process. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 56 ++++++++++++++++++++++++++++++---- drivers/lightnvm/pblk-init.c | 14 +++++++-- drivers/lightnvm/pblk-read.c | 71 +++++++++++++++++++++++++++++++++----------- drivers/lightnvm/pblk.h | 2 ++ 4 files changed, 118 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 08d166ac4f3c..0a41fb998d55 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1460,10 +1460,8 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line) line->emeta = NULL; } -void pblk_line_put(struct kref *ref) +static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) { - struct pblk_line *line = container_of(ref, struct pblk_line, ref); - struct pblk *pblk = line->pblk; struct pblk_line_mgmt *l_mg = &pblk->l_mg; spin_lock(&line->lock); @@ -1481,6 +1479,43 @@ void pblk_line_put(struct kref *ref) pblk_rl_free_lines_inc(&pblk->rl, line); } +static void pblk_line_put_ws(struct work_struct *work) +{ + struct pblk_line_ws *line_put_ws = container_of(work, + struct pblk_line_ws, ws); + struct pblk *pblk = line_put_ws->pblk; + struct pblk_line *line = line_put_ws->line; + + __pblk_line_put(pblk, line); + mempool_free(line_put_ws, pblk->gen_ws_pool); +} + +void pblk_line_put(struct kref *ref) +{ + struct pblk_line *line = container_of(ref, struct pblk_line, ref); + struct pblk *pblk = line->pblk; + + __pblk_line_put(pblk, line); +} + +void pblk_line_put_wq(struct kref *ref) +{ + struct pblk_line *line = container_of(ref, struct pblk_line, ref); + struct pblk *pblk = line->pblk; + struct pblk_line_ws *line_put_ws; + + line_put_ws = mempool_alloc(pblk->gen_ws_pool, GFP_ATOMIC); + if (!line_put_ws) + return; + + line_put_ws->pblk = pblk; + line_put_ws->line = line; + line_put_ws->priv = NULL; + + INIT_WORK(&line_put_ws->ws, pblk_line_put_ws); + queue_work(pblk->r_end_wq, &line_put_ws->ws); +} + int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa) { struct nvm_rq *rqd; @@ -1878,8 +1913,19 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas, int i; spin_lock(&pblk->trans_lock); - for (i = 0; i < nr_secs; i++) - ppas[i] = pblk_trans_map_get(pblk, blba + i); + for (i = 0; i < nr_secs; i++) { + struct ppa_addr ppa; + + ppa = ppas[i] = pblk_trans_map_get(pblk, blba + i); + + /* If the L2P entry maps to a line, the reference is valid */ + if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) { + int line_id = pblk_dev_ppa_to_line(ppa); + struct pblk_line *line = &pblk->lines[line_id]; + + kref_get(&line->ref); + } + } spin_unlock(&pblk->trans_lock); } diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 4d719782f65b..34527646c01b 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -271,15 +271,22 @@ static int pblk_core_init(struct pblk *pblk) if (!pblk->bb_wq) goto free_close_wq; - if (pblk_set_ppaf(pblk)) + pblk->r_end_wq = alloc_workqueue("pblk-read-end-wq", + WQ_MEM_RECLAIM | WQ_UNBOUND, 0); + if (!pblk->r_end_wq) goto free_bb_wq; + if (pblk_set_ppaf(pblk)) + goto free_r_end_wq; + if (pblk_rwb_init(pblk)) - goto free_bb_wq; + goto free_r_end_wq; INIT_LIST_HEAD(&pblk->compl_list); return 0; +free_r_end_wq: + destroy_workqueue(pblk->r_end_wq); free_bb_wq: destroy_workqueue(pblk->bb_wq); free_close_wq: @@ -304,6 +311,9 @@ static void pblk_core_free(struct pblk *pblk) if (pblk->close_wq) destroy_workqueue(pblk->close_wq); + if (pblk->r_end_wq) + destroy_workqueue(pblk->r_end_wq); + if (pblk->bb_wq) destroy_workqueue(pblk->bb_wq); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index a465d9980df4..402f8eff6a2e 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -130,9 +130,34 @@ static void pblk_read_check(struct pblk *pblk, struct nvm_rq *rqd, } } -static void pblk_end_io_read(struct nvm_rq *rqd) +static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct ppa_addr *ppa_list; + int i; + + ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; + + for (i = 0; i < rqd->nr_ppas; i++) { + struct ppa_addr ppa = ppa_list[i]; + struct pblk_line *line; + + line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + kref_put(&line->ref, pblk_line_put_wq); + } +} + +static void pblk_end_user_read(struct bio *bio) +{ +#ifdef CONFIG_NVM_DEBUG + WARN_ONCE(bio->bi_status, "pblk: corrupted read bio\n"); +#endif + bio_endio(bio); + bio_put(bio); +} + +static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd, + bool put_line) { - struct pblk *pblk = rqd->private; struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd); struct bio *bio = rqd->bio; @@ -146,15 +171,11 @@ static void pblk_end_io_read(struct nvm_rq *rqd) pblk_read_check(pblk, rqd, r_ctx->lba); bio_put(bio); - if (r_ctx->private) { - struct bio *orig_bio = r_ctx->private; + if (r_ctx->private) + pblk_end_user_read((struct bio *)r_ctx->private); -#ifdef CONFIG_NVM_DEBUG - WARN_ONCE(orig_bio->bi_status, "pblk: corrupted read bio\n"); -#endif - bio_endio(orig_bio); - bio_put(orig_bio); - } + if (put_line) + pblk_read_put_rqd_kref(pblk, rqd); #ifdef CONFIG_NVM_DEBUG atomic_long_add(rqd->nr_ppas, &pblk->sync_reads); @@ -165,6 +186,13 @@ static void pblk_end_io_read(struct nvm_rq *rqd) atomic_dec(&pblk->inflight_io); } +static void pblk_end_io_read(struct nvm_rq *rqd) +{ + struct pblk *pblk = rqd->private; + + __pblk_end_io_read(pblk, rqd, true); +} + static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, unsigned int bio_init_idx, unsigned long *read_bitmap) @@ -233,8 +261,12 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, } if (unlikely(nr_secs > 1 && nr_holes == 1)) { + struct ppa_addr ppa; + + ppa = rqd->ppa_addr; rqd->ppa_list = ppa_ptr; rqd->dma_ppa_list = dma_ppa_list; + rqd->ppa_list[0] = ppa; } for (i = 0; i < nr_secs; i++) { @@ -246,6 +278,11 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, i = 0; hole = find_first_zero_bit(read_bitmap, nr_secs); do { + int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]); + struct pblk_line *line = &pblk->lines[line_id]; + + kref_put(&line->ref, pblk_line_put); + meta_list[hole].lba = lba_list_media[i]; src_bv = new_bio->bi_io_vec[i++]; @@ -269,19 +306,17 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, bio_put(new_bio); /* Complete the original bio and associated request */ + bio_endio(bio); rqd->bio = bio; rqd->nr_ppas = nr_secs; - rqd->private = pblk; - bio_endio(bio); - pblk_end_io_read(rqd); + __pblk_end_io_read(pblk, rqd, false); return NVM_IO_OK; err: /* Free allocated pages in new bio */ pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt); - rqd->private = pblk; - pblk_end_io_read(rqd); + __pblk_end_io_read(pblk, rqd, false); return NVM_IO_ERR; } @@ -314,11 +349,11 @@ retry: goto retry; } + WARN_ON(test_and_set_bit(0, read_bitmap)); meta_list[0].lba = cpu_to_le64(lba); - WARN_ON(test_and_set_bit(0, read_bitmap)); #ifdef CONFIG_NVM_DEBUG - atomic_long_inc(&pblk->cache_reads); + atomic_long_inc(&pblk->cache_reads); #endif } else { rqd->ppa_addr = ppa; @@ -383,7 +418,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio) if (bitmap_full(&read_bitmap, nr_secs)) { bio_endio(bio); atomic_inc(&pblk->inflight_io); - pblk_end_io_read(rqd); + __pblk_end_io_read(pblk, rqd, false); return NVM_IO_OK; } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 4a51e6d4d036..e4704373398b 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -636,6 +636,7 @@ struct pblk { struct workqueue_struct *close_wq; struct workqueue_struct *bb_wq; + struct workqueue_struct *r_end_wq; struct timer_list wtimer; @@ -741,6 +742,7 @@ int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line, void *emeta_buf); int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr erase_ppa); void pblk_line_put(struct kref *ref); +void pblk_line_put_wq(struct kref *ref); struct list_head *pblk_line_gc_list(struct pblk *pblk, struct pblk_line *line); u64 pblk_lookup_page(struct pblk *pblk, struct pblk_line *line); void pblk_dealloc_page(struct pblk *pblk, struct pblk_line *line, int nr_secs); -- cgit v1.2.1 From 0f9248cf1e22333b2a0458540aafb1ad3b2b3337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:24 +0200 Subject: lightnvm: pblk: remove redundant check on read path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A partial read I/O in pblk is an I/O where some sectors reside in the write buffer in main memory and some are persisted on the device. Such an I/O must at least contain 2 lbas, therefore checking for the case where a single lba is mapped is not necessary. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-read.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 402f8eff6a2e..71c58503f1a4 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -235,7 +235,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->end_io = pblk_end_io_sync; rqd->private = &wait; - if (unlikely(nr_secs > 1 && nr_holes == 1)) { + if (unlikely(nr_holes == 1)) { ppa_ptr = rqd->ppa_list; dma_ppa_list = rqd->dma_ppa_list; rqd->ppa_addr = rqd->ppa_list[0]; @@ -260,7 +260,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, #endif } - if (unlikely(nr_secs > 1 && nr_holes == 1)) { + if (unlikely(nr_holes == 1)) { struct ppa_addr ppa; ppa = rqd->ppa_addr; -- cgit v1.2.1 From 1e82123da6a4c6019ef03bcd47e4b3dc18dd136e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:25 +0200 Subject: lightnvm: pblk: remove I/O dependency on write path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk schedules user I/O, metadata I/O and erases on the write path in order to minimize collisions at the media level. Until now, there has been a dependency between user and metadata I/Os that could lead to a deadlock as both take the per-LUN semaphore to schedule submission. This path removes this dependency and guarantees forward progress at a per I/O granurality. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-write.c | 145 +++++++++++++++++++----------------------- 1 file changed, 65 insertions(+), 80 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c index f2e846fe9242..6c1cafafef53 100644 --- a/drivers/lightnvm/pblk-write.c +++ b/drivers/lightnvm/pblk-write.c @@ -220,15 +220,16 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd, } static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd, - struct pblk_c_ctx *c_ctx, struct ppa_addr *erase_ppa) + struct ppa_addr *erase_ppa) { struct pblk_line_meta *lm = &pblk->lm; struct pblk_line *e_line = pblk_line_get_erase(pblk); + struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); unsigned int valid = c_ctx->nr_valid; unsigned int padded = c_ctx->nr_padded; unsigned int nr_secs = valid + padded; unsigned long *lun_bitmap; - int ret = 0; + int ret; lun_bitmap = kzalloc(lm->lun_bitmap_len, GFP_KERNEL); if (!lun_bitmap) @@ -294,55 +295,6 @@ static int pblk_calc_secs_to_sync(struct pblk *pblk, unsigned int secs_avail, return secs_to_sync; } -static inline int pblk_valid_meta_ppa(struct pblk *pblk, - struct pblk_line *meta_line, - struct ppa_addr *ppa_list, int nr_ppas) -{ - struct nvm_tgt_dev *dev = pblk->dev; - struct nvm_geo *geo = &dev->geo; - struct pblk_line *data_line; - struct ppa_addr ppa, ppa_opt; - u64 paddr; - int i; - - data_line = &pblk->lines[pblk_dev_ppa_to_line(ppa_list[0])]; - paddr = pblk_lookup_page(pblk, meta_line); - ppa = addr_to_gen_ppa(pblk, paddr, 0); - - if (test_bit(pblk_ppa_to_pos(geo, ppa), data_line->blk_bitmap)) - return 1; - - /* Schedule a metadata I/O that is half the distance from the data I/O - * with regards to the number of LUNs forming the pblk instance. This - * balances LUN conflicts across every I/O. - * - * When the LUN configuration changes (e.g., due to GC), this distance - * can align, which would result on a LUN deadlock. In this case, modify - * the distance to not be optimal, but allow metadata I/Os to succeed. - */ - ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); - if (unlikely(ppa_opt.ppa == ppa.ppa)) { - data_line->meta_distance--; - return 0; - } - - for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) - if (ppa_list[i].g.ch == ppa_opt.g.ch && - ppa_list[i].g.lun == ppa_opt.g.lun) - return 1; - - if (test_bit(pblk_ppa_to_pos(geo, ppa_opt), data_line->blk_bitmap)) { - for (i = 0; i < nr_ppas; i += pblk->min_write_pgs) - if (ppa_list[i].g.ch == ppa.g.ch && - ppa_list[i].g.lun == ppa.g.lun) - return 0; - - return 1; - } - - return 0; -} - int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line) { struct nvm_tgt_dev *dev = pblk->dev; @@ -421,8 +373,44 @@ fail_free_rqd: return ret; } -static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, - int prev_n) +static inline bool pblk_valid_meta_ppa(struct pblk *pblk, + struct pblk_line *meta_line, + struct nvm_rq *data_rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct nvm_geo *geo = &dev->geo; + struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd); + struct pblk_line *data_line = pblk_line_get_data(pblk); + struct ppa_addr ppa, ppa_opt; + u64 paddr; + int pos_opt; + + /* Schedule a metadata I/O that is half the distance from the data I/O + * with regards to the number of LUNs forming the pblk instance. This + * balances LUN conflicts across every I/O. + * + * When the LUN configuration changes (e.g., due to GC), this distance + * can align, which would result on metadata and data I/Os colliding. In + * this case, modify the distance to not be optimal, but move the + * optimal in the right direction. + */ + paddr = pblk_lookup_page(pblk, meta_line); + ppa = addr_to_gen_ppa(pblk, paddr, 0); + ppa_opt = addr_to_gen_ppa(pblk, paddr + data_line->meta_distance, 0); + pos_opt = pblk_ppa_to_pos(geo, ppa_opt); + + if (test_bit(pos_opt, data_c_ctx->lun_bitmap) || + test_bit(pos_opt, data_line->blk_bitmap)) + return true; + + if (unlikely(pblk_ppa_comp(ppa_opt, ppa))) + data_line->meta_distance--; + + return false; +} + +static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk, + struct nvm_rq *data_rqd) { struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -432,57 +420,45 @@ static int pblk_sched_meta_io(struct pblk *pblk, struct ppa_addr *prev_list, retry: if (list_empty(&l_mg->emeta_list)) { spin_unlock(&l_mg->close_lock); - return 0; + return NULL; } meta_line = list_first_entry(&l_mg->emeta_list, struct pblk_line, list); if (meta_line->emeta->mem >= lm->emeta_len[0]) goto retry; spin_unlock(&l_mg->close_lock); - if (!pblk_valid_meta_ppa(pblk, meta_line, prev_list, prev_n)) - return 0; + if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd)) + return NULL; - return pblk_submit_meta_io(pblk, meta_line); + return meta_line; } static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) { - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd); struct ppa_addr erase_ppa; + struct pblk_line *meta_line; int err; ppa_set_empty(&erase_ppa); /* Assign lbas to ppas and populate request structure */ - err = pblk_setup_w_rq(pblk, rqd, c_ctx, &erase_ppa); + err = pblk_setup_w_rq(pblk, rqd, &erase_ppa); if (err) { pr_err("pblk: could not setup write request: %d\n", err); return NVM_IO_ERR; } - if (likely(ppa_empty(erase_ppa))) { - /* Submit metadata write for previous data line */ - err = pblk_sched_meta_io(pblk, rqd->ppa_list, rqd->nr_ppas); - if (err) { - pr_err("pblk: metadata I/O submission failed: %d", err); - return NVM_IO_ERR; - } + meta_line = pblk_should_submit_meta_io(pblk, rqd); - /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd); - if (err) { - pr_err("pblk: data I/O submission failed: %d\n", err); - return NVM_IO_ERR; - } - } else { - /* Submit data write for current data line */ - err = pblk_submit_io(pblk, rqd); - if (err) { - pr_err("pblk: data I/O submission failed: %d\n", err); - return NVM_IO_ERR; - } + /* Submit data write for current data line */ + err = pblk_submit_io(pblk, rqd); + if (err) { + pr_err("pblk: data I/O submission failed: %d\n", err); + return NVM_IO_ERR; + } - /* Submit available erase for next data line */ + if (!ppa_empty(erase_ppa)) { + /* Submit erase for next data line */ if (pblk_blk_erase_async(pblk, erase_ppa)) { struct pblk_line *e_line = pblk_line_get_erase(pblk); struct nvm_tgt_dev *dev = pblk->dev; @@ -495,6 +471,15 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd) } } + if (meta_line) { + /* Submit metadata write for previous data line */ + err = pblk_submit_meta_io(pblk, meta_line); + if (err) { + pr_err("pblk: metadata I/O submission failed: %d", err); + return NVM_IO_ERR; + } + } + return NVM_IO_OK; } -- cgit v1.2.1 From 21d2287119e843929c29fb1adbd271bde1fac7ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:26 +0200 Subject: lightnvm: pblk: enable 1 LUN configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Metadata I/Os are scheduled to minimize their impact on user data I/Os. When there are enough LUNs instantiated (i.e., enough bandwidth), it is easy to interleave metadata and data one after the other so that metadata I/Os are the ones being blocked and not vice-versa. We do this by calculating the distance between the I/Os in terms of the LUNs that are not in used, and selecting a free LUN that satisfies a the simple heuristic that metadata is scheduled behind. The per-LUN semaphores guarantee consistency. This works fine on >1 LUN configuration. However, when a single LUN is instantiated, this design leads to a deadlock, where metadata waits to be scheduled on a free LUN. This patch implements the 1 LUN case by simply scheduling the metadada I/O after the data I/O. In the process, we refactor the way a line is replaced to ensure that metadata writes are submitted after data writes in order to guarantee block sequentiality. Note that, since there is only one LUN, both I/Os will block each other by design. However, such configuration only pursues tight read latencies, not write bandwidth. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 17 ++++++++++------- drivers/lightnvm/pblk-init.c | 8 ++++++-- drivers/lightnvm/pblk-map.c | 21 ++++++++++++--------- drivers/lightnvm/pblk.h | 2 +- 4 files changed, 29 insertions(+), 19 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 0a41fb998d55..e38e91897246 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1372,17 +1372,17 @@ void pblk_pipeline_stop(struct pblk *pblk) spin_unlock(&l_mg->free_lock); } -void pblk_line_replace_data(struct pblk *pblk) +struct pblk_line *pblk_line_replace_data(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line *cur, *new; + struct pblk_line *cur, *new = NULL; unsigned int left_seblks; int is_next = 0; cur = l_mg->data_line; new = l_mg->data_next; if (!new) - return; + goto out; l_mg->data_line = new; spin_lock(&l_mg->free_lock); @@ -1390,7 +1390,7 @@ void pblk_line_replace_data(struct pblk *pblk) l_mg->data_line = NULL; l_mg->data_next = NULL; spin_unlock(&l_mg->free_lock); - return; + goto out; } pblk_line_setup_metadata(new, l_mg, &pblk->lm); @@ -1402,7 +1402,7 @@ retry_erase: /* If line is not fully erased, erase it */ if (atomic_read(&new->left_eblks)) { if (pblk_line_erase(pblk, new)) - return; + goto out; } else { io_schedule(); } @@ -1413,7 +1413,7 @@ retry_setup: if (!pblk_line_init_metadata(pblk, new, cur)) { new = pblk_line_retry(pblk, new); if (!new) - return; + goto out; goto retry_setup; } @@ -1421,7 +1421,7 @@ retry_setup: if (!pblk_line_init_bb(pblk, new, 1)) { new = pblk_line_retry(pblk, new); if (!new) - return; + goto out; goto retry_setup; } @@ -1445,6 +1445,9 @@ retry_setup: if (is_next) pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next); + +out: + return new; } void pblk_line_free(struct pblk *pblk, struct pblk_line *line) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 34527646c01b..3c3749186053 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -711,8 +711,12 @@ add_emeta_page: } lm->emeta_bb = geo->nr_luns - i; - lm->min_blk_line = 1 + DIV_ROUND_UP(lm->smeta_sec + lm->emeta_sec[0], - geo->sec_per_blk); + + lm->min_blk_line = 1; + if (geo->nr_luns > 1) + lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec + + lm->emeta_sec[0], geo->sec_per_blk); + if (lm->min_blk_line > lm->blk_per_line) { pr_err("pblk: config. not supported. Min. LUN in line:%d\n", lm->blk_per_line); diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index fddb924f6dde..3bc4c94f9cf2 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -25,13 +25,23 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, unsigned int valid_secs) { struct pblk_line *line = pblk_line_get_data(pblk); - struct pblk_emeta *emeta = line->emeta; + struct pblk_emeta *emeta; struct pblk_w_ctx *w_ctx; - __le64 *lba_list = emeta_to_lbas(pblk, emeta->buf); + __le64 *lba_list; u64 paddr; int nr_secs = pblk->min_write_pgs; int i; + if (pblk_line_is_full(line)) { + struct pblk_line *prev_line = line; + + line = pblk_line_replace_data(pblk); + pblk_line_close_meta(pblk, prev_line); + } + + emeta = line->emeta; + lba_list = emeta_to_lbas(pblk, emeta->buf); + paddr = pblk_alloc_page(pblk, line, nr_secs); for (i = 0; i < nr_secs; i++, paddr++) { @@ -60,13 +70,6 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, } } - if (pblk_line_is_full(line)) { - struct pblk_line *prev_line = line; - - pblk_line_replace_data(pblk); - pblk_line_close_meta(pblk, prev_line); - } - pblk_down_rq(pblk, ppa_list, nr_secs, lun_bitmap); } diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index e4704373398b..191b1ec0627b 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -719,7 +719,7 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, int alloc_type, gfp_t gfp_mask); struct pblk_line *pblk_line_get(struct pblk *pblk); struct pblk_line *pblk_line_get_first_data(struct pblk *pblk); -void pblk_line_replace_data(struct pblk *pblk); +struct pblk_line *pblk_line_replace_data(struct pblk *pblk); int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line); void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line); struct pblk_line *pblk_line_get_data(struct pblk *pblk); -- cgit v1.2.1 From e6b754c252bacebdfbe3c57e790431ab8f445d1f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:27 +0200 Subject: lightnvm: pblk: ensure right bad block calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure that the variable controlling block threshold for allocating extra metadata sectors in case of a line with bad blocks does not get a negative value. Otherwise, the line will be marked as corrupted and wasted. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 3c3749186053..c7239c41ba40 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -710,7 +710,7 @@ add_emeta_page: goto add_emeta_page; } - lm->emeta_bb = geo->nr_luns - i; + lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0; lm->min_blk_line = 1; if (geo->nr_luns > 1) -- cgit v1.2.1 From 27b978725d895e704aab44b99242a0514485d798 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:28 +0200 Subject: lightnvm: pblk: fix changing GC group list for a line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pblk_line_gc_list seems to had a bug since the introduction of pblk in getting GC list for a line. In b20ba1bc7 while redesigning the GC algorithm, the naming for the GC thresholds was altered, but the values for high_thrs and mid_thrs were not. The result is that when moving to the GC lists, the mid threshold is never evaluated. Fixes: a4bd217b4("lightnvm: physical block device (pblk) target") Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index c7239c41ba40..56ece7dfac0e 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -678,8 +678,8 @@ static int pblk_lines_init(struct pblk *pblk) lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long); lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long); - lm->high_thrs = lm->sec_per_line / 2; - lm->mid_thrs = lm->sec_per_line / 4; + lm->mid_thrs = lm->sec_per_line / 2; + lm->high_thrs = lm->sec_per_line / 4; lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs; /* Calculate necessary pages for smeta. See comment over struct -- cgit v1.2.1 From e480689bd1cc35f6ed3fa628bc8d913177b0726a Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:29 +0200 Subject: lightnvm: pblk: remove useless line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index e38e91897246..43866ad87586 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1071,7 +1071,6 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line, /* Mark emeta metadata sectors as bad sectors. We need to consider bad * blocks to make sure that there are enough sectors to store emeta */ - bit = lm->sec_per_line; off = lm->sec_per_line - lm->emeta_sec[0]; bitmap_set(line->invalid_bitmap, off, lm->emeta_sec[0]); while (nr_bb) { -- cgit v1.2.1 From ef56b9ce562753cacf518f081a4ff3227efdab25 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:30 +0200 Subject: lightnvm: remove unused argument from nvm_set_tgt_bb_tbl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vblk isn't being used anyway and if we ever have a usecase we can introduce this again. This makes the logic easier and removes unnecessary checks. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 798964f511cd..231c92899431 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -633,7 +633,7 @@ int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, memset(&rqd, 0, sizeof(struct nvm_rq)); - nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); + nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); nvm_rq_tgt_to_dev(tgt_dev, &rqd); ret = dev->ops->set_bb_tbl(dev, &rqd.ppa_addr, rqd.nr_ppas, type); @@ -697,7 +697,7 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, rqd.private = &wait; rqd.flags = geo->plane_mode >> 1; - ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas, 1); + ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); if (ret) return ret; @@ -793,14 +793,14 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) EXPORT_SYMBOL(nvm_put_area); int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas, int vblk) + const struct ppa_addr *ppas, int nr_ppas) { struct nvm_dev *dev = tgt_dev->parent; struct nvm_geo *geo = &tgt_dev->geo; int i, plane_cnt, pl_idx; struct ppa_addr ppa; - if ((!vblk || geo->plane_mode == NVM_PLANE_SINGLE) && nr_ppas == 1) { + if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { rqd->nr_ppas = nr_ppas; rqd->ppa_addr = ppas[0]; @@ -814,19 +814,14 @@ int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, return -ENOMEM; } - if (!vblk) { - for (i = 0; i < nr_ppas; i++) - rqd->ppa_list[i] = ppas[i]; - } else { - plane_cnt = geo->plane_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } + plane_cnt = geo->plane_mode; + rqd->nr_ppas *= plane_cnt; + + for (i = 0; i < nr_ppas; i++) { + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; } } -- cgit v1.2.1 From eb6f168f97438bf1cac8b9b1301c662eace9e39f Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:31 +0200 Subject: lightnvm: remove stale extern and unused exported symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Not all exported symbols are being used outside core and there were some stale entries in lightnvm.h Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 129 ++++++++++++++++++++++++------------------------ 1 file changed, 64 insertions(+), 65 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 231c92899431..0e5f77234c79 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -226,6 +226,24 @@ static const struct block_device_operations nvm_fops = { .owner = THIS_MODULE, }; +static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) +{ + struct nvm_tgt_type *tmp, *tt = NULL; + + if (lock) + down_write(&nvm_tgtt_lock); + + list_for_each_entry(tmp, &nvm_tgt_types, list) + if (!strcmp(name, tmp->name)) { + tt = tmp; + break; + } + + if (lock) + up_write(&nvm_tgtt_lock); + return tt; +} + static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create) { struct nvm_ioctl_create_simple *s = &create->conf.s; @@ -549,25 +567,6 @@ void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries, } EXPORT_SYMBOL(nvm_part_to_tgt); -struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock) -{ - struct nvm_tgt_type *tmp, *tt = NULL; - - if (lock) - down_write(&nvm_tgtt_lock); - - list_for_each_entry(tmp, &nvm_tgt_types, list) - if (!strcmp(name, tmp->name)) { - tt = tmp; - break; - } - - if (lock) - up_write(&nvm_tgtt_lock); - return tt; -} -EXPORT_SYMBOL(nvm_find_target_type); - int nvm_register_tgt_type(struct nvm_tgt_type *tt) { int ret = 0; @@ -619,6 +618,52 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } +static int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, + const struct ppa_addr *ppas, int nr_ppas) +{ + struct nvm_dev *dev = tgt_dev->parent; + struct nvm_geo *geo = &tgt_dev->geo; + int i, plane_cnt, pl_idx; + struct ppa_addr ppa; + + if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { + rqd->nr_ppas = nr_ppas; + rqd->ppa_addr = ppas[0]; + + return 0; + } + + rqd->nr_ppas = nr_ppas; + rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); + if (!rqd->ppa_list) { + pr_err("nvm: failed to allocate dma memory\n"); + return -ENOMEM; + } + + plane_cnt = geo->plane_mode; + rqd->nr_ppas *= plane_cnt; + + for (i = 0; i < nr_ppas; i++) { + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + ppa = ppas[i]; + ppa.g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; + } + } + + return 0; +} + +static void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, + struct nvm_rq *rqd) +{ + if (!rqd->ppa_list) + return; + + nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); +} + + int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas, int type) { @@ -792,52 +837,6 @@ void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin) } EXPORT_SYMBOL(nvm_put_area); -int nvm_set_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd, - const struct ppa_addr *ppas, int nr_ppas) -{ - struct nvm_dev *dev = tgt_dev->parent; - struct nvm_geo *geo = &tgt_dev->geo; - int i, plane_cnt, pl_idx; - struct ppa_addr ppa; - - if (geo->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { - rqd->nr_ppas = nr_ppas; - rqd->ppa_addr = ppas[0]; - - return 0; - } - - rqd->nr_ppas = nr_ppas; - rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); - if (!rqd->ppa_list) { - pr_err("nvm: failed to allocate dma memory\n"); - return -ENOMEM; - } - - plane_cnt = geo->plane_mode; - rqd->nr_ppas *= plane_cnt; - - for (i = 0; i < nr_ppas; i++) { - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - ppa = ppas[i]; - ppa.g.pl = pl_idx; - rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppa; - } - } - - return 0; -} -EXPORT_SYMBOL(nvm_set_rqd_ppalist); - -void nvm_free_rqd_ppalist(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) -{ - if (!rqd->ppa_list) - return; - - nvm_dev_dma_free(tgt_dev->parent, rqd->ppa_list, rqd->dma_ppa_list); -} -EXPORT_SYMBOL(nvm_free_rqd_ppalist); - void nvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_dev *tgt_dev = rqd->dev; -- cgit v1.2.1 From 05ed3447698203219319ec9d1c46303aff5932a2 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:32 +0200 Subject: lightnvm: pblk: reduce arguments in __pblk_rb_update_l2p MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already pass the structure pointer so no need to pass the member. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-rb.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c index 1173e2380137..b8f78e401482 100644 --- a/drivers/lightnvm/pblk-rb.c +++ b/drivers/lightnvm/pblk-rb.c @@ -201,8 +201,7 @@ unsigned int pblk_rb_read_commit(struct pblk_rb *rb, unsigned int nr_entries) return subm; } -static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, - unsigned int to_update) +static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update) { struct pblk *pblk = container_of(rb, struct pblk, rwb); struct pblk_line *line; @@ -213,7 +212,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, int flags; for (i = 0; i < to_update; i++) { - entry = &rb->entries[*l2p_upd]; + entry = &rb->entries[rb->l2p_update]; w_ctx = &entry->w_ctx; flags = READ_ONCE(entry->w_ctx.flags); @@ -230,7 +229,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int *l2p_upd, line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)]; kref_put(&line->ref, pblk_line_put); clean_wctx(w_ctx); - *l2p_upd = (*l2p_upd + 1) & (rb->nr_entries - 1); + rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1); } pblk_rl_out(&pblk->rl, user_io, gc_io); @@ -258,7 +257,7 @@ static int pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int nr_entries, count = nr_entries - space; /* l2p_update used exclusively under rb->w_lock */ - ret = __pblk_rb_update_l2p(rb, &rb->l2p_update, count); + ret = __pblk_rb_update_l2p(rb, count); out: return ret; @@ -280,7 +279,7 @@ void pblk_rb_sync_l2p(struct pblk_rb *rb) sync = smp_load_acquire(&rb->sync); to_update = pblk_rb_ring_count(sync, rb->l2p_update, rb->nr_entries); - __pblk_rb_update_l2p(rb, &rb->l2p_update, to_update); + __pblk_rb_update_l2p(rb, to_update); spin_unlock(&rb->w_lock); } -- cgit v1.2.1 From 22a4e061ea11cc754785a12b3ba5f3e135bc0c63 Mon Sep 17 00:00:00 2001 From: Rakesh Pandit Date: Fri, 13 Oct 2017 14:46:33 +0200 Subject: lightnvm: pblk: fix releases of kmem cache in error path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If pblk_core_init fails lets destroy all global caches. Signed-off-by: Rakesh Pandit Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 56ece7dfac0e..2e599738372d 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -220,6 +220,14 @@ static int pblk_init_global_caches(struct pblk *pblk) return 0; } +static void pblk_free_global_caches(struct pblk *pblk) +{ + kmem_cache_destroy(pblk_ws_cache); + kmem_cache_destroy(pblk_rec_cache); + kmem_cache_destroy(pblk_g_rq_cache); + kmem_cache_destroy(pblk_w_rq_cache); +} + static int pblk_core_init(struct pblk *pblk) { struct nvm_tgt_dev *dev = pblk->dev; @@ -235,7 +243,7 @@ static int pblk_core_init(struct pblk *pblk) pblk->page_bio_pool = mempool_create_page_pool(nvm_max_phys_sects(dev), 0); if (!pblk->page_bio_pool) - return -ENOMEM; + goto free_global_caches; pblk->gen_ws_pool = mempool_create_slab_pool(PBLK_GEN_WS_POOL_SIZE, pblk_ws_cache); @@ -303,6 +311,8 @@ free_gen_ws_pool: mempool_destroy(pblk->gen_ws_pool); free_page_bio_pool: mempool_destroy(pblk->page_bio_pool); +free_global_caches: + pblk_free_global_caches(pblk); return -ENOMEM; } @@ -324,10 +334,7 @@ static void pblk_core_free(struct pblk *pblk) mempool_destroy(pblk->e_rq_pool); mempool_destroy(pblk->w_rq_pool); - kmem_cache_destroy(pblk_ws_cache); - kmem_cache_destroy(pblk_rec_cache); - kmem_cache_destroy(pblk_g_rq_cache); - kmem_cache_destroy(pblk_w_rq_cache); + pblk_free_global_caches(pblk); } static void pblk_luns_free(struct pblk *pblk) -- cgit v1.2.1 From 3e3a5b8ebd5d3b1d68facc58b0674a2564653222 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:34 +0200 Subject: lightnvm: pblk: prevent gc kicks when gc is not operational MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GC can be kicked after it has been shut down when closing the last line during exit, resulting in accesses to freed structures. Make sure that GC is not triggered while it is not operational. Also make sure that GC won't be re-activated during exit when running on another processor by using timer_del_sync. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 9 +++++---- drivers/lightnvm/pblk-init.c | 1 + 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 7b103bce58bf..81efac18ff57 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -478,10 +478,10 @@ void pblk_gc_should_start(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - if (gc->gc_enabled && !gc->gc_active) + if (gc->gc_enabled && !gc->gc_active) { pblk_gc_start(pblk); - - pblk_gc_kick(pblk); + pblk_gc_kick(pblk); + } } /* @@ -620,7 +620,8 @@ void pblk_gc_exit(struct pblk *pblk) flush_workqueue(gc->gc_reader_wq); flush_workqueue(gc->gc_line_reader_wq); - del_timer(&gc->gc_timer); + gc->gc_enabled = 0; + del_timer_sync(&gc->gc_timer); pblk_gc_stop(pblk, 1); if (gc->gc_ts) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 2e599738372d..27eb430958ff 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -931,6 +931,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, pblk->dev = dev; pblk->disk = tdisk; pblk->state = PBLK_STATE_RUNNING; + pblk->gc.gc_enabled = 0; spin_lock_init(&pblk->trans_lock); spin_lock_init(&pblk->lock); -- cgit v1.2.1 From 92957091e93931c91fccb7cce456312edeeea36c Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:35 +0200 Subject: lightnvm: pblk: recover partially written lines correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When recovering partially written lines, the valid sector count must be decreased by the number of padded sectors in the line. Update line recovery to take all ADDR_EMPTY(padded) sectors into account. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-recovery.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 686bc17f080f..a080cf888982 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -133,8 +133,8 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) struct pblk_emeta *emeta = line->emeta; struct line_emeta *emeta_buf = emeta->buf; __le64 *lba_list; - int data_start; - int nr_data_lbas, nr_valid_lbas, nr_lbas = 0; + int data_start, data_end; + int nr_valid_lbas, nr_lbas = 0; int i; lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); @@ -142,10 +142,10 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) return 1; data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - nr_data_lbas = lm->sec_per_line - lm->emeta_sec[0]; + data_end = lm->sec_per_line - lm->emeta_sec[0]; nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); - for (i = data_start; i < nr_data_lbas && nr_lbas < nr_valid_lbas; i++) { + for (i = data_start; i < data_end; i++) { struct ppa_addr ppa; int pos; -- cgit v1.2.1 From 37ce33d5756f4ba8bdd45371a1918ceeeba5b158 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:36 +0200 Subject: lightnvm: pblk: free full lines during recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When rebuilding the L2P table, any full lines (lines without any valid sectors) will be identified. If these lines are not freed, we risk not being able to allocate the first data line. This patch refactors the part of GC that frees empty lines into a separate function and adds a call to this after the L2P table has been rebuilt. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 32 ++++++++++++++++++++------------ drivers/lightnvm/pblk-init.c | 3 +++ drivers/lightnvm/pblk.h | 1 + 3 files changed, 24 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 81efac18ff57..374089fe4326 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -330,26 +330,16 @@ static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) return ((gc->gc_active) && (nr_blocks_need > nr_blocks_free)); } -/* - * Lines with no valid sectors will be returned to the free list immediately. If - * GC is activated - either because the free block count is under the determined - * threshold, or because it is being forced from user space - only lines with a - * high count of invalid sectors will be recycled. - */ -static void pblk_gc_run(struct pblk *pblk) +void pblk_gc_free_full_lines(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_gc *gc = &pblk->gc; struct pblk_line *line; - struct list_head *group_list; - bool run_gc; - int inflight_gc, gc_group = 0, prev_group = 0; do { spin_lock(&l_mg->gc_lock); if (list_empty(&l_mg->gc_full_list)) { spin_unlock(&l_mg->gc_lock); - break; + return; } line = list_first_entry(&l_mg->gc_full_list, @@ -365,6 +355,24 @@ static void pblk_gc_run(struct pblk *pblk) kref_put(&line->ref, pblk_line_put); } while (1); +} + +/* + * Lines with no valid sectors will be returned to the free list immediately. If + * GC is activated - either because the free block count is under the determined + * threshold, or because it is being forced from user space - only lines with a + * high count of invalid sectors will be recycled. + */ +static void pblk_gc_run(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; + struct pblk_line *line; + struct list_head *group_list; + bool run_gc; + int inflight_gc, gc_group = 0, prev_group = 0; + + pblk_gc_free_full_lines(pblk); run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD)) diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 27eb430958ff..f08fa2083fbc 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -508,6 +508,9 @@ static int pblk_lines_configure(struct pblk *pblk, int flags) } } + /* Free full lines directly as GC has not been started yet */ + pblk_gc_free_full_lines(pblk); + if (!line) { /* Configure next line for user data */ line = pblk_line_get_first_data(pblk); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 191b1ec0627b..21438d1550a2 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -831,6 +831,7 @@ void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); void pblk_gc_kick(struct pblk *pblk); +void pblk_gc_free_full_lines(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); int pblk_gc_sysfs_force(struct pblk *pblk, int force); -- cgit v1.2.1 From 03661b5f756c92b9924869334a2afa19753c4fe7 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:37 +0200 Subject: lightnvm: pblk: start gc if needed during init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Start GC if needed, directly after init, as we might need to garbage collect in order to make room for user writes. Create a helper function that allows to kick GC without exposing the internals of the GC/rate-limiter interaction. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 5 +++++ drivers/lightnvm/pblk-init.c | 4 ++++ drivers/lightnvm/pblk-rl.c | 2 +- drivers/lightnvm/pblk.h | 2 ++ 4 files changed, 12 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 374089fe4326..4bac9e1531f5 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -510,6 +510,11 @@ void pblk_gc_should_stop(struct pblk *pblk) pblk_gc_stop(pblk, 0); } +void pblk_gc_should_kick(struct pblk *pblk) +{ + pblk_rl_update_rates(&pblk->rl); +} + void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active) { diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index f08fa2083fbc..ad9f014a086b 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -1025,6 +1025,10 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk, pblk->rwb.nr_entries); wake_up_process(pblk->writer_ts); + + /* Check if we need to start GC */ + pblk_gc_should_kick(pblk); + return pblk; fail_stop_writer: diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 0896439a91b0..739f855d4216 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -96,7 +96,7 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl) * * Only the total number of free blocks is used to configure the rate limiter. */ -static void pblk_rl_update_rates(struct pblk_rl *rl) +void pblk_rl_update_rates(struct pblk_rl *rl) { struct pblk *pblk = container_of(rl, struct pblk, rl); unsigned long free_blocks = pblk_rl_nr_free_blks(rl); diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 21438d1550a2..29ba7ec32b20 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -830,6 +830,7 @@ int pblk_gc_init(struct pblk *pblk); void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); +void pblk_gc_should_kick(struct pblk *pblk); void pblk_gc_kick(struct pblk *pblk); void pblk_gc_free_full_lines(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, @@ -841,6 +842,7 @@ int pblk_gc_sysfs_force(struct pblk *pblk, int force); */ void pblk_rl_init(struct pblk_rl *rl, int budget); void pblk_rl_free(struct pblk_rl *rl); +void pblk_rl_update_rates(struct pblk_rl *rl); int pblk_rl_high_thrs(struct pblk_rl *rl); int pblk_rl_low_thrs(struct pblk_rl *rl); unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); -- cgit v1.2.1 From 75610cd974aba4fadc9a8500d5470e8f28a3626f Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:38 +0200 Subject: lightnvm: pblk: consider bad sectors in emeta during recovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When recovering lines we need to consider that bad blocks in a line affect the emeta area size. Previously it was assumed that the emeta area would grow by the number of sectors per page * number of bad blocks in the line. This assumption is not correct - the number of "extra" pages that are consumed could be both smaller (depending on emeta size) and bigger (depending on the placement of the bad blocks). Fix this by calculating the emeta start by iterating backwards through the line, skipping ppas that map to bad blocks. Also fix the data types used for ppa indices/counts in pblk_recov_l2p_from_emeta - we should use u64. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-recovery.c | 44 +++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index a080cf888982..9772a947ca4f 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -133,16 +133,16 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) struct pblk_emeta *emeta = line->emeta; struct line_emeta *emeta_buf = emeta->buf; __le64 *lba_list; - int data_start, data_end; - int nr_valid_lbas, nr_lbas = 0; - int i; + u64 data_start, data_end; + u64 nr_valid_lbas, nr_lbas = 0; + u64 i; lba_list = pblk_recov_get_lba_list(pblk, emeta_buf); if (!lba_list) return 1; data_start = pblk_line_smeta_start(pblk, line) + lm->smeta_sec; - data_end = lm->sec_per_line - lm->emeta_sec[0]; + data_end = line->emeta_ssec; nr_valid_lbas = le64_to_cpu(emeta_buf->nr_valid_lbas); for (i = data_start; i < data_end; i++) { @@ -172,8 +172,8 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line) } if (nr_valid_lbas != nr_lbas) - pr_err("pblk: line %d - inconsistent lba list(%llu/%d)\n", - line->id, emeta_buf->nr_valid_lbas, nr_lbas); + pr_err("pblk: line %d - inconsistent lba list(%llu/%llu)\n", + line->id, nr_valid_lbas, nr_lbas); line->left_msecs = 0; @@ -827,10 +827,32 @@ static void pblk_recov_line_add_ordered(struct list_head *head, __list_add(&line->list, t->list.prev, &t->list); } -struct pblk_line *pblk_recov_l2p(struct pblk *pblk) +static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; + struct pblk_line_meta *lm = &pblk->lm; + unsigned int emeta_secs; + u64 emeta_start; + struct ppa_addr ppa; + int pos; + + emeta_secs = lm->emeta_sec[0]; + emeta_start = lm->sec_per_line; + + while (emeta_secs) { + emeta_start--; + ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id); + pos = pblk_ppa_to_pos(geo, ppa); + if (!test_bit(pos, line->blk_bitmap)) + emeta_secs--; + } + + return emeta_start; +} + +struct pblk_line *pblk_recov_l2p(struct pblk *pblk) +{ struct pblk_line_meta *lm = &pblk->lm; struct pblk_line_mgmt *l_mg = &pblk->l_mg; struct pblk_line *line, *tline, *data_line = NULL; @@ -930,15 +952,9 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk) /* Verify closed blocks and recover this portion of L2P table*/ list_for_each_entry_safe(line, tline, &recov_list, list) { - int off, nr_bb; - recovered_lines++; - /* Calculate where emeta starts based on the line bb */ - off = lm->sec_per_line - lm->emeta_sec[0]; - nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line); - off -= nr_bb * geo->sec_per_pl; - line->emeta_ssec = off; + line->emeta_ssec = pblk_line_emeta_start(pblk, line); line->emeta = emeta; memset(line->emeta->buf, 0, lm->emeta_len[0]); -- cgit v1.2.1 From 1edebacf8b736774d2f160512aec721f47e1f5ac Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:39 +0200 Subject: lightnvm: pblk: shut down gc gracefully during exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Shut down the GC workqueues and tasks in the right order. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index 4bac9e1531f5..e00e5a0743e9 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -422,10 +422,15 @@ void pblk_gc_kick(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - wake_up_process(gc->gc_ts); pblk_gc_writer_kick(gc); pblk_gc_reader_kick(gc); - mod_timer(&gc->gc_timer, jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + + /* If we're shutting down GC, let's not start it up again */ + if (gc->gc_enabled) { + wake_up_process(gc->gc_ts); + mod_timer(&gc->gc_timer, + jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + } } static void pblk_gc_timer(unsigned long data) @@ -630,9 +635,6 @@ void pblk_gc_exit(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; - flush_workqueue(gc->gc_reader_wq); - flush_workqueue(gc->gc_line_reader_wq); - gc->gc_enabled = 0; del_timer_sync(&gc->gc_timer); pblk_gc_stop(pblk, 1); @@ -640,15 +642,17 @@ void pblk_gc_exit(struct pblk *pblk) if (gc->gc_ts) kthread_stop(gc->gc_ts); + if (gc->gc_reader_ts) + kthread_stop(gc->gc_reader_ts); + + flush_workqueue(gc->gc_reader_wq); if (gc->gc_reader_wq) destroy_workqueue(gc->gc_reader_wq); + flush_workqueue(gc->gc_line_reader_wq); if (gc->gc_line_reader_wq) destroy_workqueue(gc->gc_line_reader_wq); if (gc->gc_writer_ts) kthread_stop(gc->gc_writer_ts); - - if (gc->gc_reader_ts) - kthread_stop(gc->gc_reader_ts); } -- cgit v1.2.1 From c55861926a78bf129e06bd3372b34225f4968757 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:40 +0200 Subject: lightnvm: pblk: add l2p crc debug printouts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Print the CRC of the logical-to-physical mapping during exit and after recovering the L2P table to facilitate detection of meta data corruption/recovery issues. The CRC printed after recovery should match the CRC printed during the previous exit - if it doesn't this indicates that either the meta data written to the disk is corrupt or recovery failed. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index ad9f014a086b..52c85f4f672d 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -76,6 +76,28 @@ static blk_qc_t pblk_make_rq(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } +static size_t pblk_trans_map_size(struct pblk *pblk) +{ + int entry_size = 8; + + if (pblk->ppaf_bitsize < 32) + entry_size = 4; + + return entry_size * pblk->rl.nr_secs; +} + +#ifdef CONFIG_NVM_DEBUG +static u32 pblk_l2p_crc(struct pblk *pblk) +{ + size_t map_size; + u32 crc = ~(u32)0; + + map_size = pblk_trans_map_size(pblk); + crc = crc32_le(crc, pblk->trans_map, map_size); + return crc; +} +#endif + static void pblk_l2p_free(struct pblk *pblk) { vfree(pblk->trans_map); @@ -85,12 +107,10 @@ static int pblk_l2p_init(struct pblk *pblk) { sector_t i; struct ppa_addr ppa; - int entry_size = 8; - - if (pblk->ppaf_bitsize < 32) - entry_size = 4; + size_t map_size; - pblk->trans_map = vmalloc(entry_size * pblk->rl.nr_secs); + map_size = pblk_trans_map_size(pblk); + pblk->trans_map = vmalloc(map_size); if (!pblk->trans_map) return -ENOMEM; @@ -508,6 +528,10 @@ static int pblk_lines_configure(struct pblk *pblk, int flags) } } +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk init: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + /* Free full lines directly as GC has not been started yet */ pblk_gc_free_full_lines(pblk); @@ -901,6 +925,11 @@ static void pblk_exit(void *private) down_write(&pblk_lock); pblk_gc_exit(pblk); pblk_tear_down(pblk); + +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk exit: L2P CRC: %x\n", pblk_l2p_crc(pblk)); +#endif + pblk_free(pblk); up_write(&pblk_lock); } -- cgit v1.2.1 From d6b992f7ab6279884238d4e2babf100c0879b3d6 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:41 +0200 Subject: lightnvm: pblk: gc all lines in the pipeline before exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finish garbage collect of the lines that are in the gc pipeline before exiting. Ensure that all lines already in in the pipeline goes through, from read to write. Do this by keeping track of how many lines are in the pipeline and waiting for that number to reach zero before exiting the gc reader task. Since we're adding a new gc line counter, change the name of inflight_gc to read_inflight_gc to make the distinction clear. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 3 +++ drivers/lightnvm/pblk-gc.c | 31 ++++++++++++++++++++++++------- drivers/lightnvm/pblk-sysfs.c | 2 +- drivers/lightnvm/pblk.h | 5 ++++- 4 files changed, 32 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 43866ad87586..1cd27e38fc46 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -1465,6 +1465,7 @@ void pblk_line_free(struct pblk *pblk, struct pblk_line *line) static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; spin_lock(&line->lock); WARN_ON(line->state != PBLK_LINESTATE_GC); @@ -1473,6 +1474,8 @@ static void __pblk_line_put(struct pblk *pblk, struct pblk_line *line) pblk_line_free(pblk, line); spin_unlock(&line->lock); + atomic_dec(&gc->pipeline_gc); + spin_lock(&l_mg->free_lock); list_add_tail(&line->list, &l_mg->free_list); l_mg->nr_free_lines++; diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index e00e5a0743e9..e6fae1959e25 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -234,7 +234,7 @@ out: kfree(invalid_bitmap); kref_put(&line->ref, pblk_line_put); - atomic_dec(&gc->inflight_gc); + atomic_dec(&gc->read_inflight_gc); return; @@ -249,7 +249,7 @@ fail_free_ws: pblk_put_line_back(pblk, line); kref_put(&line->ref, pblk_line_put); - atomic_dec(&gc->inflight_gc); + atomic_dec(&gc->read_inflight_gc); pr_err("pblk: Failed to GC line %d\n", line->id); } @@ -268,6 +268,7 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) line_ws->pblk = pblk; line_ws->line = line; + atomic_inc(&gc->pipeline_gc); INIT_WORK(&line_ws->ws, pblk_gc_line_prepare_ws); queue_work(gc->gc_reader_wq, &line_ws->ws); @@ -333,6 +334,7 @@ static bool pblk_gc_should_run(struct pblk_gc *gc, struct pblk_rl *rl) void pblk_gc_free_full_lines(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_gc *gc = &pblk->gc; struct pblk_line *line; do { @@ -353,6 +355,7 @@ void pblk_gc_free_full_lines(struct pblk *pblk) list_del(&line->list); spin_unlock(&l_mg->gc_lock); + atomic_inc(&gc->pipeline_gc); kref_put(&line->ref, pblk_line_put); } while (1); } @@ -370,12 +373,12 @@ static void pblk_gc_run(struct pblk *pblk) struct pblk_line *line; struct list_head *group_list; bool run_gc; - int inflight_gc, gc_group = 0, prev_group = 0; + int read_inflight_gc, gc_group = 0, prev_group = 0; pblk_gc_free_full_lines(pblk); run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || (atomic_read(&gc->inflight_gc) >= PBLK_GC_L_QD)) + if (!run_gc || (atomic_read(&gc->read_inflight_gc) >= PBLK_GC_L_QD)) return; next_gc_group: @@ -402,14 +405,14 @@ next_gc_group: list_add_tail(&line->list, &gc->r_list); spin_unlock(&gc->r_lock); - inflight_gc = atomic_inc_return(&gc->inflight_gc); + read_inflight_gc = atomic_inc_return(&gc->read_inflight_gc); pblk_gc_reader_kick(gc); prev_group = 1; /* No need to queue up more GC lines than we can handle */ run_gc = pblk_gc_should_run(&pblk->gc, &pblk->rl); - if (!run_gc || inflight_gc >= PBLK_GC_L_QD) + if (!run_gc || read_inflight_gc >= PBLK_GC_L_QD) break; } while (1); @@ -470,6 +473,7 @@ static int pblk_gc_writer_ts(void *data) static int pblk_gc_reader_ts(void *data) { struct pblk *pblk = data; + struct pblk_gc *gc = &pblk->gc; while (!kthread_should_stop()) { if (!pblk_gc_read(pblk)) @@ -478,6 +482,18 @@ static int pblk_gc_reader_ts(void *data) io_schedule(); } +#ifdef CONFIG_NVM_DEBUG + pr_info("pblk: flushing gc pipeline, %d lines left\n", + atomic_read(&gc->pipeline_gc)); +#endif + + do { + if (!atomic_read(&gc->pipeline_gc)) + break; + + schedule(); + } while (1); + return 0; } @@ -586,7 +602,8 @@ int pblk_gc_init(struct pblk *pblk) gc->gc_forced = 0; gc->gc_enabled = 1; gc->w_entries = 0; - atomic_set(&gc->inflight_gc, 0); + atomic_set(&gc->read_inflight_gc, 0); + atomic_set(&gc->pipeline_gc, 0); /* Workqueue that reads valid sectors from a line and submit them to the * GC writer to be recycled. diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c index 95fb434e2f01..cd49e8875d4e 100644 --- a/drivers/lightnvm/pblk-sysfs.c +++ b/drivers/lightnvm/pblk-sysfs.c @@ -253,7 +253,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page) sz += snprintf(page + sz, PAGE_SIZE - sz, "GC: full:%d, high:%d, mid:%d, low:%d, empty:%d, queue:%d\n", gc_full, gc_high, gc_mid, gc_low, gc_empty, - atomic_read(&pblk->gc.inflight_gc)); + atomic_read(&pblk->gc.read_inflight_gc)); sz += snprintf(page + sz, PAGE_SIZE - sz, "data (%d) cur:%d, left:%d, vsc:%d, s:%d, map:%d/%d (%d)\n", diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 29ba7ec32b20..c6f8841973a0 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -238,7 +238,10 @@ struct pblk_gc { struct timer_list gc_timer; struct semaphore gc_sem; - atomic_t inflight_gc; + atomic_t read_inflight_gc; /* Number of lines with inflight GC reads */ + atomic_t pipeline_gc; /* Number of lines in the GC pipeline - + * started reads to finished writes + */ int w_entries; struct list_head w_list; -- cgit v1.2.1 From 03e868eb8adb28e34f6e695667d230786bfdb653 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:42 +0200 Subject: lightnvm: pblk: correct valid lba count calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During garbage collect, lbas being written can end up being invalidated. Make sure that this is reflected in the valid lba count. Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-map.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c index 3bc4c94f9cf2..6f3ecde2140f 100644 --- a/drivers/lightnvm/pblk-map.c +++ b/drivers/lightnvm/pblk-map.c @@ -45,6 +45,8 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, paddr = pblk_alloc_page(pblk, line, nr_secs); for (i = 0; i < nr_secs; i++, paddr++) { + __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); + /* ppa to be sent to the device */ ppa_list[i] = addr_to_gen_ppa(pblk, paddr, line->id); @@ -61,10 +63,9 @@ static void pblk_map_page_data(struct pblk *pblk, unsigned int sentry, w_ctx->ppa = ppa_list[i]; meta_list[i].lba = cpu_to_le64(w_ctx->lba); lba_list[paddr] = cpu_to_le64(w_ctx->lba); - line->nr_valid_lbas++; + if (lba_list[paddr] != addr_empty) + line->nr_valid_lbas++; } else { - __le64 addr_empty = cpu_to_le64(ADDR_EMPTY); - lba_list[paddr] = meta_list[i].lba = addr_empty; __pblk_map_invalidate(pblk, line, paddr); } -- cgit v1.2.1 From 28bd109411eaa4c541f2e240d1285c154de4dfb7 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Fri, 13 Oct 2017 14:46:43 +0200 Subject: lightnvm: pblk: remove spinlock when freeing line metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lockdep complains about being in atomic context while freeing line metadata - and rightly so as we take a spinlock and end up calling vfree that might sleep(in pblk_mfree). There is no need for holding the line manager free_lock while freeing line metadata as the pipeline as stopped, so remove the lock. Fixes: 588726d3ec68 ("lightnvm: pblk: fail gracefully on irrec. error") Signed-off-by: Hans Holmberg Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-init.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index 52c85f4f672d..f62112ba5482 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -393,13 +393,11 @@ static void pblk_line_meta_free(struct pblk *pblk) kfree(l_mg->bb_aux); kfree(l_mg->vsc_list); - spin_lock(&l_mg->free_lock); for (i = 0; i < PBLK_DATA_LINES; i++) { kfree(l_mg->sline_meta[i]); pblk_mfree(l_mg->eline_meta[i]->buf, l_mg->emeta_alloc_type); kfree(l_mg->eline_meta[i]); } - spin_unlock(&l_mg->free_lock); kfree(pblk->lines); } -- cgit v1.2.1 From 8bd400204bd500bb2aea7b551f7c33bad2455340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:44 +0200 Subject: lightnvm: pblk: cleanup unused and static functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup up unused and static functions across the whole codebase. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-core.c | 133 ++++++++++++++++++++----------------------- drivers/lightnvm/pblk-gc.c | 40 ++++++------- drivers/lightnvm/pblk-rl.c | 10 ---- drivers/lightnvm/pblk.h | 14 ++--- 4 files changed, 86 insertions(+), 111 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 1cd27e38fc46..4199119a0754 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -18,6 +18,31 @@ #include "pblk.h" +static void pblk_line_mark_bb(struct work_struct *work) +{ + struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, + ws); + struct pblk *pblk = line_ws->pblk; + struct nvm_tgt_dev *dev = pblk->dev; + struct ppa_addr *ppa = line_ws->priv; + int ret; + + ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); + if (ret) { + struct pblk_line *line; + int pos; + + line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; + pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); + + pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", + line->id, pos); + } + + kfree(ppa); + mempool_free(line_ws, pblk->gen_ws_pool); +} + static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line, struct ppa_addr *ppa) { @@ -268,7 +293,7 @@ void pblk_end_io_sync(struct nvm_rq *rqd) complete(waiting); } -void pblk_wait_for_meta(struct pblk *pblk) +static void pblk_wait_for_meta(struct pblk *pblk) { do { if (!atomic_read(&pblk->inflight_io)) @@ -345,17 +370,6 @@ void pblk_discard(struct pblk *pblk, struct bio *bio) pblk_invalidate_range(pblk, slba, nr_secs); } -struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba) -{ - struct ppa_addr ppa; - - spin_lock(&pblk->trans_lock); - ppa = pblk_trans_map_get(pblk, lba); - spin_unlock(&pblk->trans_lock); - - return ppa; -} - void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd) { atomic_long_inc(&pblk->write_failed); @@ -1338,6 +1352,41 @@ static void pblk_stop_writes(struct pblk *pblk, struct pblk_line *line) pblk->state = PBLK_STATE_STOPPING; } +static void pblk_line_close_meta_sync(struct pblk *pblk) +{ + struct pblk_line_mgmt *l_mg = &pblk->l_mg; + struct pblk_line_meta *lm = &pblk->lm; + struct pblk_line *line, *tline; + LIST_HEAD(list); + + spin_lock(&l_mg->close_lock); + if (list_empty(&l_mg->emeta_list)) { + spin_unlock(&l_mg->close_lock); + return; + } + + list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); + spin_unlock(&l_mg->close_lock); + + list_for_each_entry_safe(line, tline, &list, list) { + struct pblk_emeta *emeta = line->emeta; + + while (emeta->mem < lm->emeta_len[0]) { + int ret; + + ret = pblk_submit_meta_io(pblk, line); + if (ret) { + pr_err("pblk: sync meta line %d failed (%d)\n", + line->id, ret); + return; + } + } + } + + pblk_wait_for_meta(pblk); + flush_workqueue(pblk->close_wq); +} + void pblk_pipeline_stop(struct pblk *pblk) { struct pblk_line_mgmt *l_mg = &pblk->l_mg; @@ -1565,41 +1614,6 @@ int pblk_line_is_full(struct pblk_line *line) return (line->left_msecs == 0); } -void pblk_line_close_meta_sync(struct pblk *pblk) -{ - struct pblk_line_mgmt *l_mg = &pblk->l_mg; - struct pblk_line_meta *lm = &pblk->lm; - struct pblk_line *line, *tline; - LIST_HEAD(list); - - spin_lock(&l_mg->close_lock); - if (list_empty(&l_mg->emeta_list)) { - spin_unlock(&l_mg->close_lock); - return; - } - - list_cut_position(&list, &l_mg->emeta_list, l_mg->emeta_list.prev); - spin_unlock(&l_mg->close_lock); - - list_for_each_entry_safe(line, tline, &list, list) { - struct pblk_emeta *emeta = line->emeta; - - while (emeta->mem < lm->emeta_len[0]) { - int ret; - - ret = pblk_submit_meta_io(pblk, line); - if (ret) { - pr_err("pblk: sync meta line %d failed (%d)\n", - line->id, ret); - return; - } - } - } - - pblk_wait_for_meta(pblk); - flush_workqueue(pblk->close_wq); -} - static void pblk_line_should_sync_meta(struct pblk *pblk) { if (pblk_rl_is_limit(&pblk->rl)) @@ -1673,31 +1687,6 @@ void pblk_line_close_ws(struct work_struct *work) mempool_free(line_ws, pblk->gen_ws_pool); } -void pblk_line_mark_bb(struct work_struct *work) -{ - struct pblk_line_ws *line_ws = container_of(work, struct pblk_line_ws, - ws); - struct pblk *pblk = line_ws->pblk; - struct nvm_tgt_dev *dev = pblk->dev; - struct ppa_addr *ppa = line_ws->priv; - int ret; - - ret = nvm_set_tgt_bb_tbl(dev, ppa, 1, NVM_BLK_T_GRWN_BAD); - if (ret) { - struct pblk_line *line; - int pos; - - line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)]; - pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa); - - pr_err("pblk: failed to mark bb, line:%d, pos:%d\n", - line->id, pos); - } - - kfree(ppa); - mempool_free(line_ws, pblk->gen_ws_pool); -} - void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, void (*work)(struct work_struct *), gfp_t gfp_mask, struct workqueue_struct *wq) diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index e6fae1959e25..b8323e34d1bc 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -275,6 +275,26 @@ static int pblk_gc_line(struct pblk *pblk, struct pblk_line *line) return 0; } +static void pblk_gc_reader_kick(struct pblk_gc *gc) +{ + wake_up_process(gc->gc_reader_ts); +} + +static void pblk_gc_kick(struct pblk *pblk) +{ + struct pblk_gc *gc = &pblk->gc; + + pblk_gc_writer_kick(gc); + pblk_gc_reader_kick(gc); + + /* If we're shutting down GC, let's not start it up again */ + if (gc->gc_enabled) { + wake_up_process(gc->gc_ts); + mod_timer(&gc->gc_timer, + jiffies + msecs_to_jiffies(GC_TIME_MSECS)); + } +} + static int pblk_gc_read(struct pblk *pblk) { struct pblk_gc *gc = &pblk->gc; @@ -298,11 +318,6 @@ static int pblk_gc_read(struct pblk *pblk) return 0; } -static void pblk_gc_reader_kick(struct pblk_gc *gc) -{ - wake_up_process(gc->gc_reader_ts); -} - static struct pblk_line *pblk_gc_get_victim_line(struct pblk *pblk, struct list_head *group_list) { @@ -421,21 +436,6 @@ next_gc_group: goto next_gc_group; } -void pblk_gc_kick(struct pblk *pblk) -{ - struct pblk_gc *gc = &pblk->gc; - - pblk_gc_writer_kick(gc); - pblk_gc_reader_kick(gc); - - /* If we're shutting down GC, let's not start it up again */ - if (gc->gc_enabled) { - wake_up_process(gc->gc_ts); - mod_timer(&gc->gc_timer, - jiffies + msecs_to_jiffies(GC_TIME_MSECS)); - } -} - static void pblk_gc_timer(unsigned long data) { struct pblk *pblk = (struct pblk *)data; diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c index 739f855d4216..abae31fd434e 100644 --- a/drivers/lightnvm/pblk-rl.c +++ b/drivers/lightnvm/pblk-rl.c @@ -153,16 +153,6 @@ int pblk_rl_high_thrs(struct pblk_rl *rl) return rl->high; } -int pblk_rl_low_thrs(struct pblk_rl *rl) -{ - return rl->low; -} - -int pblk_rl_sysfs_rate_show(struct pblk_rl *rl) -{ - return rl->rb_user_max; -} - int pblk_rl_max_io(struct pblk_rl *rl) { return rl->rb_max_io; diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index c6f8841973a0..6c9ea9a93704 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -710,8 +710,6 @@ void pblk_free_rqd(struct pblk *pblk, struct nvm_rq *rqd, int type); void pblk_set_sec_per_write(struct pblk *pblk, int sec_per_write); int pblk_setup_w_rec_rq(struct pblk *pblk, struct nvm_rq *rqd, struct pblk_c_ctx *c_ctx); -void pblk_wait_for_meta(struct pblk *pblk); -struct ppa_addr pblk_get_lba_map(struct pblk *pblk, sector_t lba); void pblk_discard(struct pblk *pblk, struct bio *bio); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); @@ -732,10 +730,8 @@ int pblk_line_is_full(struct pblk_line *line); void pblk_line_free(struct pblk *pblk, struct pblk_line *line); void pblk_line_close_meta(struct pblk *pblk, struct pblk_line *line); void pblk_line_close(struct pblk *pblk, struct pblk_line *line); -void pblk_line_close_meta_sync(struct pblk *pblk); void pblk_line_close_ws(struct work_struct *work); void pblk_pipeline_stop(struct pblk *pblk); -void pblk_line_mark_bb(struct work_struct *work); void pblk_gen_run_ws(struct pblk *pblk, struct pblk_line *line, void *priv, void (*work)(struct work_struct *), gfp_t gfp_mask, struct workqueue_struct *wq); @@ -759,7 +755,6 @@ void pblk_down_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, void pblk_down_page(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas); void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas, unsigned long *lun_bitmap); -void pblk_end_bio_sync(struct bio *bio); void pblk_end_io_sync(struct nvm_rq *rqd); int pblk_bio_add_pages(struct pblk *pblk, struct bio *bio, gfp_t flags, int nr_pages); @@ -834,7 +829,6 @@ void pblk_gc_exit(struct pblk *pblk); void pblk_gc_should_start(struct pblk *pblk); void pblk_gc_should_stop(struct pblk *pblk); void pblk_gc_should_kick(struct pblk *pblk); -void pblk_gc_kick(struct pblk *pblk); void pblk_gc_free_full_lines(struct pblk *pblk); void pblk_gc_sysfs_state_show(struct pblk *pblk, int *gc_enabled, int *gc_active); @@ -847,7 +841,6 @@ void pblk_rl_init(struct pblk_rl *rl, int budget); void pblk_rl_free(struct pblk_rl *rl); void pblk_rl_update_rates(struct pblk_rl *rl); int pblk_rl_high_thrs(struct pblk_rl *rl); -int pblk_rl_low_thrs(struct pblk_rl *rl); unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl); int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries); @@ -855,11 +848,9 @@ void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries); int pblk_rl_gc_may_insert(struct pblk_rl *rl, int nr_entries); void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries); void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc); -int pblk_rl_sysfs_rate_show(struct pblk_rl *rl); int pblk_rl_max_io(struct pblk_rl *rl); void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line); void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line); -void pblk_rl_set_space_limit(struct pblk_rl *rl, int entries_left); int pblk_rl_is_limit(struct pblk_rl *rl); /* @@ -868,6 +859,11 @@ int pblk_rl_is_limit(struct pblk_rl *rl); int pblk_sysfs_init(struct gendisk *tdisk); void pblk_sysfs_exit(struct gendisk *tdisk); +static inline void test(size_t a) +{ + a += 1; +} + static inline void *pblk_malloc(size_t size, int type, gfp_t flags) { if (type == PBLK_KMALLOC_META) -- cgit v1.2.1 From 8da10cce7c7f7f9f5edc77271cf6e0c45b762004 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:45 +0200 Subject: lightnvm: pblk: avoid being reported as hung on rated GC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The amount of GC I/O on the write buffer is managed by the rate-limiter, which is calculated as a function of the number of available free blocks. When reaching the stable point, we risk having scheduled more I/Os for GC than are allowed on the write buffer. This would result on the GC semaphore balancing the outstanding read GC I/Os to be reported as "hung", though the behavior is normal. Solve this by allowing to schedule when we detect that the read GC path is not moving forward. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk-gc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c index b8323e34d1bc..00d5698d64a9 100644 --- a/drivers/lightnvm/pblk-gc.c +++ b/drivers/lightnvm/pblk-gc.c @@ -218,7 +218,13 @@ next_rq: gc_rq_ws->line = line; gc_rq_ws->priv = gc_rq; - down(&gc->gc_sem); + /* The write GC path can be much slower than the read GC one due to + * the budget imposed by the rate-limiter. Balance in case that we get + * back pressure from the write GC path. + */ + while (down_timeout(&gc->gc_sem, msecs_to_jiffies(30000))) + io_schedule(); + kref_get(&line->ref); INIT_WORK(&gc_rq_ws->ws, pblk_gc_line_ws); -- cgit v1.2.1 From 1b839187db62aae22a9254fd9e3ce376149b0918 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:46 +0200 Subject: lightnvm: fail fast on passthrough commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make LightNVM passhtrough commands fail fast. User space will then take care of re-submitting. Fixes: 84d4add793c6 ('lightnvm: add ioctls for vector I/Os') Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/nvme/host/lightnvm.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 1f79e3f141e6..6017153c2439 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -600,8 +600,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, rq->timeout = timeout ? timeout : ADMIN_TIMEOUT; - rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - if (ppa_buf && ppa_len) { ppa_list = dma_pool_alloc(dev->dma_pool, GFP_KERNEL, &ppa_dma); if (!ppa_list) { -- cgit v1.2.1 From 1a94b2d484677dc559c96251dd0e7c7b8811c378 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Fri, 13 Oct 2017 14:46:47 +0200 Subject: lightnvm: implement generic path for sync I/O MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement a generic path for sending sync I/O on LightNVM. This allows to reuse the standard synchronous path trough blk_execute_rq(), instead of implementing a wait_for_completion on the target side (e.g., pblk). Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 25 +++++++++----- drivers/lightnvm/pblk-core.c | 74 ++++++++++++---------------------------- drivers/lightnvm/pblk-read.c | 21 ++---------- drivers/lightnvm/pblk-recovery.c | 31 ++--------------- drivers/lightnvm/pblk.h | 42 +++++++++++++++++++++-- drivers/nvme/host/lightnvm.c | 70 +++++++++++++++++++++++++++++-------- 6 files changed, 140 insertions(+), 123 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 0e5f77234c79..fe21f4dd33e9 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -720,12 +720,25 @@ int nvm_submit_io(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) } EXPORT_SYMBOL(nvm_submit_io); -static void nvm_end_io_sync(struct nvm_rq *rqd) +int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd) { - struct completion *waiting = rqd->private; + struct nvm_dev *dev = tgt_dev->parent; + int ret; + + if (!dev->ops->submit_io_sync) + return -ENODEV; + + nvm_rq_tgt_to_dev(tgt_dev, rqd); - complete(waiting); + rqd->dev = tgt_dev; + + /* In case of error, fail with right address format */ + ret = dev->ops->submit_io_sync(dev, rqd); + nvm_rq_dev_to_tgt(tgt_dev, rqd); + + return ret; } +EXPORT_SYMBOL(nvm_submit_io_sync); int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas) @@ -733,25 +746,21 @@ int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, struct nvm_geo *geo = &tgt_dev->geo; struct nvm_rq rqd; int ret; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); rqd.opcode = NVM_OP_ERASE; - rqd.end_io = nvm_end_io_sync; - rqd.private = &wait; rqd.flags = geo->plane_mode >> 1; ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas); if (ret) return ret; - ret = nvm_submit_io(tgt_dev, &rqd); + ret = nvm_submit_io_sync(tgt_dev, &rqd); if (ret) { pr_err("rrpr: erase I/O submission failed: %d\n", ret); goto free_ppa_list; } - wait_for_completion_io(&wait); free_ppa_list: nvm_free_rqd_ppalist(tgt_dev, &rqd); diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c index 4199119a0754..ce90213a42fa 100644 --- a/drivers/lightnvm/pblk-core.c +++ b/drivers/lightnvm/pblk-core.c @@ -412,39 +412,33 @@ int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd) struct nvm_tgt_dev *dev = pblk->dev; #ifdef CONFIG_NVM_DEBUG - struct ppa_addr *ppa_list; + int ret; - ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; - if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { - WARN_ON(1); - return -EINVAL; - } + ret = pblk_check_io(pblk, rqd); + if (ret) + return ret; +#endif - if (rqd->opcode == NVM_OP_PWRITE) { - struct pblk_line *line; - struct ppa_addr ppa; - int i; + atomic_inc(&pblk->inflight_io); - for (i = 0; i < rqd->nr_ppas; i++) { - ppa = ppa_list[i]; - line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + return nvm_submit_io(dev, rqd); +} - spin_lock(&line->lock); - if (line->state != PBLK_LINESTATE_OPEN) { - pr_err("pblk: bad ppa: line:%d,state:%d\n", - line->id, line->state); - WARN_ON(1); - spin_unlock(&line->lock); - return -EINVAL; - } - spin_unlock(&line->lock); - } - } +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + +#ifdef CONFIG_NVM_DEBUG + int ret; + + ret = pblk_check_io(pblk, rqd); + if (ret) + return ret; #endif atomic_inc(&pblk->inflight_io); - return nvm_submit_io(dev, rqd); + return nvm_submit_io_sync(dev, rqd); } static void pblk_bio_map_addr_endio(struct bio *bio) @@ -597,7 +591,6 @@ static int pblk_line_submit_emeta_io(struct pblk *pblk, struct pblk_line *line, int cmd_op, bio_op; int i, j; int ret; - DECLARE_COMPLETION_ONSTACK(wait); if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; @@ -639,8 +632,6 @@ next_rq: rqd.dma_ppa_list = dma_ppa_list; rqd.opcode = cmd_op; rqd.nr_ppas = rq_ppas; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; if (dir == PBLK_WRITE) { struct pblk_sec_meta *meta_list = rqd.meta_list; @@ -694,19 +685,14 @@ next_rq: } } - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { pr_err("pblk: emeta I/O submission failed: %d\n", ret); bio_put(bio); goto free_rqd_dma; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: emeta I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); if (rqd.error) { if (dir == PBLK_WRITE) @@ -750,7 +736,6 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, int i, ret; int cmd_op, bio_op; int flags; - DECLARE_COMPLETION_ONSTACK(wait); if (dir == PBLK_WRITE) { bio_op = REQ_OP_WRITE; @@ -787,8 +772,6 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, rqd.opcode = cmd_op; rqd.flags = flags; rqd.nr_ppas = lm->smeta_sec; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; for (i = 0; i < lm->smeta_sec; i++, paddr++) { struct pblk_sec_meta *meta_list = rqd.meta_list; @@ -807,17 +790,13 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line, * the write thread is the only one sending write and erase commands, * there is no need to take the LUN semaphore. */ - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { pr_err("pblk: smeta I/O submission failed: %d\n", ret); bio_put(bio); goto free_ppa_list; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: smeta I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); if (rqd.error) { @@ -861,19 +840,15 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) { struct nvm_rq rqd; int ret = 0; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); pblk_setup_e_rq(pblk, &rqd, ppa); - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; - /* The write thread schedules erases so that it minimizes disturbances * with writes. Thus, there is no need to take the LUN semaphore. */ - ret = pblk_submit_io(pblk, &rqd); + ret = pblk_submit_io_sync(pblk, &rqd); if (ret) { struct nvm_tgt_dev *dev = pblk->dev; struct nvm_geo *geo = &dev->geo; @@ -886,11 +861,6 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa) goto out; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: sync erase timed out\n"); - } - out: rqd.private = pblk; __pblk_end_io_erase(pblk, &rqd); diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c index 71c58503f1a4..ca79d8fb3e60 100644 --- a/drivers/lightnvm/pblk-read.c +++ b/drivers/lightnvm/pblk-read.c @@ -207,7 +207,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, int nr_secs = rqd->nr_ppas; int nr_holes = nr_secs - bitmap_weight(read_bitmap, nr_secs); int i, ret, hole; - DECLARE_COMPLETION_ONSTACK(wait); /* Re-use allocated memory for intermediate lbas */ lba_list_mem = (((void *)rqd->ppa_list) + pblk_dma_ppa_size); @@ -232,8 +231,6 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->bio = new_bio; rqd->nr_ppas = nr_holes; rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (unlikely(nr_holes == 1)) { ppa_ptr = rqd->ppa_list; @@ -241,18 +238,13 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd, rqd->ppa_addr = rqd->ppa_list[0]; } - ret = pblk_submit_read_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { bio_put(rqd->bio); - pr_err("pblk: read IO submission failed\n"); + pr_err("pblk: sync read IO submission failed\n"); goto err; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: partial read I/O timed out\n"); - } - if (rqd->error) { atomic_long_inc(&pblk->read_failed); #ifdef CONFIG_NVM_DEBUG @@ -537,7 +529,6 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) struct nvm_rq rqd; int data_len; int ret = NVM_IO_OK; - DECLARE_COMPLETION_ONSTACK(wait); memset(&rqd, 0, sizeof(struct nvm_rq)); @@ -577,22 +568,16 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq) bio_set_op_attrs(bio, REQ_OP_READ, 0); rqd.opcode = NVM_OP_PREAD; - rqd.end_io = pblk_end_io_sync; - rqd.private = &wait; rqd.nr_ppas = gc_rq->secs_to_gc; rqd.flags = pblk_set_read_mode(pblk, PBLK_READ_RANDOM); rqd.bio = bio; - if (pblk_submit_read_io(pblk, &rqd)) { + if (pblk_submit_io_sync(pblk, &rqd)) { ret = -EIO; pr_err("pblk: GC read request failed\n"); goto err_free_bio; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: GC read I/O timed out\n"); - } atomic_dec(&pblk->inflight_io); if (rqd.error) { diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c index 9772a947ca4f..eadb3eb5d4dc 100644 --- a/drivers/lightnvm/pblk-recovery.c +++ b/drivers/lightnvm/pblk-recovery.c @@ -216,7 +216,6 @@ static int pblk_recov_read_oob(struct pblk *pblk, struct pblk_line *line, int rq_ppas, rq_len; int i, j; int ret = 0; - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -253,8 +252,6 @@ next_read_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -280,19 +277,13 @@ next_read_rq: } /* If read fails, more padding is needed */ - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - return -EINTR; - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* At this point, the read should not fail. If it does, it is a problem * we cannot recover from here. Need FTL log. @@ -504,7 +495,6 @@ static int pblk_recov_scan_all_oob(struct pblk *pblk, struct pblk_line *line, int ret = 0; int rec_round; int left_ppas = pblk_calc_sec_in_line(pblk, line) - line->cur_sec; - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -539,8 +529,6 @@ next_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -566,18 +554,13 @@ next_rq: addr_to_gen_ppa(pblk, w_ptr, line->id); } - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* This should not happen since the read failed during normal recovery, * but the media works funny sometimes... @@ -645,7 +628,6 @@ static int pblk_recov_scan_oob(struct pblk *pblk, struct pblk_line *line, int i, j; int ret = 0; int left_ppas = pblk_calc_sec_in_line(pblk, line); - DECLARE_COMPLETION_ONSTACK(wait); ppa_list = p.ppa_list; meta_list = p.meta_list; @@ -678,8 +660,6 @@ next_rq: rqd->ppa_list = ppa_list; rqd->dma_ppa_list = dma_ppa_list; rqd->dma_meta_list = dma_meta_list; - rqd->end_io = pblk_end_io_sync; - rqd->private = &wait; if (pblk_io_aligned(pblk, rq_ppas)) rqd->flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL); @@ -705,19 +685,14 @@ next_rq: addr_to_gen_ppa(pblk, paddr, line->id); } - ret = pblk_submit_io(pblk, rqd); + ret = pblk_submit_io_sync(pblk, rqd); if (ret) { pr_err("pblk: I/O submission failed: %d\n", ret); bio_put(bio); return ret; } - if (!wait_for_completion_io_timeout(&wait, - msecs_to_jiffies(PBLK_COMMAND_TIMEOUT_MS))) { - pr_err("pblk: L2P recovery read timed out\n"); - } atomic_dec(&pblk->inflight_io); - reinit_completion(&wait); /* Reached the end of the written line */ if (rqd->error) { diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 6c9ea9a93704..6b64288de6f7 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -714,6 +714,7 @@ void pblk_discard(struct pblk *pblk, struct bio *bio); void pblk_log_write_err(struct pblk *pblk, struct nvm_rq *rqd); void pblk_log_read_err(struct pblk *pblk, struct nvm_rq *rqd); int pblk_submit_io(struct pblk *pblk, struct nvm_rq *rqd); +int pblk_submit_io_sync(struct pblk *pblk, struct nvm_rq *rqd); int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line); struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data, unsigned int nr_secs, unsigned int len, @@ -1203,7 +1204,6 @@ static inline void pblk_print_failed_rqd(struct pblk *pblk, struct nvm_rq *rqd, pr_err("error:%d, ppa_status:%llx\n", error, rqd->ppa_status); } -#endif static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas, int nr_ppas) @@ -1224,14 +1224,50 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev, ppa->g.sec < geo->sec_per_pg) continue; -#ifdef CONFIG_NVM_DEBUG print_ppa(ppa, "boundary", i); -#endif + return 1; } return 0; } +static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd) +{ + struct nvm_tgt_dev *dev = pblk->dev; + struct ppa_addr *ppa_list; + + ppa_list = (rqd->nr_ppas > 1) ? rqd->ppa_list : &rqd->ppa_addr; + + if (pblk_boundary_ppa_checks(dev, ppa_list, rqd->nr_ppas)) { + WARN_ON(1); + return -EINVAL; + } + + if (rqd->opcode == NVM_OP_PWRITE) { + struct pblk_line *line; + struct ppa_addr ppa; + int i; + + for (i = 0; i < rqd->nr_ppas; i++) { + ppa = ppa_list[i]; + line = &pblk->lines[pblk_dev_ppa_to_line(ppa)]; + + spin_lock(&line->lock); + if (line->state != PBLK_LINESTATE_OPEN) { + pr_err("pblk: bad ppa: line:%d,state:%d\n", + line->id, line->state); + WARN_ON(1); + spin_unlock(&line->lock); + return -EINVAL; + } + spin_unlock(&line->lock); + } + } + + return 0; +} +#endif + static inline int pblk_boundary_paddr_checks(struct pblk *pblk, u64 paddr) { struct pblk_line_meta *lm = &pblk->lm; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 6017153c2439..8fc949c5b49b 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -492,34 +492,47 @@ static void nvme_nvm_end_io(struct request *rq, blk_status_t status) blk_mq_free_request(rq); } -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static struct request *nvme_nvm_alloc_request(struct request_queue *q, + struct nvm_rq *rqd, + struct nvme_nvm_command *cmd) { - struct request_queue *q = dev->q; struct nvme_ns *ns = q->queuedata; struct request *rq; - struct bio *bio = rqd->bio; - struct nvme_nvm_command *cmd; - - cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); - if (!cmd) - return -ENOMEM; nvme_nvm_rqtocmd(rqd, ns, cmd); rq = nvme_alloc_request(q, (struct nvme_command *)cmd, 0, NVME_QID_ANY); - if (IS_ERR(rq)) { - kfree(cmd); - return PTR_ERR(rq); - } + if (IS_ERR(rq)) + return rq; + rq->cmd_flags &= ~REQ_FAILFAST_DRIVER; - if (bio) { - blk_init_request_from_bio(rq, bio); + if (rqd->bio) { + blk_init_request_from_bio(rq, rqd->bio); } else { rq->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); rq->__data_len = 0; } + return rq; +} + +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct nvme_nvm_command *cmd; + struct request *rq; + + cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); + if (!cmd) + return -ENOMEM; + + rq = nvme_nvm_alloc_request(q, rqd, cmd); + if (IS_ERR(rq)) { + kfree(cmd); + return PTR_ERR(rq); + } + rq->end_io_data = rqd; blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); @@ -527,6 +540,34 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) return 0; } +static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + struct request_queue *q = dev->q; + struct request *rq; + struct nvme_nvm_command cmd; + int ret = 0; + + memset(&cmd, 0, sizeof(struct nvme_nvm_command)); + + rq = nvme_nvm_alloc_request(q, rqd, &cmd); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + /* I/Os can fail and the error is signaled through rqd. Callers must + * handle the error accordingly. + */ + blk_execute_rq(q, NULL, rq, 0); + if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) + ret = -EINTR; + + rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); + rqd->error = nvme_req(rq)->status; + + blk_mq_free_request(rq); + + return ret; +} + static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { struct nvme_ns *ns = nvmdev->q->queuedata; @@ -562,6 +603,7 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .set_bb_tbl = nvme_nvm_set_bb_tbl, .submit_io = nvme_nvm_submit_io, + .submit_io_sync = nvme_nvm_submit_io_sync, .create_dma_pool = nvme_nvm_create_dma_pool, .destroy_dma_pool = nvme_nvm_destroy_dma_pool, -- cgit v1.2.1 From cdd094fd0ad750c94ccaa5b2ee84fd8264f6f433 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 13 Oct 2017 09:36:06 -0600 Subject: Revert "lightnvm: prevent bd removal if busy" Christoph correctly points out that this issue is no different for other block devices, and poking at cross layer internals is not the right way to solve it. This reverts commit bb6aa6f08268bbce4e0185b18cab9e04505d6695. Signed-off-by: Jens Axboe --- drivers/lightnvm/core.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index fe21f4dd33e9..83249b43dd06 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -390,7 +390,6 @@ static void __nvm_remove_target(struct nvm_target *t) static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) { struct nvm_target *t; - struct block_device *bdev; mutex_lock(&dev->mlock); t = nvm_find_target(dev, remove->tgtname); @@ -398,19 +397,6 @@ static int nvm_remove_tgt(struct nvm_dev *dev, struct nvm_ioctl_remove *remove) mutex_unlock(&dev->mlock); return 1; } - bdev = bdget_disk(t->disk, 0); - if (!bdev) { - pr_err("nvm: removal failed, allocating bd failed\n"); - mutex_unlock(&dev->mlock); - return -ENOMEM; - } - if (bdev->bd_super || bdev->bd_part_count) { - pr_err("nvm: removal failed, block device busy\n"); - bdput(bdev); - mutex_unlock(&dev->mlock); - return -EBUSY; - } - bdput(bdev); __nvm_remove_target(t); mutex_unlock(&dev->mlock); -- cgit v1.2.1 From 761f2e1ed8822cf12378c857de337290d0c82a81 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 5 Oct 2017 18:46:46 +0200 Subject: nvme: simplify compat_ioctl handling We can just use our normal ioctl handler for the compat case and remove the boilerplate code for it. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy --- drivers/nvme/host/core.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 26c8913435b2..573cc3b59bfa 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1052,16 +1052,6 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, } } -#ifdef CONFIG_COMPAT -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif - static int nvme_open(struct block_device *bdev, fmode_t mode) { return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO; @@ -1380,7 +1370,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit); static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, + .compat_ioctl = nvme_ioctl, .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, -- cgit v1.2.1 From 58f913dce2814a9ea7260e93ed3a949e0d5565e3 Mon Sep 17 00:00:00 2001 From: Peter Foley Date: Fri, 13 Oct 2017 16:35:28 -0700 Subject: bcache: Avoid nested function definition Fixes below error with clang: ../drivers/md/bcache/sysfs.c:759:3: error: function definition is not allowed here { return *((uint16_t *) r) - *((uint16_t *) l); } ^ ../drivers/md/bcache/sysfs.c:789:32: error: use of undeclared identifier 'cmp' sort(p, n, sizeof(uint16_t), cmp, NULL); ^ 2 errors generated. v2: rename function to __bch_cache_cmp Signed-off-by: Peter Foley Reviewed-by: Coly Li Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/sysfs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 104c57cd666c..69f355b9650c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -745,6 +745,11 @@ static struct attribute *bch_cache_set_internal_files[] = { }; KTYPE(bch_cache_set_internal); +static int __bch_cache_cmp(const void *l, const void *r) +{ + return *((uint16_t *)r) - *((uint16_t *)l); +} + SHOW(__bch_cache) { struct cache *ca = container_of(kobj, struct cache, kobj); @@ -769,9 +774,6 @@ SHOW(__bch_cache) CACHE_REPLACEMENT(&ca->sb)); if (attr == &sysfs_priority_stats) { - int cmp(const void *l, const void *r) - { return *((uint16_t *) r) - *((uint16_t *) l); } - struct bucket *b; size_t n = ca->sb.nbuckets, i; size_t unused = 0, available = 0, dirty = 0, meta = 0; @@ -800,7 +802,7 @@ SHOW(__bch_cache) p[i] = ca->buckets[i].prio; mutex_unlock(&ca->set->bucket_lock); - sort(p, n, sizeof(uint16_t), cmp, NULL); + sort(p, n, sizeof(uint16_t), __bch_cache_cmp, NULL); while (n && !cached[n - 1]) -- cgit v1.2.1 From 91af8300d9c1d7c6b6a2fd754109e08d4798b8d8 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 13 Oct 2017 16:35:29 -0700 Subject: bcache: check ca->alloc_thread initialized before wake up it In bcache code, sysfs entries are created before all resources get allocated, e.g. allocation thread of a cache set. There is posibility for NULL pointer deference if a resource is accessed but which is not initialized yet. Indeed Jorg Bornschein catches one on cache set allocation thread and gets a kernel oops. The reason for this bug is, when bch_bucket_alloc() is called during cache set registration and attaching, ca->alloc_thread is not properly allocated and initialized yet, call wake_up_process() on ca->alloc_thread triggers NULL pointer deference failure. A simple and fast fix is, before waking up ca->alloc_thread, checking whether it is allocated, and only wake up ca->alloc_thread when it is not NULL. Signed-off-by: Coly Li Reported-by: Jorg Bornschein Cc: Kent Overstreet Cc: stable@vger.kernel.org Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index cacbe2dbd5c3..e1a4205caa2e 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -406,7 +406,8 @@ long bch_bucket_alloc(struct cache *ca, unsigned reserve, bool wait) finish_wait(&ca->set->bucket_wait, &w); out: - wake_up_process(ca->alloc_thread); + if (ca->alloc_thread) + wake_up_process(ca->alloc_thread); trace_bcache_alloc(ca, reserve); -- cgit v1.2.1 From b1e8139e48b58e3bc1234e619c750ffd1394be2f Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 13 Oct 2017 16:35:30 -0700 Subject: bcache: fix a comments typo in bch_alloc_sectors() Code comments in alloc.c:bch_alloc_sectors() mentions a function name find_data_bucket(), the correct function name should be pick_data_bucket() indeed. bch_alloc_sectors() is a quite important function in bcache allocation code, fixing the typo may help other people to have less confusion. Signed-off-by: Coly Li Reviewed-by: Tang Junhui Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index e1a4205caa2e..4c40870e99f5 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -601,7 +601,7 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, /* * If we had to allocate, we might race and not need to allocate the - * second time we call find_data_bucket(). If we allocated a bucket but + * second time we call pick_data_bucket(). If we allocated a bucket but * didn't use it, drop the refcount bch_bucket_alloc_set() took: */ if (KEY_PTRS(&alloc.key)) -- cgit v1.2.1 From 1dbe32ad0a82f39c6dfb7667c5da5c23b9333664 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 13 Oct 2017 16:35:31 -0700 Subject: bcache: rewrite multiple partitions support Current partition support of bcache is confusing and buggy. It tries to trace non-continuous device minor numbers by an ida bit string, and mistakenly mixed bcache device index with minor numbers. This design generates several negative results, - Index of bcache device name is not consecutive under /dev/. If there are 3 bcache devices, they name will be, /dev/bcache0, /dev/bcache16, /dev/bcache32 Only bcache code indexes bcache device name is such an interesting way. - First minor number of each bcache device is traced by ida bit string. One bcache device will occupy 16 bits, this is not a good idea. Indeed only one bit is enough. - Because minor number and bcache device index are mixed, a device index is allocated by ida_simple_get(), but an first minor number is sent into ida_simple_remove() to release the device. It confused original author too. Root cause of the above errors is, bcache code should not handle device minor numbers at all! A standard process to support multiple partitions in Linux kernel is, - Device driver provides major device number, and indexes multiple device instances. - Device driver does not allocat nor trace device minor number, only provides a first minor number of a given device instance, and sets how many minor numbers (paritions) the device instance may have. All rested stuffs are handled by block layer code, most of the details can be found from block/{genhd, partition-generic}.c files. This patch re-writes multiple partitions support for bcache. It makes whole things to be more clear, and uses ida bit string in a more efficeint way. - Ida bit string only traces bcache device index, not minor number. For a bcache device with 128 partitions, only one bit in ida bit string is enough. - Device minor number and device index are separated in concept. Device index is used for /dev node naming, and ida bit string trace. Minor number is calculated from device index and only used to initialize first_minor of a bcache device. - It does not follow any standard for 16 partitions on a bcache device. This patch sets 128 partitions on single bcache device at max, this is the limitation from GPT (GUID Partition Table) and supported by fdisk. Considering a typical device minor number is 20 bits width, each bcache device may have 128 partitions (7 bits), there can be 8192 bcache devices existing on system. For most common deployment for a single server in now days, it should be enough. [minor spelling fixes in commit message by Michael Lyle] Signed-off-by: Coly Li Cc: Eric Wheeler Cc: Junhui Tang Reviewed-by: Michael Lyle Signed-off-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index fc0a31b13ac4..a478d1ac0480 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -53,12 +53,15 @@ LIST_HEAD(bch_cache_sets); static LIST_HEAD(uncached_devices); static int bcache_major; -static DEFINE_IDA(bcache_minor); +static DEFINE_IDA(bcache_device_idx); static wait_queue_head_t unregister_wait; struct workqueue_struct *bcache_wq; #define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) -#define BCACHE_MINORS 16 /* partition support */ +/* limitation of partitions number on single bcache device */ +#define BCACHE_MINORS 128 +/* limitation of bcache devices number on single system */ +#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS) /* Superblock */ @@ -721,6 +724,16 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, closure_get(&c->caching); } +static inline int first_minor_to_idx(int first_minor) +{ + return (first_minor/BCACHE_MINORS); +} + +static inline int idx_to_first_minor(int idx) +{ + return (idx * BCACHE_MINORS); +} + static void bcache_device_free(struct bcache_device *d) { lockdep_assert_held(&bch_register_lock); @@ -734,7 +747,8 @@ static void bcache_device_free(struct bcache_device *d) if (d->disk && d->disk->queue) blk_cleanup_queue(d->disk->queue); if (d->disk) { - ida_simple_remove(&bcache_minor, d->disk->first_minor); + ida_simple_remove(&bcache_device_idx, + first_minor_to_idx(d->disk->first_minor)); put_disk(d->disk); } @@ -751,7 +765,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, { struct request_queue *q; size_t n; - int minor; + int idx; if (!d->stripe_size) d->stripe_size = 1 << 31; @@ -776,25 +790,24 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, if (!d->full_dirty_stripes) return -ENOMEM; - minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL); - if (minor < 0) - return minor; - - minor *= BCACHE_MINORS; + idx = ida_simple_get(&bcache_device_idx, 0, + BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); + if (idx < 0) + return idx; if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio), BIOSET_NEED_BVECS | BIOSET_NEED_RESCUER)) || !(d->disk = alloc_disk(BCACHE_MINORS))) { - ida_simple_remove(&bcache_minor, minor); + ida_simple_remove(&bcache_device_idx, idx); return -ENOMEM; } set_capacity(d->disk, sectors); - snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor); + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx); d->disk->major = bcache_major; - d->disk->first_minor = minor; + d->disk->first_minor = idx_to_first_minor(idx); d->disk->fops = &bcache_ops; d->disk->private_data = d; -- cgit v1.2.1 From e89d67596e202119ea846cc997f4cf75cb284490 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Fri, 13 Oct 2017 16:35:32 -0700 Subject: bcache: Remove redundant set_capacity set_capacity() has been called in bcache_device_init(), remove the redundant one. Signed-off-by: Yijing Wang Reviewed-by: Eric Wheeler Acked-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index a478d1ac0480..72c3b2929ef0 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1142,9 +1142,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) if (ret) return ret; - set_capacity(dc->disk.disk, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - dc->disk.disk->queue->backing_dev_info->ra_pages = max(dc->disk.disk->queue->backing_dev_info->ra_pages, q->backing_dev_info->ra_pages); -- cgit v1.2.1 From b41c9b0266e8370033a7799f6806bfc70b7fd75f Mon Sep 17 00:00:00 2001 From: Eric Wheeler Date: Fri, 13 Oct 2017 16:35:33 -0700 Subject: bcache: update bio->bi_opf bypass/writeback REQ_ flag hints Flag for bypass if the IO is for read-ahead or background, unless the read-ahead request is for metadata (eg, from gfs2). Bypass if: bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && !(bio->bi_opf & REQ_META)) Writeback if: op_is_sync(bio->bi_opf) || bio->bi_opf & (REQ_META|REQ_PRIO) Signed-off-by: Eric Wheeler Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 8 ++++++++ drivers/md/bcache/writeback.h | 4 +++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 681b4f12b05a..9ee137e8d387 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -384,6 +384,14 @@ static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) op_is_write(bio_op(bio)))) goto skip; + /* + * Flag for bypass if the IO is for read-ahead or background, + * unless the read-ahead request is for metadata (eg, for gfs2). + */ + if (bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND) && + !(bio->bi_opf & REQ_META)) + goto skip; + if (bio->bi_iter.bi_sector & (c->sb.block_size - 1) || bio_sectors(bio) & (c->sb.block_size - 1)) { pr_debug("skipping unaligned io"); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index e35421d20d2e..34bcf49d737b 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -76,7 +76,9 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, if (would_skip) return false; - return op_is_sync(bio->bi_opf) || in_use <= CUTOFF_WRITEBACK; + return (op_is_sync(bio->bi_opf) || + bio->bi_opf & (REQ_META|REQ_PRIO) || + in_use <= CUTOFF_WRITEBACK); } static inline void bch_writeback_queue(struct cached_dev *dc) -- cgit v1.2.1 From 238501027abf0386fa5f5dcaf589f406eb187bc3 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Fri, 13 Oct 2017 16:35:34 -0700 Subject: bcache: remove unused parameter Parameter bio is no longer used, clean it. Signed-off-by: Yijing Wang Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 9ee137e8d387..163a17a80874 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -26,12 +26,12 @@ struct kmem_cache *bch_search_cache; static void bch_data_insert_start(struct closure *); -static unsigned cache_mode(struct cached_dev *dc, struct bio *bio) +static unsigned cache_mode(struct cached_dev *dc) { return BDEV_CACHE_MODE(&dc->sb); } -static bool verify(struct cached_dev *dc, struct bio *bio) +static bool verify(struct cached_dev *dc) { return dc->verify; } @@ -369,7 +369,7 @@ static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k) static bool check_should_bypass(struct cached_dev *dc, struct bio *bio) { struct cache_set *c = dc->disk.c; - unsigned mode = cache_mode(dc, bio); + unsigned mode = cache_mode(dc); unsigned sectors, congested = bch_get_congested(c); struct task_struct *task = current; struct io *i; @@ -747,7 +747,7 @@ static void cached_dev_read_done(struct closure *cl) s->cache_miss = NULL; } - if (verify(dc, &s->bio.bio) && s->recoverable && !s->read_dirty_data) + if (verify(dc) && s->recoverable && !s->read_dirty_data) bch_data_verify(dc, s->orig_bio); bio_complete(s); @@ -772,7 +772,7 @@ static void cached_dev_read_done_bh(struct closure *cl) if (s->iop.status) continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq); - else if (s->iop.bio || verify(dc, &s->bio.bio)) + else if (s->iop.bio || verify(dc)) continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq); else continue_at_nobarrier(cl, cached_dev_bio_complete, NULL); @@ -899,7 +899,7 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) s->iop.bypass = true; if (should_writeback(dc, s->orig_bio, - cache_mode(dc, bio), + cache_mode(dc), s->iop.bypass)) { s->iop.bypass = false; s->iop.writeback = true; -- cgit v1.2.1 From 5fa89fb9a86bcc0f0b3f21ab6087a8a4170dcd2c Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:35 -0700 Subject: bcache: don't write back data if reading it failed If an IO operation fails, and we didn't successfully read data from the cache, don't writeback invalid/partial data to the backing disk. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index e663ca082183..5e65a392287d 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -179,13 +179,21 @@ static void write_dirty(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; - dirty_init(w); - bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); - io->bio.bi_iter.bi_sector = KEY_START(&w->key); - bio_set_dev(&io->bio, io->dc->bdev); - io->bio.bi_end_io = dirty_endio; + /* + * IO errors are signalled using the dirty bit on the key. + * If we failed to read, we should not attempt to write to the + * backing device. Instead, immediately go to write_dirty_finish + * to clean up. + */ + if (KEY_DIRTY(&w->key)) { + dirty_init(w); + bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0); + io->bio.bi_iter.bi_sector = KEY_START(&w->key); + bio_set_dev(&io->bio, io->dc->bdev); + io->bio.bi_end_io = dirty_endio; - closure_bio_submit(&io->bio, cl); + closure_bio_submit(&io->bio, cl); + } continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } -- cgit v1.2.1 From 1d316e658374f700fdfff9299c70ce65d8d145e6 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:36 -0700 Subject: bcache: implement PI controller for writeback rate bcache uses a control system to attempt to keep the amount of dirty data in cache at a user-configured level, while not responding excessively to transients and variations in write rate. Previously, the system was a PD controller; but the output from it was integrated, turning the Proportional term into an Integral term, and turning the Derivative term into a crude Proportional term. Performance of the controller has been uneven in production, and it has tended to respond slowly, oscillate, and overshoot. This patch set replaces the current control system with an explicit PI controller and tuning that should be correct for most hardware. By default, it attempts to write at a rate that would retire 1/40th of the current excess blocks per second. An integral term in turn works to remove steady state errors. IMO, this yields benefits in simplicity (removing weighted average filtering, etc) and system performance. Another small change is a tunable parameter is introduced to allow the user to specify a minimum rate at which dirty blocks are retired. There is a slight difference from earlier versions of the patch in integral handling to prevent excessive negative integral windup. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 9 ++--- drivers/md/bcache/sysfs.c | 18 +++++---- drivers/md/bcache/writeback.c | 91 ++++++++++++++++++++++++------------------- 3 files changed, 66 insertions(+), 52 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 2ed9bd231d84..eb83be693d60 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -265,9 +265,6 @@ struct bcache_device { atomic_t *stripe_sectors_dirty; unsigned long *full_dirty_stripes; - unsigned long sectors_dirty_last; - long sectors_dirty_derivative; - struct bio_set *bio_split; unsigned data_csum:1; @@ -362,12 +359,14 @@ struct cached_dev { uint64_t writeback_rate_target; int64_t writeback_rate_proportional; - int64_t writeback_rate_derivative; + int64_t writeback_rate_integral; + int64_t writeback_rate_integral_scaled; int64_t writeback_rate_change; unsigned writeback_rate_update_seconds; - unsigned writeback_rate_d_term; + unsigned writeback_rate_i_term_inverse; unsigned writeback_rate_p_term_inverse; + unsigned writeback_rate_minimum; }; enum alloc_reserve { diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 69f355b9650c..2290bffd4922 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -81,8 +81,9 @@ rw_attribute(writeback_delay); rw_attribute(writeback_rate); rw_attribute(writeback_rate_update_seconds); -rw_attribute(writeback_rate_d_term); +rw_attribute(writeback_rate_i_term_inverse); rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_minimum); read_attribute(writeback_rate_debug); read_attribute(stripe_size); @@ -130,15 +131,16 @@ SHOW(__bch_cached_dev) sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); var_print(writeback_rate_update_seconds); - var_print(writeback_rate_d_term); + var_print(writeback_rate_i_term_inverse); var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_minimum); if (attr == &sysfs_writeback_rate_debug) { char rate[20]; char dirty[20]; char target[20]; char proportional[20]; - char derivative[20]; + char integral[20]; char change[20]; s64 next_io; @@ -146,7 +148,7 @@ SHOW(__bch_cached_dev) bch_hprint(dirty, bcache_dev_sectors_dirty(&dc->disk) << 9); bch_hprint(target, dc->writeback_rate_target << 9); bch_hprint(proportional,dc->writeback_rate_proportional << 9); - bch_hprint(derivative, dc->writeback_rate_derivative << 9); + bch_hprint(integral, dc->writeback_rate_integral_scaled << 9); bch_hprint(change, dc->writeback_rate_change << 9); next_io = div64_s64(dc->writeback_rate.next - local_clock(), @@ -157,11 +159,11 @@ SHOW(__bch_cached_dev) "dirty:\t\t%s\n" "target:\t\t%s\n" "proportional:\t%s\n" - "derivative:\t%s\n" + "integral:\t%s\n" "change:\t\t%s/sec\n" "next io:\t%llims\n", rate, dirty, target, proportional, - derivative, change, next_io); + integral, change, next_io); } sysfs_hprint(dirty_data, @@ -213,7 +215,7 @@ STORE(__cached_dev) dc->writeback_rate.rate, 1, INT_MAX); d_strtoul_nonzero(writeback_rate_update_seconds); - d_strtoul(writeback_rate_d_term); + d_strtoul(writeback_rate_i_term_inverse); d_strtoul_nonzero(writeback_rate_p_term_inverse); d_strtoi_h(sequential_cutoff); @@ -319,7 +321,7 @@ static struct attribute *bch_cached_dev_files[] = { &sysfs_writeback_percent, &sysfs_writeback_rate, &sysfs_writeback_rate_update_seconds, - &sysfs_writeback_rate_d_term, + &sysfs_writeback_rate_i_term_inverse, &sysfs_writeback_rate_p_term_inverse, &sysfs_writeback_rate_debug, &sysfs_dirty_data, diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 5e65a392287d..cac8678da5d0 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -25,48 +25,62 @@ static void __update_writeback_rate(struct cached_dev *dc) bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); - int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), c->cached_dev_sectors); - /* PD controller */ - + /* + * PI controller: + * Figures out the amount that should be written per second. + * + * First, the error (number of sectors that are dirty beyond our + * target) is calculated. The error is accumulated (numerically + * integrated). + * + * Then, the proportional value and integral value are scaled + * based on configured values. These are stored as inverses to + * avoid fixed point math and to make configuration easy-- e.g. + * the default value of 40 for writeback_rate_p_term_inverse + * attempts to write at a rate that would retire all the dirty + * blocks in 40 seconds. + * + * The writeback_rate_i_inverse value of 10000 means that 1/10000th + * of the error is accumulated in the integral term per second. + * This acts as a slow, long-term average that is not subject to + * variations in usage like the p term. + */ int64_t dirty = bcache_dev_sectors_dirty(&dc->disk); - int64_t derivative = dirty - dc->disk.sectors_dirty_last; - int64_t proportional = dirty - target; - int64_t change; - - dc->disk.sectors_dirty_last = dirty; - - /* Scale to sectors per second */ - - proportional *= dc->writeback_rate_update_seconds; - proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse); - - derivative = div_s64(derivative, dc->writeback_rate_update_seconds); - - derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, - (dc->writeback_rate_d_term / - dc->writeback_rate_update_seconds) ?: 1, 0); - - derivative *= dc->writeback_rate_d_term; - derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse); - - change = proportional + derivative; + int64_t error = dirty - target; + int64_t proportional_scaled = + div_s64(error, dc->writeback_rate_p_term_inverse); + int64_t integral_scaled, new_rate; + + if ((error < 0 && dc->writeback_rate_integral > 0) || + (error > 0 && time_before64(local_clock(), + dc->writeback_rate.next + NSEC_PER_MSEC))) { + /* + * Only decrease the integral term if it's more than + * zero. Only increase the integral term if the device + * is keeping up. (Don't wind up the integral + * ineffectively in either case). + * + * It's necessary to scale this by + * writeback_rate_update_seconds to keep the integral + * term dimensioned properly. + */ + dc->writeback_rate_integral += error * + dc->writeback_rate_update_seconds; + } - /* Don't increase writeback rate if the device isn't keeping up */ - if (change > 0 && - time_after64(local_clock(), - dc->writeback_rate.next + NSEC_PER_MSEC)) - change = 0; + integral_scaled = div_s64(dc->writeback_rate_integral, + dc->writeback_rate_i_term_inverse); - dc->writeback_rate.rate = - clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change, - 1, NSEC_PER_MSEC); + new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_MSEC); - dc->writeback_rate_proportional = proportional; - dc->writeback_rate_derivative = derivative; - dc->writeback_rate_change = change; + dc->writeback_rate_proportional = proportional_scaled; + dc->writeback_rate_integral_scaled = integral_scaled; + dc->writeback_rate_change = new_rate - dc->writeback_rate.rate; + dc->writeback_rate.rate = new_rate; dc->writeback_rate_target = target; } @@ -499,8 +513,6 @@ void bch_sectors_dirty_init(struct bcache_device *d) bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - - d->sectors_dirty_last = bcache_dev_sectors_dirty(d); } void bch_cached_dev_writeback_init(struct cached_dev *dc) @@ -514,10 +526,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_percent = 10; dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; + dc->writeback_rate_minimum = 1; dc->writeback_rate_update_seconds = 5; - dc->writeback_rate_d_term = 30; - dc->writeback_rate_p_term_inverse = 6000; + dc->writeback_rate_p_term_inverse = 40; + dc->writeback_rate_i_term_inverse = 10000; INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); } -- cgit v1.2.1 From ae82ddbfeb359fcffa97be5fb5bcd59165f2864f Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:37 -0700 Subject: bcache: smooth writeback rate control This works in conjunction with the new PI controller. Currently, in real-world workloads, the rate controller attempts to write back 1 sector per second. In practice, these minimum-rate writebacks are between 4k and 60k in test scenarios, since bcache aggregates and attempts to do contiguous writes and because filesystems on top of bcachefs typically write 4k or more. Previously, bcache used to guarantee to write at least once per second. This means that the actual writeback rate would exceed the configured amount by a factor of 8-120 or more. This patch adjusts to be willing to sleep up to 2.5 seconds, and to target writing 4k/second. On the smallest writes, it will sleep 1 second like before, but many times it will sleep longer and load the backing device less. This keeps the loading on the cache and backing device related to writeback more consistent when writing back at low rates. Signed-off-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/util.c | 10 ++++++++-- drivers/md/bcache/writeback.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 176d3c2ef5f5..4dbe37e82877 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -232,8 +232,14 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) d->next += div_u64(done * NSEC_PER_SEC, d->rate); - if (time_before64(now + NSEC_PER_SEC, d->next)) - d->next = now + NSEC_PER_SEC; + /* Bound the time. Don't let us fall further than 2 seconds behind + * (this prevents unnecessary backlog that would make it impossible + * to catch up). If we're ahead of the desired writeback rate, + * don't let us sleep more than 2.5 seconds (so we can notice/respond + * if the control system tells us to speed up!). + */ + if (time_before64(now + NSEC_PER_SEC * 5 / 2, d->next)) + d->next = now + NSEC_PER_SEC * 5 / 2; if (time_after64(now - NSEC_PER_SEC * 2, d->next)) d->next = now - NSEC_PER_SEC * 2; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index cac8678da5d0..8deb721c355e 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -526,7 +526,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) dc->writeback_percent = 10; dc->writeback_delay = 30; dc->writeback_rate.rate = 1024; - dc->writeback_rate_minimum = 1; + dc->writeback_rate_minimum = 8; dc->writeback_rate_update_seconds = 5; dc->writeback_rate_p_term_inverse = 40; -- cgit v1.2.1 From e41166c5c44e30dbd620f7c77a27efe5d5cc551a Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:38 -0700 Subject: bcache: writeback rate shouldn't artifically clamp The previous code artificially limited writeback rate to 1000000 blocks/second (NSEC_PER_MSEC), which is a rate that can be met on fast hardware. The rate limiting code works fine (though with decreased precision) up to 3 orders of magnitude faster, so use NSEC_PER_SEC. Additionally, ensure that uint32_t is used as a type for rate throughout the rate management so that type checking/clamp_t can work properly. bch_next_delay should be rewritten for increased precision and better handling of high rates and long sleep periods, but this is adequate for now. Signed-off-by: Michael Lyle Reported-by: Coly Li Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/util.h | 4 ++-- drivers/md/bcache/writeback.c | 7 ++++--- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index eb83be693d60..d77c4829c497 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -361,7 +361,7 @@ struct cached_dev { int64_t writeback_rate_proportional; int64_t writeback_rate_integral; int64_t writeback_rate_integral_scaled; - int64_t writeback_rate_change; + int32_t writeback_rate_change; unsigned writeback_rate_update_seconds; unsigned writeback_rate_i_term_inverse; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index cb8d2ccbb6c6..8f509290bb02 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -441,10 +441,10 @@ struct bch_ratelimit { uint64_t next; /* - * Rate at which we want to do work, in units per nanosecond + * Rate at which we want to do work, in units per second * The units here correspond to the units passed to bch_next_delay() */ - unsigned rate; + uint32_t rate; }; static inline void bch_ratelimit_reset(struct bch_ratelimit *d) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8deb721c355e..897d28050656 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -52,7 +52,8 @@ static void __update_writeback_rate(struct cached_dev *dc) int64_t error = dirty - target; int64_t proportional_scaled = div_s64(error, dc->writeback_rate_p_term_inverse); - int64_t integral_scaled, new_rate; + int64_t integral_scaled; + uint32_t new_rate; if ((error < 0 && dc->writeback_rate_integral > 0) || (error > 0 && time_before64(local_clock(), @@ -74,8 +75,8 @@ static void __update_writeback_rate(struct cached_dev *dc) integral_scaled = div_s64(dc->writeback_rate_integral, dc->writeback_rate_i_term_inverse); - new_rate = clamp_t(int64_t, (proportional_scaled + integral_scaled), - dc->writeback_rate_minimum, NSEC_PER_MSEC); + new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled), + dc->writeback_rate_minimum, NSEC_PER_SEC); dc->writeback_rate_proportional = proportional_scaled; dc->writeback_rate_integral_scaled = integral_scaled; -- cgit v1.2.1 From a8500fc816b19795756d27c762daa5e19f5e1b6f Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Fri, 13 Oct 2017 16:35:39 -0700 Subject: bcache: rearrange writeback main thread ratelimit The time spent searching for things to write back "counts" for the actual rate achieved, so don't flush the accumulated rate with each chunk. This will maintain better fidelity to user-commanded rates, but it may slightly increase the burstiness of writeback. The writeback lock needs improvement to help mitigate this. Signed-off-by: Michael Lyle Reviewed-by: Kent Overstreet Signed-off-by: Jens Axboe --- drivers/md/bcache/writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 897d28050656..9b770b13bdf6 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -440,6 +440,8 @@ static int bch_writeback_thread(void *arg) struct cached_dev *dc = arg; bool searched_full_index; + bch_ratelimit_reset(&dc->writeback_rate); + while (!kthread_should_stop()) { down_write(&dc->writeback_lock); if (!atomic_read(&dc->has_dirty) || @@ -467,7 +469,6 @@ static int bch_writeback_thread(void *arg) up_write(&dc->writeback_lock); - bch_ratelimit_reset(&dc->writeback_rate); read_dirty(dc); if (searched_full_index) { @@ -477,6 +478,8 @@ static int bch_writeback_thread(void *arg) !kthread_should_stop() && !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)) delay = schedule_timeout_interruptible(delay); + + bch_ratelimit_reset(&dc->writeback_rate); } } -- cgit v1.2.1 From 6446c684f9418d0175c9c3e5134e7744fe79181a Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Fri, 13 Oct 2017 16:35:40 -0700 Subject: bcache: safeguard a dangerous addressing in closure_queue The use of the union reduces the size of closure struct by taking advantage of the current size of its members. The offset of func in work_struct equals the size of the first three members, so that work.work_func will just reference the forth member - fn. This is smart but dangerous. It can be broken if work_struct or the other structs get changed, and can be a bit difficult to debug. Signed-off-by: Liang Chen Reviewed-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/closure.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h index 295b7e43f92c..00fb314cce57 100644 --- a/drivers/md/bcache/closure.h +++ b/drivers/md/bcache/closure.h @@ -251,6 +251,12 @@ static inline void set_closure_fn(struct closure *cl, closure_fn *fn, static inline void closure_queue(struct closure *cl) { struct workqueue_struct *wq = cl->wq; + /** + * Changes made to closure, work_struct, or a couple of other structs + * may cause work.func not pointing to the right location. + */ + BUILD_BUG_ON(offsetof(struct closure, fn) + != offsetof(struct work_struct, func)); if (wq) { INIT_WORK(&cl->work, cl->work.func); BUG_ON(!queue_work(wq, &cl->work)); -- cgit v1.2.1 From 9ce762e85bc95fd7aa1fb5f8cfc38ce5a228fc95 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Mon, 16 Oct 2017 10:34:47 -0700 Subject: bcache: writeback rate clamping: make 32 bit safe Sorry this got through to linux-block, was detected by the kbuilds test robot. NSEC_PER_SEC is a long constant; 2.5 * 10^9 doesn't fit in a signed long constant. Fixes: e41166c5c44e ("bcache: writeback rate shouldn't artifically clamp") Reviewed-by: Coly Li Signed-off-by: Michael Lyle Signed-off-by: Jens Axboe --- drivers/md/bcache/util.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index 4dbe37e82877..e548b8b51322 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -238,8 +238,8 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done) * don't let us sleep more than 2.5 seconds (so we can notice/respond * if the control system tells us to speed up!). */ - if (time_before64(now + NSEC_PER_SEC * 5 / 2, d->next)) - d->next = now + NSEC_PER_SEC * 5 / 2; + if (time_before64(now + NSEC_PER_SEC * 5LLU / 2LLU, d->next)) + d->next = now + NSEC_PER_SEC * 5LLU / 2LLU; if (time_after64(now - NSEC_PER_SEC * 2, d->next)) d->next = now - NSEC_PER_SEC * 2; -- cgit v1.2.1 From 30c516d750396c5f3ec9cb04c9e025c25e91495e Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 17 Oct 2017 12:11:46 +0000 Subject: nullb: fix error return code in null_init() Fix to return error code -ENOMEM from the null_alloc_dev() error handling case instead of 0, as done elsewhere in this function. Fixes: 2984c8684f96 ("nullb: factor disk parameters") Signed-off-by: Wei Yongjun Signed-off-by: Jens Axboe --- drivers/block/null_blk.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index bf2c8ca3242a..50c83c4b2ea0 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -1991,8 +1991,10 @@ static int __init null_init(void) for (i = 0; i < nr_devices; i++) { dev = null_alloc_dev(); - if (!dev) + if (!dev) { + ret = -ENOMEM; goto err_dev; + } ret = null_add_dev(dev); if (ret) { null_free_dev(dev); -- cgit v1.2.1 From 31b8446079757575e576b0516f0e4c0fcdfbd3dd Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:53:07 +0300 Subject: nvme: introduce nvme_reinit_tagset Move blk_mq_reinit_tagset from blk-mq to nvme core as the only user of it. Current transports that use it (rdma, fc) simply implement .reinit_request op. This patch does not change any functionality. Reviewed-by: Jens Axboe Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 10 ++++++++++ drivers/nvme/host/fc.c | 3 ++- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/rdma.c | 7 +++---- 4 files changed, 17 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 573cc3b59bfa..c95155696741 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2934,6 +2934,16 @@ void nvme_start_queues(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_start_queues); +int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set) +{ + if (!ctrl->ops->reinit_request) + return 0; + + return blk_mq_tagset_iter(set, set->driver_data, + ctrl->ops->reinit_request); +} +EXPORT_SYMBOL_GPL(nvme_reinit_tagset); + int __init nvme_core_init(void) { int result; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b8e0822127c1..372a0685b12a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2393,7 +2393,7 @@ nvme_fc_reinit_io_queues(struct nvme_fc_ctrl *ctrl) nvme_fc_init_io_queues(ctrl); - ret = blk_mq_reinit_tagset(&ctrl->tag_set, nvme_fc_reinit_request); + ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); if (ret) goto out_free_io_queues; @@ -2753,6 +2753,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .submit_async_event = nvme_fc_submit_async_event, .delete_ctrl = nvme_fc_del_nvme_ctrl, .get_address = nvmf_get_address, + .reinit_request = nvme_fc_reinit_request, }; static void diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index df787f39f4c1..cb9d93048f3d 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -237,6 +237,7 @@ struct nvme_ctrl_ops { void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); int (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); + int (*reinit_request)(void *data, struct request *rq); }; static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) @@ -311,6 +312,7 @@ void nvme_unfreeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze(struct nvme_ctrl *ctrl); void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout); void nvme_start_freeze(struct nvme_ctrl *ctrl); +int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 92a03ff5fb4d..039f38cb6987 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -774,8 +774,7 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, goto out_free_tagset; } } else { - error = blk_mq_reinit_tagset(&ctrl->admin_tag_set, - nvme_rdma_reinit_request); + error = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); if (error) goto out_free_queue; } @@ -855,8 +854,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_free_tag_set; } } else { - ret = blk_mq_reinit_tagset(&ctrl->tag_set, - nvme_rdma_reinit_request); + ret = nvme_reinit_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); if (ret) goto out_free_io_queues; @@ -1845,6 +1843,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .submit_async_event = nvme_rdma_submit_async_event, .delete_ctrl = nvme_rdma_del_ctrl, .get_address = nvmf_get_address, + .reinit_request = nvme_rdma_reinit_request, }; static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, -- cgit v1.2.1 From 60070c78ef7b624680ffdde30bca55e515504585 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:06 +0300 Subject: nvme-rdma: pass tagset to directly nvme_rdma_free_tagset Instead of flagging admin/io. Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 039f38cb6987..93c8578a2ddc 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -670,11 +670,10 @@ out_free_queues: return ret; } -static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, bool admin) +static void nvme_rdma_free_tagset(struct nvme_ctrl *nctrl, + struct blk_mq_tag_set *set) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - struct blk_mq_tag_set *set = admin ? - &ctrl->admin_tag_set : &ctrl->tag_set; blk_mq_free_tag_set(set); nvme_rdma_dev_put(ctrl->device); @@ -744,7 +743,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, nvme_rdma_stop_queue(&ctrl->queues[0]); if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); - nvme_rdma_free_tagset(&ctrl->ctrl, true); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); } nvme_rdma_free_queue(&ctrl->queues[0]); } @@ -818,7 +817,7 @@ out_cleanup_queue: blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_tagset: if (new) - nvme_rdma_free_tagset(&ctrl->ctrl, true); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.admin_tagset); out_free_queue: nvme_rdma_free_queue(&ctrl->queues[0]); return error; @@ -830,7 +829,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, nvme_rdma_stop_io_queues(ctrl); if (remove) { blk_cleanup_queue(ctrl->ctrl.connect_q); - nvme_rdma_free_tagset(&ctrl->ctrl, false); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); } nvme_rdma_free_io_queues(ctrl); } @@ -873,7 +872,7 @@ out_cleanup_connect_q: blk_cleanup_queue(ctrl->ctrl.connect_q); out_free_tag_set: if (new) - nvme_rdma_free_tagset(&ctrl->ctrl, false); + nvme_rdma_free_tagset(&ctrl->ctrl, ctrl->ctrl.tagset); out_free_io_queues: nvme_rdma_free_io_queues(ctrl); return ret; -- cgit v1.2.1 From d8bfceebc41762f7dde960996a24180f9a67cd51 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:07 +0300 Subject: nvme-rdma: fix wrong logging message Not necessarily address resolution failed. Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 93c8578a2ddc..02b3388af308 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -544,7 +544,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, ret = nvme_rdma_wait_for_cm(queue); if (ret) { dev_info(ctrl->ctrl.device, - "rdma_resolve_addr wait failed (%d).\n", ret); + "rdma connection establishment failed (%d)\n", ret); goto out_destroy_cm_id; } -- cgit v1.2.1 From 0c5b43b9c1fafc5af37915dde6670aaf1224e8cc Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:08 +0300 Subject: nvme-rdma: move assignment to declaration No need for the extra line for trivial assignments. Signed-off-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 02b3388af308..221ace23aaa5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -434,11 +434,9 @@ out_err: static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue) { - struct nvme_rdma_device *dev; - struct ib_device *ibdev; + struct nvme_rdma_device *dev = queue->device; + struct ib_device *ibdev = dev->dev; - dev = queue->device; - ibdev = dev->dev; rdma_destroy_qp(queue->cm_id); ib_free_cq(queue->ib_cq); -- cgit v1.2.1 From 0fc176dfdafc78e66bd74f54e487ca1fc59d01bf Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:09 +0300 Subject: nvme-rdma: Check that reinit_request got a proper mr Warn if req->mr is NULL as it should never happen. Reviewed-by: Johannes Thumshirn Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 221ace23aaa5..7e61e6447d2c 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -274,6 +274,9 @@ static int nvme_rdma_reinit_request(void *data, struct request *rq) struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); int ret = 0; + if (WARN_ON_ONCE(!req->mr)) + return 0; + ib_dereg_mr(req->mr); req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG, -- cgit v1.2.1 From 5e1fe61d4170b1f498e20de92c7ce4cd5e40c3c5 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:11 +0300 Subject: nvme-rdma: teardown admin/io queues once on error recovery Relying on the queue state while tearing down on every reconnect attempt is not a good design. We should do it once in err_work and simply try to establish the queues for each reconnect attempt. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 7e61e6447d2c..95837c5317b4 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -925,10 +925,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) ++ctrl->ctrl.nr_reconnects; - if (ctrl->ctrl.queue_count > 1) - nvme_rdma_destroy_io_queues(ctrl, false); - - nvme_rdma_destroy_admin_queue(ctrl, false); ret = nvme_rdma_configure_admin_queue(ctrl, false); if (ret) goto requeue; @@ -936,7 +932,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { ret = nvme_rdma_configure_io_queues(ctrl, false); if (ret) - goto requeue; + goto destroy_admin; } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -946,14 +942,17 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work) return; } - ctrl->ctrl.nr_reconnects = 0; - nvme_start_ctrl(&ctrl->ctrl); - dev_info(ctrl->ctrl.device, "Successfully reconnected\n"); + dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n", + ctrl->ctrl.nr_reconnects); + + ctrl->ctrl.nr_reconnects = 0; return; +destroy_admin: + nvme_rdma_destroy_admin_queue(ctrl, false); requeue: dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n", ctrl->ctrl.nr_reconnects); @@ -969,17 +968,15 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); - nvme_rdma_stop_io_queues(ctrl); - } - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); - nvme_rdma_stop_queue(&ctrl->queues[0]); - - /* We must take care of fastfail/requeue all our inflight requests */ - if (ctrl->ctrl.queue_count > 1) blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_io_queues(ctrl, false); + } + + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request, &ctrl->ctrl); + nvme_rdma_destroy_admin_queue(ctrl, false); /* * queues are not a live anymore, so restart the queues to fail fast -- cgit v1.2.1 From 60a518863368c7e124cd8411c2481d0938f53e08 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:10 +0300 Subject: nvme-rdma: Don't local invalidate if the queue is not live No chance for the local invalidate to succeed if the queue-pair is in error state. Most likely the target will do a remote invalidation of our mr so not a big loss on the test_bit. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 95837c5317b4..a58dd0c77a29 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1052,7 +1052,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, if (!blk_rq_bytes(rq)) return; - if (req->mr->need_inval) { + if (req->mr->need_inval && test_bit(NVME_RDMA_Q_LIVE, &req->queue->flags)) { res = nvme_rdma_inv_rkey(queue, req); if (unlikely(res < 0)) { dev_err(ctrl->ctrl.device, -- cgit v1.2.1 From 5013e98b5e8db50144e8f1ca5a96aed95d4d48a0 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 15:29:12 +0300 Subject: nvme-rdma: change queue flag semantics DELETING -> ALLOCATED Instead of marking we are deleting, mark we are allocated and check that instead. This makes the logic symmetrical to connected mark check. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a58dd0c77a29..ff67375982fe 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -79,8 +79,8 @@ struct nvme_rdma_request { }; enum nvme_rdma_queue_flags { - NVME_RDMA_Q_LIVE = 0, - NVME_RDMA_Q_DELETING = 1, + NVME_RDMA_Q_ALLOCATED = 0, + NVME_RDMA_Q_LIVE = 1, }; struct nvme_rdma_queue { @@ -549,7 +549,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl, goto out_destroy_cm_id; } - clear_bit(NVME_RDMA_Q_DELETING, &queue->flags); + set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags); return 0; @@ -569,7 +569,7 @@ static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue) static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue) { - if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags)) + if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags)) return; nvme_rdma_destroy_queue_ib(queue); -- cgit v1.2.1 From 0ad0bfa29822904b9186a1c04dbfb3014bc5dc8a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 11 Oct 2017 12:49:51 +0300 Subject: nvme-rdma: stop controller reset if the controller is deleting If the controller is deleting (in case the user decided to delete it), we have no point to continue reset sequence. Signed-off-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ff67375982fe..34d22ff42afb 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1818,7 +1818,11 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); + if (!changed) { + /* state change failure is ok if we're in DELETING state */ + WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING); + return; + } nvme_start_ctrl(&ctrl->ctrl); -- cgit v1.2.1 From 16772ae6d9221b065607335a9b279375d8d3e2fd Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Wed, 18 Oct 2017 22:56:09 +0900 Subject: nvme-pci: fix typos in comments fixed comment typos in adapter_alloc_cq() and adapter_alloc_sq(). 'the the' duplications are replaced with 'that the'. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index cb73bc8cad3b..bafdc2ab5be3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -930,7 +930,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; /* - * Note: we (ab)use the fact the the prp fields survive if no data + * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request. */ memset(&c, 0, sizeof(c)); @@ -951,7 +951,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, int flags = NVME_QUEUE_PHYS_CONTIG; /* - * Note: we (ab)use the fact the the prp fields survive if no data + * Note: we (ab)use the fact that the prp fields survive if no data * is attached to the request. */ memset(&c, 0, sizeof(c)); -- cgit v1.2.1 From 94f29d4f7808e3a495fda8c5d337fc4ac4037ce7 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 18 Oct 2017 12:38:24 +0000 Subject: nvme-rdma: Add BLK_MQ_F_NO_SCHED flag to admin tag set Since commit b86dd81 "block: get rid of blk-mq default scheduler choice Kconfig entries", when setting nr_hw_queues to 1 the admin tag set uses mq-deadline scheduler. This flag is useful for admin queues that aren't used for normal IO. Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 34d22ff42afb..a5577e06a620 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -699,6 +699,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set->driver_data = ctrl; set->nr_hw_queues = 1; set->timeout = ADMIN_TIMEOUT; + set->flags = BLK_MQ_F_NO_SCHED; } else { set = &ctrl->tag_set; memset(set, 0, sizeof(*set)); -- cgit v1.2.1 From 5a22e2bf44fb311e5a6bfec6ed5680199d6e162a Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 18 Oct 2017 12:38:25 +0000 Subject: nvme-fc: Add BLK_MQ_F_NO_SCHED flag to admin tag set Since commit b86dd81 "block: get rid of blk-mq default scheduler choice Kconfig entries", when setting nr_hw_queues to 1 the admin tag set uses mq-deadline scheduler. This flag is useful for admin queues that aren't used for normal IO. Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 372a0685b12a..0d0e898912ad 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2853,6 +2853,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ret = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); if (ret) -- cgit v1.2.1 From 86f36b9c6c6cda28c86ead09323f55018e5e64df Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Wed, 18 Oct 2017 12:38:26 +0000 Subject: nvme-loop: Add BLK_MQ_F_NO_SCHED flag to admin tag set This flag is useful for admin queues that aren't used for normal IO. Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/loop.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 92628c432926..c56354e1e4c6 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -365,6 +365,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) ctrl->admin_tag_set.driver_data = ctrl; ctrl->admin_tag_set.nr_hw_queues = 1; ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + ctrl->admin_tag_set.flags = BLK_MQ_F_NO_SCHED; ctrl->queues[0].ctrl = ctrl; error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); -- cgit v1.2.1 From ba2dec35e431e62e11116b8036dc372493169b39 Mon Sep 17 00:00:00 2001 From: Roy Shterman Date: Wed, 18 Oct 2017 13:46:07 +0300 Subject: nvmet: Change max_nsid in subsystem due to ns_disable if needed In case we disable namespaces which has the nsid like subsystem max_nsid we need to search for the next largest nsid in this subsystem. If the subsystem don't has more namespaces we set it to 0, else we take nsid from the last namespace in namespaces list because the list is sorted while inserting. Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Roy Shterman Reviewed-by: Johannes Thumshirn [hch: slight refactor] Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 1b208beeef50..da088293eafc 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -57,6 +57,17 @@ u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) return 0; } +static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys) +{ + struct nvmet_ns *ns; + + if (list_empty(&subsys->namespaces)) + return 0; + + ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link); + return ns->nsid; +} + static u32 nvmet_async_event_result(struct nvmet_async_event *aen) { return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); @@ -334,6 +345,8 @@ void nvmet_ns_disable(struct nvmet_ns *ns) ns->enabled = false; list_del_rcu(&ns->dev_link); + if (ns->nsid == subsys->max_nsid) + subsys->max_nsid = nvmet_max_nsid(subsys); mutex_unlock(&subsys->lock); /* -- cgit v1.2.1 From 9843f685ae365d0c628e6a0d929772bca7274311 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:10:01 +0200 Subject: nvme: use ida_simple_{get,remove} for the controller instance Switch to the ida_simple_* helpers instead of opencoding them. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 40 +++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c95155696741..7fae42d595d5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -74,6 +74,8 @@ EXPORT_SYMBOL_GPL(nvme_wq); static LIST_HEAD(nvme_ctrl_list); static DEFINE_SPINLOCK(dev_list_lock); +static DEFINE_IDA(nvme_instance_ida); + static struct class *nvme_class; static __le32 nvme_get_log_dw10(u8 lid, size_t size) @@ -2696,35 +2698,6 @@ void nvme_queue_async_events(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_queue_async_events); -static DEFINE_IDA(nvme_instance_ida); - -static int nvme_set_instance(struct nvme_ctrl *ctrl) -{ - int instance, error; - - do { - if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) - return -ENODEV; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_instance_ida, &instance); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - return -ENODEV; - - ctrl->instance = instance; - return 0; -} - -static void nvme_release_instance(struct nvme_ctrl *ctrl) -{ - spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, ctrl->instance); - spin_unlock(&dev_list_lock); -} - void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_stop_keep_alive(ctrl); @@ -2762,7 +2735,7 @@ static void nvme_free_ctrl(struct kref *kref) struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); put_device(ctrl->device); - nvme_release_instance(ctrl); + ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); ctrl->ops->free_ctrl(ctrl); @@ -2796,9 +2769,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); - ret = nvme_set_instance(ctrl); - if (ret) + ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); + if (ret < 0) goto out; + ctrl->instance = ret; ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, MKDEV(nvme_char_major, ctrl->instance), @@ -2825,7 +2799,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, return 0; out_release_instance: - nvme_release_instance(ctrl); + ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: return ret; } -- cgit v1.2.1 From a7a7cbe353a52665b8463e1822ce6ba46b0609d6 Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 16 Oct 2017 18:24:20 -0700 Subject: nvme-pci: add SGL support This adds SGL support for NVMe PCIe driver, based on an earlier patch from Rajiv Shanmugam Madeswaran . This patch refactors the original code and adds new module parameter sgl_threshold to determine whether to use SGL or PRP for IOs. The usage of SGLs is controlled by the sgl_threshold module parameter, which allows to conditionally use SGLs if average request segment size (avg_seg_size) is greater than sgl_threshold. In the original patch, the decision of using SGLs was dependent only on the IO size, with the new approach we consider not only IO size but also the number of physical segments present in the IO. We calculate avg_seg_size based on request payload bytes and number of physical segments present in the request. For e.g.:- 1. blk_rq_nr_phys_segments = 2 blk_rq_payload_bytes = 8k avg_seg_size = 4K use sgl if avg_seg_size >= sgl_threshold. 2. blk_rq_nr_phys_segments = 2 blk_rq_payload_bytes = 64k avg_seg_size = 32K use sgl if avg_seg_size >= sgl_threshold. 3. blk_rq_nr_phys_segments = 16 blk_rq_payload_bytes = 64k avg_seg_size = 4K use sgl if avg_seg_size >= sgl_threshold. Signed-off-by: Chaitanya Kulkarni Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 214 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 187 insertions(+), 27 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index bafdc2ab5be3..11e6fd9d0ba4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -45,6 +45,8 @@ */ #define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) +#define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) + static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -57,6 +59,12 @@ module_param(max_host_mem_size_mb, uint, 0444); MODULE_PARM_DESC(max_host_mem_size_mb, "Maximum Host Memory Buffer (HMB) size per controller (in MiB)"); +static unsigned int sgl_threshold = SZ_32K; +module_param(sgl_threshold, uint, 0644); +MODULE_PARM_DESC(sgl_threshold, + "Use SGLs when average request segment size is larger or equal to " + "this size. Use 0 to disable SGLs."); + static int io_queue_depth_set(const char *val, const struct kernel_param *kp); static const struct kernel_param_ops io_queue_depth_ops = { .set = io_queue_depth_set, @@ -178,6 +186,7 @@ struct nvme_queue { struct nvme_iod { struct nvme_request req; struct nvme_queue *nvmeq; + bool use_sgl; int aborted; int npages; /* In the PRP list. 0 means small pool in use */ int nents; /* Used in scatterlist */ @@ -331,17 +340,35 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev) return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } -static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev, - unsigned int size, unsigned int nseg) +/* + * Calculates the number of pages needed for the SGL segments. For example a 4k + * page can accommodate 256 SGL descriptors. + */ +static int nvme_pci_npages_sgl(unsigned int num_seg) +{ + return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE); +} + +static unsigned int nvme_pci_iod_alloc_size(struct nvme_dev *dev, + unsigned int size, unsigned int nseg, bool use_sgl) { - return sizeof(__le64 *) * nvme_npages(size, dev) + - sizeof(struct scatterlist) * nseg; + size_t alloc_size; + + if (use_sgl) + alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg); + else + alloc_size = sizeof(__le64 *) * nvme_npages(size, dev); + + return alloc_size + sizeof(struct scatterlist) * nseg; } -static unsigned int nvme_cmd_size(struct nvme_dev *dev) +static unsigned int nvme_pci_cmd_size(struct nvme_dev *dev, bool use_sgl) { - return sizeof(struct nvme_iod) + - nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES); + unsigned int alloc_size = nvme_pci_iod_alloc_size(dev, + NVME_INT_BYTES(dev), NVME_INT_PAGES, + use_sgl); + + return sizeof(struct nvme_iod) + alloc_size; } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -425,10 +452,10 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, nvmeq->sq_tail = tail; } -static __le64 **iod_list(struct request *req) +static void **nvme_pci_iod_list(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); + return (void **)(iod->sg + blk_rq_nr_phys_segments(req)); } static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) @@ -438,7 +465,10 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) unsigned int size = blk_rq_payload_bytes(rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { - iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); + size_t alloc_size = nvme_pci_iod_alloc_size(dev, size, nseg, + iod->use_sgl); + + iod->sg = kmalloc(alloc_size, GFP_ATOMIC); if (!iod->sg) return BLK_STS_RESOURCE; } else { @@ -456,18 +486,31 @@ static blk_status_t nvme_init_iod(struct request *rq, struct nvme_dev *dev) static void nvme_free_iod(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - const int last_prp = dev->ctrl.page_size / 8 - 1; + const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; + dma_addr_t dma_addr = iod->first_dma, next_dma_addr; + int i; - __le64 **list = iod_list(req); - dma_addr_t prp_dma = iod->first_dma; if (iod->npages == 0) - dma_pool_free(dev->prp_small_pool, list[0], prp_dma); + dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], + dma_addr); + for (i = 0; i < iod->npages; i++) { - __le64 *prp_list = list[i]; - dma_addr_t next_prp_dma = le64_to_cpu(prp_list[last_prp]); - dma_pool_free(dev->prp_page_pool, prp_list, prp_dma); - prp_dma = next_prp_dma; + void *addr = nvme_pci_iod_list(req)[i]; + + if (iod->use_sgl) { + struct nvme_sgl_desc *sg_list = addr; + + next_dma_addr = + le64_to_cpu((sg_list[SGES_PER_PAGE - 1]).addr); + } else { + __le64 *prp_list = addr; + + next_dma_addr = le64_to_cpu(prp_list[last_prp]); + } + + dma_pool_free(dev->prp_page_pool, addr, dma_addr); + dma_addr = next_dma_addr; } if (iod->sg != iod->inline_sg) @@ -555,7 +598,8 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents) } } -static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) +static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct dma_pool *pool; @@ -566,14 +610,16 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) u32 page_size = dev->ctrl.page_size; int offset = dma_addr & (page_size - 1); __le64 *prp_list; - __le64 **list = iod_list(req); + void **list = nvme_pci_iod_list(req); dma_addr_t prp_dma; int nprps, i; + iod->use_sgl = false; + length -= (page_size - offset); if (length <= 0) { iod->first_dma = 0; - return BLK_STS_OK; + goto done; } dma_len -= (page_size - offset); @@ -587,7 +633,7 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) if (length <= page_size) { iod->first_dma = dma_addr; - return BLK_STS_OK; + goto done; } nprps = DIV_ROUND_UP(length, page_size); @@ -634,6 +680,10 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) dma_len = sg_dma_len(sg); } +done: + cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma); + return BLK_STS_OK; bad_sgl: @@ -643,6 +693,110 @@ static blk_status_t nvme_setup_prps(struct nvme_dev *dev, struct request *req) return BLK_STS_IOERR; } +static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge, + struct scatterlist *sg) +{ + sge->addr = cpu_to_le64(sg_dma_address(sg)); + sge->length = cpu_to_le32(sg_dma_len(sg)); + sge->type = NVME_SGL_FMT_DATA_DESC << 4; +} + +static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge, + dma_addr_t dma_addr, int entries) +{ + sge->addr = cpu_to_le64(dma_addr); + if (entries < SGES_PER_PAGE) { + sge->length = cpu_to_le32(entries * sizeof(*sge)); + sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4; + } else { + sge->length = cpu_to_le32(PAGE_SIZE); + sge->type = NVME_SGL_FMT_SEG_DESC << 4; + } +} + +static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + int length = blk_rq_payload_bytes(req); + struct dma_pool *pool; + struct nvme_sgl_desc *sg_list; + struct scatterlist *sg = iod->sg; + int entries = iod->nents, i = 0; + dma_addr_t sgl_dma; + + iod->use_sgl = true; + + /* setting the transfer type as SGL */ + cmd->flags = NVME_CMD_SGL_METABUF; + + if (length == sg_dma_len(sg)) { + nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg); + return BLK_STS_OK; + } + + if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { + pool = dev->prp_small_pool; + iod->npages = 0; + } else { + pool = dev->prp_page_pool; + iod->npages = 1; + } + + sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) { + iod->npages = -1; + return BLK_STS_RESOURCE; + } + + nvme_pci_iod_list(req)[0] = sg_list; + iod->first_dma = sgl_dma; + + nvme_pci_sgl_set_seg(&cmd->dptr.sgl, sgl_dma, entries); + + do { + if (i == SGES_PER_PAGE) { + struct nvme_sgl_desc *old_sg_desc = sg_list; + struct nvme_sgl_desc *link = &old_sg_desc[i - 1]; + + sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); + if (!sg_list) + return BLK_STS_RESOURCE; + + i = 0; + nvme_pci_iod_list(req)[iod->npages++] = sg_list; + sg_list[i++] = *link; + nvme_pci_sgl_set_seg(link, sgl_dma, entries); + } + + nvme_pci_sgl_set_data(&sg_list[i++], sg); + + length -= sg_dma_len(sg); + sg = sg_next(sg); + entries--; + } while (length > 0); + + WARN_ON(entries > 0); + return BLK_STS_OK; +} + +static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + unsigned int avg_seg_size; + + avg_seg_size = DIV_ROUND_UP(blk_rq_payload_bytes(req), + blk_rq_nr_phys_segments(req)); + + if (!(dev->ctrl.sgls & ((1 << 0) | (1 << 1)))) + return false; + if (!iod->nvmeq->qid) + return false; + if (!sgl_threshold || avg_seg_size < sgl_threshold) + return false; + return true; +} + static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, struct nvme_command *cmnd) { @@ -662,7 +816,11 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, DMA_ATTR_NO_WARN)) goto out; - ret = nvme_setup_prps(dev, req); + if (nvme_pci_use_sgls(dev, req)) + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); + else + ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + if (ret != BLK_STS_OK) goto out_unmap; @@ -682,8 +840,6 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, goto out_unmap; } - cmnd->rw.dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd->rw.dptr.prp2 = cpu_to_le64(iod->first_dma); if (blk_integrity_rq(req)) cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); return BLK_STS_OK; @@ -1379,7 +1535,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); - dev->admin_tagset.cmd_size = nvme_cmd_size(dev); + dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; dev->admin_tagset.driver_data = dev; @@ -1906,7 +2062,11 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.numa_node = dev_to_node(dev->dev); dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; - dev->tagset.cmd_size = nvme_cmd_size(dev); + dev->tagset.cmd_size = nvme_pci_cmd_size(dev, false); + if ((dev->ctrl.sgls & ((1 << 0) | (1 << 1))) && sgl_threshold) { + dev->tagset.cmd_size = max(dev->tagset.cmd_size, + nvme_pci_cmd_size(dev, true)); + } dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; -- cgit v1.2.1 From 0a02e39fd1eb2ceb3dd0dc765cb2de4d09697a14 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 19 Oct 2017 16:11:38 -0700 Subject: nvme-fc: correct io termination handling The io completion handling for i/o's that are failing due to to a transport error or association termination had issues, causing io failures (DNR set so retries didn't kick in) or long stalls. Change the io completion handler for the following items: When an io has been completed due to a transport abort (based on an exchange error) or when marked as aborted as part of an association termination (FCOP_FLAGS_TERMIO), set the NVME completion status to NVME_SC_ABORTED. By default, do not set DNR on the status so that a retry can be attempted after association recreate. In cases where an io is failed (non-successful nvme status including aborted), if the controller is being deleted (blk_queue_dying) or the io was part of the ios used for association creation (ctrl state is NEW or RECONNECTING), then additionally set the DNR bit so the io will not be retried. If the failed io was part of association creation, the failure will tear down the partially completioned association and typically restart a new reconnect attempt (another create association later). Rearranged code flow to remove a largely unneeded local variable. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 0d0e898912ad..e1a7e5cdfcf4 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1387,7 +1387,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) struct nvme_command *sqe = &op->cmd_iu.sqe; __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); union nvme_result result; - bool complete_rq, terminate_assoc = true; + bool terminate_assoc = true; /* * WARNING: @@ -1429,8 +1429,9 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, sizeof(op->rsp_iu), DMA_FROM_DEVICE); - if (atomic_read(&op->state) == FCPOP_STATE_ABORTED) - status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); + if (atomic_read(&op->state) == FCPOP_STATE_ABORTED || + op->flags & FCOP_FLAGS_TERMIO) + status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); else if (freq->status) status = cpu_to_le16(NVME_SC_INTERNAL << 1); @@ -1494,23 +1495,27 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) done: if (op->flags & FCOP_FLAGS_AEN) { nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); - complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); + __nvme_fc_fcpop_chk_teardowns(ctrl, op); atomic_set(&op->state, FCPOP_STATE_IDLE); op->flags = FCOP_FLAGS_AEN; /* clear other flags */ nvme_fc_ctrl_put(ctrl); goto check_error; } - complete_rq = __nvme_fc_fcpop_chk_teardowns(ctrl, op); - if (!complete_rq) { - if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { - status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - if (blk_queue_dying(rq->q)) - status |= cpu_to_le16(NVME_SC_DNR << 1); - } - nvme_end_request(rq, status, result); - } else + /* + * Force failures of commands if we're killing the controller + * or have an error on a command used to create an new association + */ + if (status && + (blk_queue_dying(rq->q) || + ctrl->ctrl.state == NVME_CTRL_NEW || + ctrl->ctrl.state == NVME_CTRL_RECONNECTING)) + status |= cpu_to_le16(NVME_SC_DNR << 1); + + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op)) __nvme_fc_final_op_cleanup(rq); + else + nvme_end_request(rq, status, result); check_error: if (terminate_assoc) -- cgit v1.2.1 From 134aedc9c157d49069e9a98636b0a917678586ee Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 19 Oct 2017 16:11:39 -0700 Subject: nvme-fc: correct io timeout behavior The transport io timeout behavior wasn't quite correct. It ignored that the io error handler is supposed to be synchronous so it possibly allowed the blk request to be restarted while the io associated was still aborting. Timeouts on reserved commands, those used for association create, were never timing out thus they hung out forever. To correct: If an io is times out while a remoteport is not connected, just restart the io timer. The lack of connectivity will simultaneously be resetting the controller, so the reset path will abort and terminate the io. If an io is times out while it was marked for transport abort, just reset the io timer. The abort process is underway and will complete the io. Otherwise, if an io times out, abort the io. If the abort was unsuccessful (unlikely) give up and return not handled. If the abort was successful, as the abort process is underway it will terminate the io, so rather than synchronously waiting, just restart the io timer. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e1a7e5cdfcf4..c6c903f1b172 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1903,13 +1903,14 @@ nvme_fc_timeout(struct request *rq, bool reserved) struct nvme_fc_ctrl *ctrl = op->ctrl; int ret; - if (reserved) + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE || + atomic_read(&op->state) == FCPOP_STATE_ABORTED) return BLK_EH_RESET_TIMER; ret = __nvme_fc_abort_op(ctrl, op); if (ret) - /* io wasn't active to abort consider it done */ - return BLK_EH_HANDLED; + /* io wasn't active to abort */ + return BLK_EH_NOT_HANDLED; /* * we can't individually ABTS an io without affecting the queue, @@ -1920,7 +1921,12 @@ nvme_fc_timeout(struct request *rq, bool reserved) */ nvme_fc_error_recovery(ctrl, "io timeout error"); - return BLK_EH_HANDLED; + /* + * the io abort has been initiated. Have the reset timer + * restarted and the abort completion will complete the io + * shortly. Avoids a synchronous wait while the abort finishes. + */ + return BLK_EH_RESET_TIMER; } static int -- cgit v1.2.1 From f87c89ad93c95857c4d4b19ca580e27994254b7f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 23 Oct 2017 12:59:27 +0300 Subject: nvme-rdma: align nvme_rdma_device structure Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a5577e06a620..ff4635d317b5 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -50,8 +50,8 @@ (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) struct nvme_rdma_device { - struct ib_device *dev; - struct ib_pd *pd; + struct ib_device *dev; + struct ib_pd *pd; struct kref ref; struct list_head entry; }; -- cgit v1.2.1 From e62a538da2e7401bad5293d210ce00583ac12f39 Mon Sep 17 00:00:00 2001 From: Nitzan Carmi Date: Sun, 22 Oct 2017 09:37:04 +0000 Subject: nvme-rdma: Add debug message when reaches timeout Signed-off-by: Nitzan Carmi Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ff4635d317b5..4552744eff45 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1578,6 +1578,10 @@ nvme_rdma_timeout(struct request *rq, bool reserved) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + dev_warn(req->queue->ctrl->ctrl.device, + "I/O %d QID %d timeout, reset controller\n", + rq->tag, nvme_rdma_queue_idx(req->queue)); + /* queue error recovery */ nvme_rdma_error_recovery(req->queue->ctrl); -- cgit v1.2.1 From 75bc5f06617fe1bc9a79ba9e3baccdcae3743404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Tue, 24 Oct 2017 15:56:13 +0200 Subject: lightnvm: pblk: remove leftover testing function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A previous patch inadvertently left an unused test function in the header, kill it. Fixes: 8bd400204bd5 ("lightnvm: pblk: cleanup unused and static functions") Signed-off-by: Javier González Signed-off-by: Jens Axboe --- drivers/lightnvm/pblk.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'drivers') diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h index 6b64288de6f7..90961033a79f 100644 --- a/drivers/lightnvm/pblk.h +++ b/drivers/lightnvm/pblk.h @@ -860,11 +860,6 @@ int pblk_rl_is_limit(struct pblk_rl *rl); int pblk_sysfs_init(struct gendisk *tdisk); void pblk_sysfs_exit(struct gendisk *tdisk); -static inline void test(size_t a) -{ - a += 1; -} - static inline void *pblk_malloc(size_t size, int type, gfp_t flags) { if (type == PBLK_KMALLOC_META) -- cgit v1.2.1 From 2dd4122854f697afc777582d18548dded03ce5dd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:20:01 +0200 Subject: nvme: use kref_get_unless_zero in nvme_find_get_ns For kref_get_unless_zero to protect against lookup vs free races we need to use it in all places where we aren't guaranteed to already hold a reference. There is no such guarantee in nvme_find_get_ns, so switch to kref_get_unless_zero in this function. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7fae42d595d5..1d931deac83b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2290,7 +2290,8 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { if (ns->ns_id == nsid) { - kref_get(&ns->kref); + if (!kref_get_unless_zero(&ns->kref)) + continue; ret = ns; break; } -- cgit v1.2.1 From c6424a90da446cff6c67be06767fc0d0be787110 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:22:00 +0200 Subject: nvme: simplify nvme_open Now that we are protected against lookup vs free races for the namespace by using kref_get_unless_zero we don't need the hack of NULLing out the disk private data during removal. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1d931deac83b..9f8ae15c9fe8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -253,12 +253,6 @@ static void nvme_free_ns(struct kref *kref) if (ns->ndev) nvme_nvm_unregister(ns); - if (ns->disk) { - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); - } - put_disk(ns->disk); ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); nvme_put_ctrl(ns->ctrl); @@ -270,29 +264,6 @@ static void nvme_put_ns(struct nvme_ns *ns) kref_put(&ns->kref, nvme_free_ns); } -static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) -{ - struct nvme_ns *ns; - - spin_lock(&dev_list_lock); - ns = disk->private_data; - if (ns) { - if (!kref_get_unless_zero(&ns->kref)) - goto fail; - if (!try_module_get(ns->ctrl->ops->module)) - goto fail_put_ns; - } - spin_unlock(&dev_list_lock); - - return ns; - -fail_put_ns: - kref_put(&ns->kref, nvme_free_ns); -fail: - spin_unlock(&dev_list_lock); - return NULL; -} - struct request *nvme_alloc_request(struct request_queue *q, struct nvme_command *cmd, unsigned int flags, int qid) { @@ -1056,7 +1027,16 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, static int nvme_open(struct block_device *bdev, fmode_t mode) { - return nvme_get_ns_from_disk(bdev->bd_disk) ? 0 : -ENXIO; + struct nvme_ns *ns = bdev->bd_disk->private_data; + + if (!kref_get_unless_zero(&ns->kref)) + return -ENXIO; + if (!try_module_get(ns->ctrl->ops->module)) { + kref_put(&ns->kref, nvme_free_ns); + return -ENXIO; + } + + return 0; } static void nvme_release(struct gendisk *disk, fmode_t mode) -- cgit v1.2.1 From d22524a4782a943bb02a9cf6885ac470210aabfc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 13:25:42 +0200 Subject: nvme: switch controller refcounting to use struct device Instead of allocating a separate struct device for the character device handle embedd it into struct nvme_ctrl and use it for the main controller refcounting. This removes double refcounting and gets us an automatic reference for the character device operations. We keep ctrl->device as a pointer for now to avoid chaning printks all over, but in the future we could look into message printing helpers that take a controller structure similar to what other subsystems do. Note the delete_ctrl operation always already has a reference (either through sysfs due this change, or because every open file on the /dev/nvme-fabrics node has a refernece) when it is entered now, so we don't need to do the unless_zero variant there. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke --- drivers/nvme/host/core.c | 43 ++++++++++++++++++++++--------------------- drivers/nvme/host/fc.c | 8 ++------ drivers/nvme/host/nvme.h | 12 +++++++++++- drivers/nvme/host/pci.c | 2 +- drivers/nvme/host/rdma.c | 5 ++--- drivers/nvme/target/loop.c | 2 +- 6 files changed, 39 insertions(+), 33 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9f8ae15c9fe8..3a97daa163f6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1915,7 +1915,7 @@ static int nvme_dev_open(struct inode *inode, struct file *file) ret = -EWOULDBLOCK; break; } - if (!kref_get_unless_zero(&ctrl->kref)) + if (!kobject_get_unless_zero(&ctrl->device->kobj)) break; file->private_data = ctrl; ret = 0; @@ -2374,7 +2374,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) list_add_tail(&ns->list, &ctrl->namespaces); mutex_unlock(&ctrl->namespaces_mutex); - kref_get(&ctrl->kref); + nvme_get_ctrl(ctrl); kfree(id); @@ -2703,7 +2703,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { - device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); + device_del(ctrl->device); spin_lock(&dev_list_lock); list_del(&ctrl->node); @@ -2711,23 +2711,17 @@ void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); -static void nvme_free_ctrl(struct kref *kref) +static void nvme_free_ctrl(struct device *dev) { - struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); + struct nvme_ctrl *ctrl = + container_of(dev, struct nvme_ctrl, ctrl_device); - put_device(ctrl->device); ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); ctrl->ops->free_ctrl(ctrl); } -void nvme_put_ctrl(struct nvme_ctrl *ctrl) -{ - kref_put(&ctrl->kref, nvme_free_ctrl); -} -EXPORT_SYMBOL_GPL(nvme_put_ctrl); - /* * Initialize a NVMe controller structures. This needs to be called during * earliest initialization so that we have the initialized structured around @@ -2742,7 +2736,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, spin_lock_init(&ctrl->lock); INIT_LIST_HEAD(&ctrl->namespaces); mutex_init(&ctrl->namespaces_mutex); - kref_init(&ctrl->kref); ctrl->dev = dev; ctrl->ops = ops; ctrl->quirks = quirks; @@ -2755,15 +2748,21 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, goto out; ctrl->instance = ret; - ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, - MKDEV(nvme_char_major, ctrl->instance), - ctrl, nvme_dev_attr_groups, - "nvme%d", ctrl->instance); - if (IS_ERR(ctrl->device)) { - ret = PTR_ERR(ctrl->device); + device_initialize(&ctrl->ctrl_device); + ctrl->device = &ctrl->ctrl_device; + ctrl->device->devt = MKDEV(nvme_char_major, ctrl->instance); + ctrl->device->class = nvme_class; + ctrl->device->parent = ctrl->dev; + ctrl->device->groups = nvme_dev_attr_groups; + ctrl->device->release = nvme_free_ctrl; + dev_set_drvdata(ctrl->device, ctrl); + ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); + if (ret) goto out_release_instance; - } - get_device(ctrl->device); + ret = device_add(ctrl->device); + if (ret) + goto out_free_name; + ida_init(&ctrl->ns_ida); spin_lock(&dev_list_lock); @@ -2779,6 +2778,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, min(default_ps_max_latency_us, (unsigned long)S32_MAX)); return 0; +out_free_name: + kfree_const(dev->kobj.name); out_release_instance: ida_simple_remove(&nvme_instance_ida, ctrl->instance); out: diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index c6c903f1b172..aa9aec6923bb 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2692,14 +2692,10 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); int ret; - if (!kref_get_unless_zero(&ctrl->ctrl.kref)) - return -EBUSY; - + nvme_get_ctrl(&ctrl->ctrl); ret = __nvme_fc_del_ctrl(ctrl); - if (!ret) flush_workqueue(nvme_wq); - nvme_put_ctrl(&ctrl->ctrl); return ret; @@ -2918,7 +2914,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, return ERR_PTR(ret); } - kref_get(&ctrl->ctrl.kref); + nvme_get_ctrl(&ctrl->ctrl); dev_info(ctrl->ctrl.device, "NVME-FC{%d}: new ctrl: NQN \"%s\"\n", diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cb9d93048f3d..ae60d8342e60 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -127,12 +127,12 @@ struct nvme_ctrl { struct request_queue *admin_q; struct request_queue *connect_q; struct device *dev; - struct kref kref; int instance; struct blk_mq_tag_set *tagset; struct blk_mq_tag_set *admin_tagset; struct list_head namespaces; struct mutex namespaces_mutex; + struct device ctrl_device; struct device *device; /* char device */ struct list_head node; struct ida ns_ida; @@ -279,6 +279,16 @@ static inline void nvme_end_request(struct request *req, __le16 status, blk_mq_complete_request(req); } +static inline void nvme_get_ctrl(struct nvme_ctrl *ctrl) +{ + get_device(ctrl->device); +} + +static inline void nvme_put_ctrl(struct nvme_ctrl *ctrl) +{ + put_device(ctrl->device); +} + void nvme_complete_rq(struct request *req); void nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 11e6fd9d0ba4..7735571ffc9a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2292,7 +2292,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) { dev_warn(dev->ctrl.device, "Removing after probe failure status: %d\n", status); - kref_get(&dev->ctrl.kref); + nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); if (!schedule_work(&dev->remove_work)) nvme_put_ctrl(&dev->ctrl); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 4552744eff45..62b58f9b7d00 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1793,8 +1793,7 @@ static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) * Keep a reference until all work is flushed since * __nvme_rdma_del_ctrl can free the ctrl mem */ - if (!kref_get_unless_zero(&ctrl->ctrl.kref)) - return -EBUSY; + nvme_get_ctrl(&ctrl->ctrl); ret = __nvme_rdma_del_ctrl(ctrl); if (!ret) flush_work(&ctrl->delete_work); @@ -1955,7 +1954,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - kref_get(&ctrl->ctrl.kref); + nvme_get_ctrl(&ctrl->ctrl); mutex_lock(&nvme_rdma_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index c56354e1e4c6..f83e925fe64a 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -642,7 +642,7 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, dev_info(ctrl->ctrl.device, "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); - kref_get(&ctrl->ctrl.kref); + nvme_get_ctrl(&ctrl->ctrl); changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); WARN_ON_ONCE(!changed); -- cgit v1.2.1 From a6a5149b10ec8ab8b4a9479a8230265c1b573be0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 16:59:25 +0200 Subject: nvme: get rid of nvme_ctrl_list Use the core chrdev code to set up the link between the character device and the nvme controller. This allows us to get rid of the global list of all controllers, and also ensures that we have both a reference to the controller and the transport module before the open method of the character device is called. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke --- drivers/nvme/host/core.c | 76 ++++++++++-------------------------------------- drivers/nvme/host/nvme.h | 3 +- 2 files changed, 18 insertions(+), 61 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3a97daa163f6..a56a1e0432e7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -52,9 +52,6 @@ static u8 nvme_max_retries = 5; module_param_named(max_retries, nvme_max_retries, byte, 0644); MODULE_PARM_DESC(max_retries, "max number of retries a command may have"); -static int nvme_char_major; -module_param(nvme_char_major, int, 0); - static unsigned long default_ps_max_latency_us = 100000; module_param(default_ps_max_latency_us, ulong, 0644); MODULE_PARM_DESC(default_ps_max_latency_us, @@ -71,11 +68,8 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); struct workqueue_struct *nvme_wq; EXPORT_SYMBOL_GPL(nvme_wq); -static LIST_HEAD(nvme_ctrl_list); -static DEFINE_SPINLOCK(dev_list_lock); - static DEFINE_IDA(nvme_instance_ida); - +static dev_t nvme_chr_devt; static struct class *nvme_class; static __le32 nvme_get_log_dw10(u8 lid, size_t size) @@ -1031,20 +1025,12 @@ static int nvme_open(struct block_device *bdev, fmode_t mode) if (!kref_get_unless_zero(&ns->kref)) return -ENXIO; - if (!try_module_get(ns->ctrl->ops->module)) { - kref_put(&ns->kref, nvme_free_ns); - return -ENXIO; - } - return 0; } static void nvme_release(struct gendisk *disk, fmode_t mode) { - struct nvme_ns *ns = disk->private_data; - - module_put(ns->ctrl->ops->module); - nvme_put_ns(ns); + nvme_put_ns(disk->private_data); } static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) @@ -1902,33 +1888,12 @@ EXPORT_SYMBOL_GPL(nvme_init_identify); static int nvme_dev_open(struct inode *inode, struct file *file) { - struct nvme_ctrl *ctrl; - int instance = iminor(inode); - int ret = -ENODEV; - - spin_lock(&dev_list_lock); - list_for_each_entry(ctrl, &nvme_ctrl_list, node) { - if (ctrl->instance != instance) - continue; - - if (!ctrl->admin_q) { - ret = -EWOULDBLOCK; - break; - } - if (!kobject_get_unless_zero(&ctrl->device->kobj)) - break; - file->private_data = ctrl; - ret = 0; - break; - } - spin_unlock(&dev_list_lock); - - return ret; -} + struct nvme_ctrl *ctrl = + container_of(inode->i_cdev, struct nvme_ctrl, cdev); -static int nvme_dev_release(struct inode *inode, struct file *file) -{ - nvme_put_ctrl(file->private_data); + if (!ctrl->admin_q) + return -EWOULDBLOCK; + file->private_data = ctrl; return 0; } @@ -1992,7 +1957,6 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, static const struct file_operations nvme_dev_fops = { .owner = THIS_MODULE, .open = nvme_dev_open, - .release = nvme_dev_release, .unlocked_ioctl = nvme_dev_ioctl, .compat_ioctl = nvme_dev_ioctl, }; @@ -2703,11 +2667,7 @@ EXPORT_SYMBOL_GPL(nvme_start_ctrl); void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) { - device_del(ctrl->device); - - spin_lock(&dev_list_lock); - list_del(&ctrl->node); - spin_unlock(&dev_list_lock); + cdev_device_del(&ctrl->cdev, ctrl->device); } EXPORT_SYMBOL_GPL(nvme_uninit_ctrl); @@ -2750,7 +2710,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, device_initialize(&ctrl->ctrl_device); ctrl->device = &ctrl->ctrl_device; - ctrl->device->devt = MKDEV(nvme_char_major, ctrl->instance); + ctrl->device->devt = MKDEV(MAJOR(nvme_chr_devt), ctrl->instance); ctrl->device->class = nvme_class; ctrl->device->parent = ctrl->dev; ctrl->device->groups = nvme_dev_attr_groups; @@ -2759,16 +2719,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance); if (ret) goto out_release_instance; - ret = device_add(ctrl->device); + + cdev_init(&ctrl->cdev, &nvme_dev_fops); + ctrl->cdev.owner = ops->module; + ret = cdev_device_add(&ctrl->cdev, ctrl->device); if (ret) goto out_free_name; ida_init(&ctrl->ns_ida); - spin_lock(&dev_list_lock); - list_add_tail(&ctrl->node, &nvme_ctrl_list); - spin_unlock(&dev_list_lock); - /* * Initialize latency tolerance controls. The sysfs files won't * be visible to userspace unless the device actually supports APST. @@ -2909,12 +2868,9 @@ int __init nvme_core_init(void) if (!nvme_wq) return -ENOMEM; - result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", - &nvme_dev_fops); + result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); if (result < 0) goto destroy_wq; - else if (result > 0) - nvme_char_major = result; nvme_class = class_create(THIS_MODULE, "nvme"); if (IS_ERR(nvme_class)) { @@ -2925,7 +2881,7 @@ int __init nvme_core_init(void) return 0; unregister_chrdev: - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_wq: destroy_workqueue(nvme_wq); return result; @@ -2934,7 +2890,7 @@ destroy_wq: void nvme_core_exit(void) { class_destroy(nvme_class); - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_workqueue(nvme_wq); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ae60d8342e60..1bb2bc165e54 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -15,6 +15,7 @@ #define _NVME_H #include +#include #include #include #include @@ -134,7 +135,7 @@ struct nvme_ctrl { struct mutex namespaces_mutex; struct device ctrl_device; struct device *device; /* char device */ - struct list_head node; + struct cdev cdev; struct ida ns_ida; struct work_struct reset_work; -- cgit v1.2.1 From 999ada28713d7bcf4a2c70cbab47b08cd95f2cf8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 18 Oct 2017 17:09:31 +0200 Subject: nvme: check for a live controller in nvme_dev_open This is a much more sensible check than just the admin queue. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a56a1e0432e7..df525ab42fcd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1891,7 +1891,7 @@ static int nvme_dev_open(struct inode *inode, struct file *file) struct nvme_ctrl *ctrl = container_of(inode->i_cdev, struct nvme_ctrl, cdev); - if (!ctrl->admin_q) + if (ctrl->state != NVME_CTRL_LIVE) return -EWOULDBLOCK; file->private_data = ctrl; return 0; -- cgit v1.2.1 From 3b3387620780fc9699021c85bdce5cb45a763d41 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:06 -0700 Subject: nvme: add duplicate_connect option Add the "duplicate_connect" boolean option (presence means true). Default is false. When false, the transport should validate whether a new controller request is targeted for the same host transport addressing and target transport addressing as an existing controller. If so, the new controller request should be rejected. When true, the callee is explicitly requesting a duplicate controller connection to be made and the new request should be attempted. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 7 ++++++- drivers/nvme/host/fabrics.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 8bca36a46924..4a83137b0268 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -548,6 +548,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, { NVMF_OPT_HOST_ID, "hostid=%s" }, + { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, { NVMF_OPT_ERR, NULL } }; @@ -566,6 +567,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->nr_io_queues = num_online_cpus(); opts->reconnect_delay = NVMF_DEF_RECONNECT_DELAY; opts->kato = NVME_DEFAULT_KATO; + opts->duplicate_connect = false; options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -742,6 +744,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, goto out; } break; + case NVMF_OPT_DUP_CONNECT: + opts->duplicate_connect = true; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -823,7 +828,7 @@ EXPORT_SYMBOL_GPL(nvmf_free_options); #define NVMF_REQUIRED_OPTS (NVMF_OPT_TRANSPORT | NVMF_OPT_NQN) #define NVMF_ALLOWED_OPTS (NVMF_OPT_QUEUE_SIZE | NVMF_OPT_NR_IO_QUEUES | \ NVMF_OPT_KATO | NVMF_OPT_HOSTNQN | \ - NVMF_OPT_HOST_ID) + NVMF_OPT_HOST_ID | NVMF_OPT_DUP_CONNECT) static struct nvme_ctrl * nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index bf33663218cd..a7590cc59ba2 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -57,6 +57,7 @@ enum { NVMF_OPT_HOST_TRADDR = 1 << 10, NVMF_OPT_CTRL_LOSS_TMO = 1 << 11, NVMF_OPT_HOST_ID = 1 << 12, + NVMF_OPT_DUP_CONNECT = 1 << 13, }; /** @@ -96,6 +97,7 @@ struct nvmf_ctrl_options { unsigned int nr_io_queues; unsigned int reconnect_delay; bool discovery_nqn; + bool duplicate_connect; unsigned int kato; struct nvmf_host *host; int max_reconnects; -- cgit v1.2.1 From 991231dc48ae9bcdd363ee231f10beb771d455c1 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:07 -0700 Subject: nvme: add helper to compare options to controller Adds a helper function that compares the host and subsytem specified in a connect options list vs a controller. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a7590cc59ba2..42232e731f19 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -133,6 +133,18 @@ struct nvmf_transport_ops { struct nvmf_ctrl_options *opts); }; +static inline bool +nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + if (strcmp(opts->subsysnqn, ctrl->opts->subsysnqn) || + strcmp(opts->host->nqn, ctrl->opts->host->nqn) || + memcmp(&opts->host->id, &ctrl->opts->host->id, sizeof(uuid_t))) + return false; + + return true; +} + int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val); int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val); int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val); -- cgit v1.2.1 From 36e835f2432d8f16845307b5fb7e88d1e5af041d Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:09 -0700 Subject: nvme-rdma: add support for duplicate_connect option Adds support for the duplicate_connect option. When set to true, checks whether there's an existing controller via the same target address (traddr), target port (trsvcid), and if specified, host address (host_traddr). Fails the connection request if there is an existing controller. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 62b58f9b7d00..32b0a9ef26e6 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1851,6 +1851,83 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reinit_request = nvme_rdma_reinit_request, }; +static inline bool +__nvme_rdma_options_match(struct nvme_rdma_ctrl *ctrl, + struct nvmf_ctrl_options *opts) +{ + char *stdport = __stringify(NVME_RDMA_IP_PORT); + + + if (!nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts) || + strcmp(opts->traddr, ctrl->ctrl.opts->traddr)) + return false; + + if (opts->mask & NVMF_OPT_TRSVCID && + ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(opts->trsvcid, ctrl->ctrl.opts->trsvcid)) + return false; + } else if (opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(opts->trsvcid, stdport)) + return false; + } else if (ctrl->ctrl.opts->mask & NVMF_OPT_TRSVCID) { + if (strcmp(stdport, ctrl->ctrl.opts->trsvcid)) + return false; + } + /* else, it's a match as both have stdport. Fall to next checks */ + + /* + * checking the local address is rough. In most cases, one + * is not specified and the host port is selected by the stack. + * + * Assume no match if: + * local address is specified and address is not the same + * local address is not specified but remote is, or vice versa + * (admin using specific host_traddr when it matters). + */ + if (opts->mask & NVMF_OPT_HOST_TRADDR && + ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + if (strcmp(opts->host_traddr, ctrl->ctrl.opts->host_traddr)) + return false; + } else if (opts->mask & NVMF_OPT_HOST_TRADDR || + ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) + return false; + /* + * if neither controller had an host port specified, assume it's + * a match as everything else matched. + */ + + return true; +} + +/* + * Fails a connection request if it matches an existing controller + * (association) with the same tuple: + * + * + * if local address is not specified in the request, it will match an + * existing controller with all the other parameters the same and no + * local port address specified as well. + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. + */ +static bool +nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts) +{ + struct nvme_rdma_ctrl *ctrl; + bool found = false; + + mutex_lock(&nvme_rdma_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) { + found = __nvme_rdma_options_match(ctrl, opts); + if (found) + break; + } + mutex_unlock(&nvme_rdma_ctrl_mutex); + + return found; +} + static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { @@ -1887,6 +1964,11 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, } } + if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) { + ret = -EALREADY; + goto out_free_ctrl; + } + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops, 0 /* no quirks, we're perfect! */); if (ret) -- cgit v1.2.1 From 56d5f4f108efd4e439d2320738e2b894af0920bb Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 20 Oct 2017 16:17:08 -0700 Subject: nvme-fc: add support for duplicate_connect option Adds support for the duplicate_connect option. When set to true, checks whether there's an existing controller via the same host port and target port for the same host (hostnqn, hostid) to the same subsystem. Fails the connection request if an existing controller. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index aa9aec6923bb..d12775d12983 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2792,6 +2792,33 @@ static const struct blk_mq_ops nvme_fc_admin_mq_ops = { }; +/* + * Fails a controller request if it matches an existing controller + * (association) with the same tuple: + * + * + * The ports don't need to be compared as they are intrinsically + * already matched by the port pointers supplied. + */ +static bool +nvme_fc_existing_controller(struct nvme_fc_rport *rport, + struct nvmf_ctrl_options *opts) +{ + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + bool found = false; + + spin_lock_irqsave(&rport->lock, flags); + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + found = nvmf_ctlr_matches_baseopts(&ctrl->ctrl, opts); + if (found) + break; + } + spin_unlock_irqrestore(&rport->lock, flags); + + return found; +} + static struct nvme_ctrl * nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, struct nvme_fc_lport *lport, struct nvme_fc_rport *rport) @@ -2806,6 +2833,12 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_fail; } + if (!opts->duplicate_connect && + nvme_fc_existing_controller(rport, opts)) { + ret = -EALREADY; + goto out_fail; + } + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); if (!ctrl) { ret = -ENOMEM; -- cgit v1.2.1 From ecad0d2cb8a7997afdc95031ee3328b997aba5c4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Mon, 23 Oct 2017 15:11:36 -0700 Subject: nvme-fc: remove NVME_FC_MAX_SEGMENTS The define is an arbitrary limit to the io size on the initiator, capping the io to 1MB-4KB. Remove the define from the transport. I/O size will solely be limited by the LLDD sg limits. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index d12775d12983..b600c07803cf 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2435,7 +2435,6 @@ static int nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) { struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - u32 segs; int ret; bool changed; @@ -2486,9 +2485,8 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_disconnect_admin_queue; - segs = min_t(u32, NVME_FC_MAX_SEGMENTS, - ctrl->lport->ops->max_sgl_segments); - ctrl->ctrl.max_hw_sectors = (segs - 1) << (PAGE_SHIFT - 9); + ctrl->ctrl.max_hw_sectors = + (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); ret = nvme_init_identify(&ctrl->ctrl); if (ret) -- cgit v1.2.1 From d59b23795933678c9638fd20c942d2b4f3cd6185 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Mon, 30 Oct 2017 14:46:31 -0700 Subject: bcache: only permit to recovery read error when cache device is clean When bcache does read I/Os, for example in writeback or writethrough mode, if a read request on cache device is failed, bcache will try to recovery the request by reading from cached device. If the data on cached device is not synced with cache device, then requester will get a stale data. For critical storage system like database, providing stale data from recovery may result an application level data corruption, which is unacceptible. With this patch, for a failed read request in writeback or writethrough mode, recovery a recoverable read request only happens when cache device is clean. That is to say, all data on cached device is up to update. For other cache modes in bcache, read request will never hit cached_dev_read_error(), they don't need this patch. Please note, because cache mode can be switched arbitrarily in run time, a writethrough mode might be switched from a writeback mode. Therefore checking dc->has_data in writethrough mode still makes sense. Changelog: V4: Fix parens error pointed by Michael Lyle. v3: By response from Kent Oversteet, he thinks recovering stale data is a bug to fix, and option to permit it is unnecessary. So this version the sysfs file is removed. v2: rename sysfs entry from allow_stale_data_on_failure to allow_stale_data_on_failure, and fix the confusing commit log. v1: initial patch posted. [small change to patch comment spelling by mlyle] Signed-off-by: Coly Li Signed-off-by: Michael Lyle Reported-by: Arne Wolf Reviewed-by: Michael Lyle Cc: Kent Overstreet Cc: Nix Cc: Kai Krakow Cc: Eric Wheeler Cc: Junhui Tang Cc: stable@vger.kernel.org Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 163a17a80874..886e4b6643f1 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -705,8 +705,16 @@ static void cached_dev_read_error(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); struct bio *bio = &s->bio.bio; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); - if (s->recoverable) { + /* + * If cache device is dirty (dc->has_dirty is non-zero), then + * recovery a failed read request from cached device may get a + * stale data back. So read failure recovery is only permitted + * when cache device is clean. + */ + if (s->recoverable && + (dc && !atomic_read(&dc->has_dirty))) { /* Retry from the backing device: */ trace_bcache_read_retry(s->orig_bio); -- cgit v1.2.1 From 3b304d24a718ae779ee9c7f2014dd3b2d0893b70 Mon Sep 17 00:00:00 2001 From: Elena Reshetova Date: Mon, 30 Oct 2017 14:46:32 -0700 Subject: bcache: convert cached_dev.count from atomic_t to refcount_t atomic_t variables are currently used to implement reference counters with the following properties: - counter is initialized to 1 using atomic_set() - a resource is freed upon counter reaching zero - once counter reaches zero, its further increments aren't allowed - counter schema uses basic atomic operations (set, inc, inc_not_zero, dec_and_test, etc.) Such atomic variables should be converted to a newly provided refcount_t type and API that prevents accidental counter overflows and underflows. This is important since overflows and underflows can lead to use-after-free situation and be exploitable. The variable cached_dev.count is used as pure reference counter. Convert it to refcount_t and fix up the operations. Suggested-by: Kees Cook Reviewed-by: David Windsor Reviewed-by: Hans Liljestrand Reviewed-by: Michael Lyle Signed-off-by: Elena Reshetova Signed-off-by: Jens Axboe --- drivers/md/bcache/bcache.h | 7 ++++--- drivers/md/bcache/super.c | 6 +++--- drivers/md/bcache/writeback.h | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index d77c4829c497..363ea6256b39 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -184,6 +184,7 @@ #include #include #include +#include #include #include @@ -296,7 +297,7 @@ struct cached_dev { struct semaphore sb_write_mutex; /* Refcount on the cache set. Always nonzero when we're caching. */ - atomic_t count; + refcount_t count; struct work_struct detach; /* @@ -805,13 +806,13 @@ do { \ static inline void cached_dev_put(struct cached_dev *dc) { - if (atomic_dec_and_test(&dc->count)) + if (refcount_dec_and_test(&dc->count)) schedule_work(&dc->detach); } static inline bool cached_dev_get(struct cached_dev *dc) { - if (!atomic_inc_not_zero(&dc->count)) + if (!refcount_inc_not_zero(&dc->count)) return false; /* Paired with the mb in cached_dev_attach */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 72c3b2929ef0..46134c45c6f6 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -902,7 +902,7 @@ static void cached_dev_detach_finish(struct work_struct *w) closure_init_stack(&cl); BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags)); - BUG_ON(atomic_read(&dc->count)); + BUG_ON(refcount_read(&dc->count)); mutex_lock(&bch_register_lock); @@ -1029,7 +1029,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) * dc->c must be set before dc->count != 0 - paired with the mb in * cached_dev_get() */ - atomic_set(&dc->count, 1); + refcount_set(&dc->count, 1); /* Block writeback thread, but spawn it */ down_write(&dc->writeback_lock); @@ -1041,7 +1041,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); - atomic_inc(&dc->count); + refcount_inc(&dc->count); bch_writeback_queue(dc); } diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 34bcf49d737b..7d25bff37a9b 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -91,7 +91,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) { if (!atomic_read(&dc->has_dirty) && !atomic_xchg(&dc->has_dirty, 1)) { - atomic_inc(&dc->count); + refcount_inc(&dc->count); if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); -- cgit v1.2.1 From d44c2f9e7cc0041f0cd88df1fe7a1fceb713ab14 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Mon, 30 Oct 2017 14:46:33 -0700 Subject: bcache: update bucket_in_use in real time bucket_in_use is updated in gc thread which triggered by invalidating or writing sectors_to_gc dirty data, It's a long interval. Therefore, when we use it to compare with the threshold, it is often not timely, which leads to inaccurate judgment and often results in bucket depletion. We have send a patch before, by the means of updating bucket_in_use periodically In gc thread, which Coly thought that would lead high latency, In this patch, we add avail_nbuckets to record the count of available buckets, and we calculate bucket_in_use when alloc or free bucket in real time. [edited by ML: eliminated some whitespace errors] Signed-off-by: Tang Junhui Signed-off-by: Michael Lyle Reviewed-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/alloc.c | 10 ++++++++++ drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/btree.c | 17 ++++++++++------- drivers/md/bcache/btree.h | 2 +- 4 files changed, 22 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 4c40870e99f5..8c5a626343d4 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -442,6 +442,11 @@ out: b->prio = INITIAL_PRIO; } + if (ca->set->avail_nbuckets > 0) { + ca->set->avail_nbuckets--; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } + return r; } @@ -449,6 +454,11 @@ void __bch_bucket_free(struct cache *ca, struct bucket *b) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); + + if (ca->set->avail_nbuckets < ca->set->nbuckets) { + ca->set->avail_nbuckets++; + bch_update_bucket_in_use(ca->set, &ca->set->gc_stats); + } } void bch_bucket_free(struct cache_set *c, struct bkey *k) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 363ea6256b39..e274082330dc 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -581,6 +581,7 @@ struct cache_set { uint8_t need_gc; struct gc_stat gc_stats; size_t nbuckets; + size_t avail_nbuckets; struct task_struct *gc_thread; /* Where in the btree gc currently is */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 866dcf78ff8e..d8865e6ead37 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1240,6 +1240,11 @@ void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) __bch_btree_mark_key(c, level, k); } +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) +{ + stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; +} + static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; @@ -1651,9 +1656,8 @@ static void btree_gc_start(struct cache_set *c) mutex_unlock(&c->bucket_lock); } -static size_t bch_btree_gc_finish(struct cache_set *c) +static void bch_btree_gc_finish(struct cache_set *c) { - size_t available = 0; struct bucket *b; struct cache *ca; unsigned i; @@ -1690,6 +1694,7 @@ static size_t bch_btree_gc_finish(struct cache_set *c) } rcu_read_unlock(); + c->avail_nbuckets = 0; for_each_cache(ca, c, i) { uint64_t *i; @@ -1711,18 +1716,16 @@ static size_t bch_btree_gc_finish(struct cache_set *c) BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) - available++; + c->avail_nbuckets++; } } mutex_unlock(&c->bucket_lock); - return available; } static void bch_btree_gc(struct cache_set *c) { int ret; - unsigned long available; struct gc_stat stats; struct closure writes; struct btree_op op; @@ -1745,14 +1748,14 @@ static void bch_btree_gc(struct cache_set *c) pr_warn("gc failed!"); } while (ret); - available = bch_btree_gc_finish(c); + bch_btree_gc_finish(c); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); stats.data <<= 9; - stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; + bch_update_bucket_in_use(c, &stats); memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); trace_bcache_gc_end(c); diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h index 73da1f5626cb..4073aca09a49 100644 --- a/drivers/md/bcache/btree.h +++ b/drivers/md/bcache/btree.h @@ -305,5 +305,5 @@ void bch_keybuf_del(struct keybuf *, struct keybuf_key *); struct keybuf_key *bch_keybuf_next(struct keybuf *); struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, struct bkey *, keybuf_pred_fn *); - +void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); #endif -- cgit v1.2.1 From c157313791a999646901b3e3c6888514ebc36d62 Mon Sep 17 00:00:00 2001 From: "tang.junhui" Date: Mon, 30 Oct 2017 14:46:34 -0700 Subject: bcache: fix wrong cache_misses statistics Currently, Cache missed IOs are identified by s->cache_miss, but actually, there are many situations that missed IOs are not assigned a value for s->cache_miss in cached_dev_cache_miss(), for example, a bypassed IO (s->iop.bypass = 1), or the cache_bio allocate failed. In these situations, it will go to out_put or out_submit, and s->cache_miss is null, which leads bch_mark_cache_accounting() to treat this IO as a hit IO. [ML: applied by 3-way merge] Signed-off-by: tang.junhui Reviewed-by: Michael Lyle Reviewed-by: Coly Li Signed-off-by: Jens Axboe --- drivers/md/bcache/request.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 886e4b6643f1..597dd1e87bea 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -470,6 +470,7 @@ struct search { unsigned recoverable:1; unsigned write:1; unsigned read_dirty_data:1; + unsigned cache_missed:1; unsigned long start_time; @@ -656,6 +657,7 @@ static inline struct search *search_alloc(struct bio *bio, s->orig_bio = bio; s->cache_miss = NULL; + s->cache_missed = 0; s->d = d; s->recoverable = 1; s->write = op_is_write(bio_op(bio)); @@ -775,7 +777,7 @@ static void cached_dev_read_done_bh(struct closure *cl) struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); bch_mark_cache_accounting(s->iop.c, s->d, - !s->cache_miss, s->iop.bypass); + !s->cache_missed, s->iop.bypass); trace_bcache_read(s->orig_bio, !s->cache_miss, s->iop.bypass); if (s->iop.status) @@ -794,6 +796,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); struct bio *miss, *cache_bio; + s->cache_missed = 1; + if (s->cache_miss || s->iop.bypass) { miss = bio_next_split(bio, sectors, GFP_NOIO, s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; -- cgit v1.2.1 From 330a4db89d39a6b43f36da16824eaa7a7509d34d Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Mon, 30 Oct 2017 14:46:35 -0700 Subject: bcache: explicitly destroy mutex while exiting mutex_destroy does nothing most of time, but it's better to call it to make the code future proof and it also has some meaning for like mutex debug. As Coly pointed out in a previous review, bcache_exit() may not be able to handle all the references properly if userspace registers cache and backing devices right before bch_debug_init runs and bch_debug_init failes later. So not exposing userspace interface until everything is ready to avoid that issue. Signed-off-by: Liang Chen Reviewed-by: Michael Lyle Reviewed-by: Coly Li Reviewed-by: Eric Wheeler Signed-off-by: Jens Axboe --- drivers/md/bcache/super.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 46134c45c6f6..b4d28928dec5 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2095,6 +2095,7 @@ static void bcache_exit(void) if (bcache_major) unregister_blkdev(bcache_major, "bcache"); unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); } static int __init bcache_init(void) @@ -2113,14 +2114,15 @@ static int __init bcache_init(void) bcache_major = register_blkdev(0, "bcache"); if (bcache_major < 0) { unregister_reboot_notifier(&reboot); + mutex_destroy(&bch_register_lock); return bcache_major; } if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || - sysfs_create_files(bcache_kobj, files) || bch_request_init() || - bch_debug_init(bcache_kobj)) + bch_debug_init(bcache_kobj) || + sysfs_create_files(bcache_kobj, files)) goto err; return 0; -- cgit v1.2.1 From aeec77629a4ac6f8c248f3a82e80d4170a881f22 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:31 +0800 Subject: scsi: allow passing in null rq to scsi_prep_state_check() In the following patch, we will implement scsi_get_budget() which need to call scsi_prep_state_check() when rq isn't dequeued yet. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9cf6a80fe297..d159bb085714 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1301,7 +1301,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) /* * If the devices is blocked we defer normal commands. */ - if (!(req->rq_flags & RQF_PREEMPT)) + if (req && !(req->rq_flags & RQF_PREEMPT)) ret = BLKPREP_DEFER; break; default: @@ -1310,7 +1310,7 @@ scsi_prep_state_check(struct scsi_device *sdev, struct request *req) * special commands. In particular any user initiated * command is not allowed. */ - if (!(req->rq_flags & RQF_PREEMPT)) + if (req && !(req->rq_flags & RQF_PREEMPT)) ret = BLKPREP_KILL; break; } -- cgit v1.2.1 From 0df21c86bdbfd17dec9ab898312af9bfb74d5d86 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 14 Oct 2017 17:22:32 +0800 Subject: scsi: implement .get_budget and .put_budget for blk-mq We need to tell blk-mq to reserve resources before queuing one request, so implement these two callbacks. Then blk-mq can avoid to dequeue request too early, and IO merging can be improved a lot. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 75 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 52 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d159bb085714..6f10afaca25b 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1946,25 +1946,32 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) blk_mq_complete_request(cmd->request); } -static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) +static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) { - struct request *req = bd->rq; - struct request_queue *q = req->q; + struct request_queue *q = hctx->queue; + struct scsi_device *sdev = q->queuedata; + struct Scsi_Host *shost = sdev->host; + + atomic_dec(&shost->host_busy); + if (scsi_target(sdev)->can_queue > 0) + atomic_dec(&scsi_target(sdev)->target_busy); + atomic_dec(&sdev->device_busy); + put_device(&sdev->sdev_gendev); +} + +static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; - struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); blk_status_t ret; - int reason; - ret = prep_to_mq(scsi_prep_state_check(sdev, req)); - if (ret != BLK_STS_OK) - goto out; + ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); + if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK) + return ret; - ret = BLK_STS_RESOURCE; if (!get_device(&sdev->sdev_gendev)) goto out; - if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; if (!scsi_target_queue_ready(shost, sdev)) @@ -1972,10 +1979,38 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (!scsi_host_queue_ready(q, shost, sdev)) goto out_dec_target_busy; + return BLK_STS_OK; + +out_dec_target_busy: + if (scsi_target(sdev)->can_queue > 0) + atomic_dec(&scsi_target(sdev)->target_busy); +out_dec_device_busy: + atomic_dec(&sdev->device_busy); +out_put_device: + put_device(&sdev->sdev_gendev); +out: + return BLK_STS_RESOURCE; +} + +static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *req = bd->rq; + struct request_queue *q = req->q; + struct scsi_device *sdev = q->queuedata; + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); + blk_status_t ret; + int reason; + + ret = prep_to_mq(scsi_prep_state_check(sdev, req)); + if (ret != BLK_STS_OK) + goto out_put_budget; + + ret = BLK_STS_RESOURCE; if (!(req->rq_flags & RQF_DONTPREP)) { ret = prep_to_mq(scsi_mq_prep_fn(req)); if (ret != BLK_STS_OK) - goto out_dec_host_busy; + goto out_put_budget; req->rq_flags |= RQF_DONTPREP; } else { blk_mq_start_request(req); @@ -1993,21 +2028,13 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (reason) { scsi_set_blocked(cmd, reason); ret = BLK_STS_RESOURCE; - goto out_dec_host_busy; + goto out_put_budget; } return BLK_STS_OK; -out_dec_host_busy: - atomic_dec(&shost->host_busy); -out_dec_target_busy: - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); -out_dec_device_busy: - atomic_dec(&sdev->device_busy); -out_put_device: - put_device(&sdev->sdev_gendev); -out: +out_put_budget: + scsi_mq_put_budget(hctx); switch (ret) { case BLK_STS_OK: break; @@ -2211,6 +2238,8 @@ struct request_queue *scsi_old_alloc_queue(struct scsi_device *sdev) } static const struct blk_mq_ops scsi_mq_ops = { + .get_budget = scsi_mq_get_budget, + .put_budget = scsi_mq_put_budget, .queue_rq = scsi_queue_rq, .complete = scsi_softirq_done, .timeout = scsi_timeout, -- cgit v1.2.1 From 2a750166a5bee7fda186c12269e37ba52b26c017 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Mon, 30 Oct 2017 09:02:19 -0700 Subject: block: Rework drivers/cdrom/Makefile Instead of referring from inside drivers/cdrom/Makefile to all the drivers that use this driver, let these drivers select the cdrom driver. This change makes the cdrom build code follow the approach that is used for most other drivers, namely refer from the higher layers to the lower layer instead of from the lower layer to the higher layers. Reviewed-by: Hannes Reinecke Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 12 ++++++++++++ drivers/block/paride/Kconfig | 1 + drivers/cdrom/Makefile | 15 ++------------- drivers/ide/Kconfig | 1 + drivers/scsi/Kconfig | 1 + 5 files changed, 17 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 4a438b8abe27..b998b3c9ea86 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -67,9 +67,20 @@ config AMIGA_Z2RAM To compile this driver as a module, choose M here: the module will be called z2ram. +config CDROM + tristate "CD-ROM driver" + help + A CD-ROM is a pre-pressed optical compact disc which contains + data. The name is an acronym which stands for "Compact Disc + Read-Only Memory". This driver implements functionality that is + shared by all CD-ROM drivers, e.g. the IDE CD-ROM driver and the + SCSI CD-ROM driver. For a list of the ioctls implemented by this + driver, see also Documentation/ioctl/cdrom.txt. + config GDROM tristate "SEGA Dreamcast GD-ROM drive" depends on SH_DREAMCAST + select CDROM select BLK_SCSI_REQUEST # only for the generic cdrom code help A standard SEGA Dreamcast comes with a modified CD ROM drive called a @@ -347,6 +358,7 @@ config BLK_DEV_RAM_DAX config CDROM_PKTCDVD tristate "Packet writing on CD/DVD media (DEPRECATED)" depends on !UML + select CDROM select BLK_SCSI_REQUEST help Note: This driver is deprecated and will be removed from the diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig index 3a15247942e4..1d5057f5080b 100644 --- a/drivers/block/paride/Kconfig +++ b/drivers/block/paride/Kconfig @@ -25,6 +25,7 @@ config PARIDE_PD config PARIDE_PCD tristate "Parallel port ATAPI CD-ROMs" depends on PARIDE + select CDROM select BLK_SCSI_REQUEST # only for the generic cdrom code ---help--- This option enables the high-level driver for ATAPI CD-ROM devices diff --git a/drivers/cdrom/Makefile b/drivers/cdrom/Makefile index 8ffde4f8ab9a..7f3f43cc2257 100644 --- a/drivers/cdrom/Makefile +++ b/drivers/cdrom/Makefile @@ -1,13 +1,2 @@ -# Makefile for the kernel cdrom device drivers. -# -# 30 Jan 1998, Michael Elizabeth Chastain, -# Rewritten to use lists instead of if-statements. - -# Each configuration option enables a list of files. - -obj-$(CONFIG_BLK_DEV_IDECD) += cdrom.o -obj-$(CONFIG_BLK_DEV_SR) += cdrom.o -obj-$(CONFIG_PARIDE_PCD) += cdrom.o -obj-$(CONFIG_CDROM_PKTCDVD) += cdrom.o - -obj-$(CONFIG_GDROM) += gdrom.o cdrom.o +obj-$(CONFIG_CDROM) += cdrom.o +obj-$(CONFIG_GDROM) += gdrom.o diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index c99a25c075bc..7b92c591e4c1 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -118,6 +118,7 @@ config BLK_DEV_DELKIN config BLK_DEV_IDECD tristate "Include IDE/ATAPI CDROM support" select IDE_ATAPI + select CDROM ---help--- If you have a CD-ROM drive using the ATAPI protocol, say Y. ATAPI is a newer protocol used by IDE CD-ROM and TAPE drives, similar to the diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index 41366339b950..cca0b03c2ead 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -131,6 +131,7 @@ config CHR_DEV_OSST config BLK_DEV_SR tristate "SCSI CDROM support" depends on SCSI + select CDROM ---help--- If you want to use a CD or DVD drive attached to your computer by SCSI, FireWire, USB or ATAPI, say Y and read the SCSI-HOWTO -- cgit v1.2.1 From 71c691fd06cc2625966620c93ce21bdcce32ed95 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 28 Sep 2017 09:56:31 -0700 Subject: nvme-fc: avoid workqueue flush stalls There's no need to wait for the full nvme_wq, which is now shared, to flush. flush only the delete_work item. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b600c07803cf..50cc17e99475 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2693,7 +2693,7 @@ nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) nvme_get_ctrl(&ctrl->ctrl); ret = __nvme_fc_del_ctrl(ctrl); if (!ret) - flush_workqueue(nvme_wq); + flush_work(&ctrl->delete_work); nvme_put_ctrl(&ctrl->ctrl); return ret; -- cgit v1.2.1 From 29c09648734b94e42802f021658a47361169403b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:28 +0200 Subject: nvme-fc: merge __nvme_fc_schedule_delete_work into __nvme_fc_del_ctrl No need to have two functions doing the same thing. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 50cc17e99475..827e9efbe556 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2663,22 +2663,14 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) nvme_put_ctrl(&ctrl->ctrl); } -static bool -__nvme_fc_schedule_delete_work(struct nvme_fc_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return true; - - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return true; - - return false; -} - static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl) { - return __nvme_fc_schedule_delete_work(ctrl) ? -EBUSY : 0; + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + if (!queue_work(nvme_wq, &ctrl->delete_work)) + return -EBUSY; + return 0; } /* @@ -2724,7 +2716,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.nr_reconnects); - WARN_ON(__nvme_fc_schedule_delete_work(ctrl)); + WARN_ON(__nvme_fc_del_ctrl(ctrl)); } } -- cgit v1.2.1 From c5017e85705bfea721732e153305d1988ff965c2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:29 +0200 Subject: nvme: move controller deletion to common code Move the ->delete_work and the associated helpers to common code instead of duplicating them in every driver. This also adds the missing reference get/put for the loop driver. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 38 ++++++++++++++++++++++++++++++++++- drivers/nvme/host/fabrics.c | 2 +- drivers/nvme/host/fc.c | 42 +++++---------------------------------- drivers/nvme/host/nvme.h | 5 ++++- drivers/nvme/host/rdma.c | 45 ++++++------------------------------------ drivers/nvme/target/loop.c | 48 +++++++++------------------------------------ 6 files changed, 62 insertions(+), 118 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index df525ab42fcd..d835ac05bbf7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -97,6 +97,41 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) return ret; } +static void nvme_delete_ctrl_work(struct work_struct *work) +{ + struct nvme_ctrl *ctrl = + container_of(work, struct nvme_ctrl, delete_work); + + ctrl->ops->delete_ctrl(ctrl); +} + +int nvme_delete_ctrl(struct nvme_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + if (!queue_work(nvme_wq, &ctrl->delete_work)) + return -EBUSY; + return 0; +} +EXPORT_SYMBOL_GPL(nvme_delete_ctrl); + +int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) +{ + int ret = 0; + + /* + * Keep a reference until the work is flushed since ->delete_ctrl + * can free the controller. + */ + nvme_get_ctrl(ctrl); + ret = nvme_delete_ctrl(ctrl); + if (!ret) + flush_work(&ctrl->delete_work); + nvme_put_ctrl(ctrl); + return ret; +} +EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); + static blk_status_t nvme_error_status(struct request *req) { switch (nvme_req(req)->status & 0x7ff) { @@ -2122,7 +2157,7 @@ static ssize_t nvme_sysfs_delete(struct device *dev, struct nvme_ctrl *ctrl = dev_get_drvdata(dev); if (device_remove_file_self(dev, attr)) - ctrl->ops->delete_ctrl(ctrl); + nvme_delete_ctrl_sync(ctrl); return count; } static DEVICE_ATTR(delete_controller, S_IWUSR, NULL, nvme_sysfs_delete); @@ -2702,6 +2737,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, INIT_WORK(&ctrl->scan_work, nvme_scan_work); INIT_WORK(&ctrl->async_event_work, nvme_async_event_work); INIT_WORK(&ctrl->fw_act_work, nvme_fw_act_work); + INIT_WORK(&ctrl->delete_work, nvme_delete_ctrl_work); ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL); if (ret < 0) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 4a83137b0268..a4967de5bb25 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -887,7 +887,7 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) "controller returned incorrect NQN: \"%s\".\n", ctrl->subnqn); up_read(&nvmf_transports_rwsem); - ctrl->ops->delete_ctrl(ctrl); + nvme_delete_ctrl_sync(ctrl); return ERR_PTR(-EINVAL); } diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 827e9efbe556..a7bdb17de29d 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -157,7 +157,6 @@ struct nvme_fc_ctrl { struct blk_mq_tag_set admin_tag_set; struct blk_mq_tag_set tag_set; - struct work_struct delete_work; struct delayed_work connect_work; struct kref ref; @@ -223,7 +222,6 @@ static struct device *fc_udev_device; /* *********************** FC-NVME Port Management ************************ */ -static int __nvme_fc_del_ctrl(struct nvme_fc_ctrl *); static void __nvme_fc_delete_hw_queue(struct nvme_fc_ctrl *, struct nvme_fc_queue *, unsigned int); @@ -662,7 +660,7 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) /* tear down all associations to the remote port */ list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) - __nvme_fc_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); spin_unlock_irqrestore(&rport->lock, flags); @@ -2636,10 +2634,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) } static void -nvme_fc_delete_ctrl_work(struct work_struct *work) +nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { - struct nvme_fc_ctrl *ctrl = - container_of(work, struct nvme_fc_ctrl, delete_work); + struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); @@ -2663,34 +2660,6 @@ nvme_fc_delete_ctrl_work(struct work_struct *work) nvme_put_ctrl(&ctrl->ctrl); } -static int -__nvme_fc_del_ctrl(struct nvme_fc_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return -EBUSY; - return 0; -} - -/* - * Request from nvme core layer to delete the controller - */ -static int -nvme_fc_del_nvme_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - int ret; - - nvme_get_ctrl(&ctrl->ctrl); - ret = __nvme_fc_del_ctrl(ctrl); - if (!ret) - flush_work(&ctrl->delete_work); - nvme_put_ctrl(&ctrl->ctrl); - - return ret; -} - static void nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) { @@ -2716,7 +2685,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.nr_reconnects); - WARN_ON(__nvme_fc_del_ctrl(ctrl)); + WARN_ON(nvme_delete_ctrl(&ctrl->ctrl)); } } @@ -2748,7 +2717,7 @@ static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { .reg_write32 = nvmf_reg_write32, .free_ctrl = nvme_fc_nvme_ctrl_freed, .submit_async_event = nvme_fc_submit_async_event, - .delete_ctrl = nvme_fc_del_nvme_ctrl, + .delete_ctrl = nvme_fc_delete_ctrl, .get_address = nvmf_get_address, .reinit_request = nvme_fc_reinit_request, }; @@ -2851,7 +2820,6 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, get_device(ctrl->dev); kref_init(&ctrl->ref); - INIT_WORK(&ctrl->delete_work, nvme_fc_delete_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work); INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work); spin_lock_init(&ctrl->lock); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 1bb2bc165e54..35d9cee515f1 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -138,6 +138,7 @@ struct nvme_ctrl { struct cdev cdev; struct ida ns_ida; struct work_struct reset_work; + struct work_struct delete_work; struct opal_dev *opal_dev; @@ -236,7 +237,7 @@ struct nvme_ctrl_ops { int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); void (*free_ctrl)(struct nvme_ctrl *ctrl); void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); - int (*delete_ctrl)(struct nvme_ctrl *ctrl); + void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); int (*reinit_request)(void *data, struct request *rq); }; @@ -339,6 +340,8 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_delete_ctrl(struct nvme_ctrl *ctrl); +int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 32b0a9ef26e6..5175b465997d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -105,7 +105,6 @@ struct nvme_rdma_ctrl { /* other member variables */ struct blk_mq_tag_set tag_set; - struct work_struct delete_work; struct work_struct err_work; struct nvme_rdma_qe async_event_sqe; @@ -913,7 +912,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_info(ctrl->ctrl.device, "Removing controller...\n"); - queue_work(nvme_wq, &ctrl->delete_work); + queue_work(nvme_wq, &ctrl->ctrl.delete_work); } } @@ -1764,41 +1763,10 @@ static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) nvme_put_ctrl(&ctrl->ctrl); } -static void nvme_rdma_del_ctrl_work(struct work_struct *work) +static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { - struct nvme_rdma_ctrl *ctrl = container_of(work, - struct nvme_rdma_ctrl, delete_work); - - nvme_stop_ctrl(&ctrl->ctrl); - nvme_rdma_remove_ctrl(ctrl); -} - -static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return -EBUSY; - - return 0; -} - -static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl); - int ret = 0; - - /* - * Keep a reference until all work is flushed since - * __nvme_rdma_del_ctrl can free the ctrl mem - */ - nvme_get_ctrl(&ctrl->ctrl); - ret = __nvme_rdma_del_ctrl(ctrl); - if (!ret) - flush_work(&ctrl->delete_work); - nvme_put_ctrl(&ctrl->ctrl); - return ret; + nvme_stop_ctrl(ctrl); + nvme_rdma_remove_ctrl(to_rdma_ctrl(ctrl)); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) @@ -1846,7 +1814,7 @@ static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { .reg_write32 = nvmf_reg_write32, .free_ctrl = nvme_rdma_free_ctrl, .submit_async_event = nvme_rdma_submit_async_event, - .delete_ctrl = nvme_rdma_del_ctrl, + .delete_ctrl = nvme_rdma_delete_ctrl, .get_address = nvmf_get_address, .reinit_request = nvme_rdma_reinit_request, }; @@ -1977,7 +1945,6 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, INIT_DELAYED_WORK(&ctrl->reconnect_work, nvme_rdma_reconnect_ctrl_work); INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work); - INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work); ctrl->ctrl.queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */ @@ -2081,7 +2048,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) dev_info(ctrl->ctrl.device, "Removing ctrl: NQN \"%s\", addr %pISp\n", ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - __nvme_rdma_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); } mutex_unlock(&nvme_rdma_ctrl_mutex); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f83e925fe64a..7f9f3fc3fb2a 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -53,7 +53,6 @@ struct nvme_loop_ctrl { struct nvme_ctrl ctrl; struct nvmet_ctrl *target_ctrl; - struct work_struct delete_work; }; static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) @@ -439,41 +438,13 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) nvme_loop_destroy_admin_queue(ctrl); } -static void nvme_loop_del_ctrl_work(struct work_struct *work) +static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl) { - struct nvme_loop_ctrl *ctrl = container_of(work, - struct nvme_loop_ctrl, delete_work); - - nvme_stop_ctrl(&ctrl->ctrl); - nvme_remove_namespaces(&ctrl->ctrl); - nvme_loop_shutdown_ctrl(ctrl); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); -} - -static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) -{ - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) - return -EBUSY; - - if (!queue_work(nvme_wq, &ctrl->delete_work)) - return -EBUSY; - - return 0; -} - -static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl) -{ - struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); - int ret; - - ret = __nvme_loop_del_ctrl(ctrl); - if (ret) - return ret; - - flush_work(&ctrl->delete_work); - - return 0; + nvme_stop_ctrl(ctrl); + nvme_remove_namespaces(ctrl); + nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl)); + nvme_uninit_ctrl(ctrl); + nvme_put_ctrl(ctrl); } static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) @@ -483,7 +454,7 @@ static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) mutex_lock(&nvme_loop_ctrl_mutex); list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { if (ctrl->ctrl.cntlid == nctrl->cntlid) - __nvme_loop_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); } mutex_unlock(&nvme_loop_ctrl_mutex); } @@ -539,7 +510,7 @@ static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { .reg_write32 = nvmf_reg_write32, .free_ctrl = nvme_loop_free_ctrl, .submit_async_event = nvme_loop_submit_async_event, - .delete_ctrl = nvme_loop_del_ctrl, + .delete_ctrl = nvme_loop_delete_ctrl_host, }; static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) @@ -601,7 +572,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, ctrl->ctrl.opts = opts; INIT_LIST_HEAD(&ctrl->list); - INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); INIT_WORK(&ctrl->ctrl.reset_work, nvme_loop_reset_ctrl_work); ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, @@ -731,7 +701,7 @@ static void __exit nvme_loop_cleanup_module(void) mutex_lock(&nvme_loop_ctrl_mutex); list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) - __nvme_loop_del_ctrl(ctrl); + nvme_delete_ctrl(&ctrl->ctrl); mutex_unlock(&nvme_loop_ctrl_mutex); flush_workqueue(nvme_wq); -- cgit v1.2.1 From e9bc25874c0bde47b65c58ccd01e339a603a7f40 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:30 +0200 Subject: nvme-rdma: remove nvme_rdma_remove_ctrl It is only used in two places, and some of the work done by it will be taken into common code soon. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5175b465997d..a3521b852ea8 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1755,18 +1755,13 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) nvme_rdma_destroy_admin_queue(ctrl, shutdown); } -static void nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl) -{ - nvme_remove_namespaces(&ctrl->ctrl); - nvme_rdma_shutdown_ctrl(ctrl, true); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); -} - static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { nvme_stop_ctrl(ctrl); - nvme_rdma_remove_ctrl(to_rdma_ctrl(ctrl)); + nvme_remove_namespaces(ctrl); + nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); + nvme_uninit_ctrl(ctrl); + nvme_put_ctrl(ctrl); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) @@ -1802,7 +1797,10 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) out_fail: dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - nvme_rdma_remove_ctrl(ctrl); + nvme_remove_namespaces(&ctrl->ctrl); + nvme_rdma_shutdown_ctrl(ctrl, true); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { -- cgit v1.2.1 From 6cd53d14aaa006b5543f06fbf5e1680ce61c6c6e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 29 Oct 2017 10:44:31 +0200 Subject: nvme: consolidate common code from ->reset_work No change in behavior except that the FC code cancels two work items a little later now. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Reviewed-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 4 ++++ drivers/nvme/host/fc.c | 13 ------------- drivers/nvme/host/rdma.c | 4 ---- drivers/nvme/target/loop.c | 4 ---- 4 files changed, 4 insertions(+), 21 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d835ac05bbf7..4fa748c9a3f6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,7 +102,11 @@ static void nvme_delete_ctrl_work(struct work_struct *work) struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, delete_work); + nvme_stop_ctrl(ctrl); + nvme_remove_namespaces(ctrl); ctrl->ops->delete_ctrl(ctrl); + nvme_uninit_ctrl(ctrl); + nvme_put_ctrl(ctrl); } int nvme_delete_ctrl(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index a7bdb17de29d..e447b532b9ee 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2640,24 +2640,11 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); - nvme_stop_ctrl(&ctrl->ctrl); - nvme_remove_namespaces(&ctrl->ctrl); /* * kill the association on the link side. this will block * waiting for io to terminate */ nvme_fc_delete_association(ctrl); - - /* - * tear down the controller - * After the last reference on the nvme ctrl is removed, - * the transport nvme_fc_nvme_ctrl_freed() callback will be - * invoked. From there, the transport will tear down it's - * logical queues and association. - */ - nvme_uninit_ctrl(&ctrl->ctrl); - - nvme_put_ctrl(&ctrl->ctrl); } static void diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index a3521b852ea8..ed6e05018a92 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1757,11 +1757,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl) { - nvme_stop_ctrl(ctrl); - nvme_remove_namespaces(ctrl); nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true); - nvme_uninit_ctrl(ctrl); - nvme_put_ctrl(ctrl); } static void nvme_rdma_reset_ctrl_work(struct work_struct *work) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 7f9f3fc3fb2a..bc95c6ed531a 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -440,11 +440,7 @@ static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) static void nvme_loop_delete_ctrl_host(struct nvme_ctrl *ctrl) { - nvme_stop_ctrl(ctrl); - nvme_remove_namespaces(ctrl); nvme_loop_shutdown_ctrl(to_loop_ctrl(ctrl)); - nvme_uninit_ctrl(ctrl); - nvme_put_ctrl(ctrl); } static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) -- cgit v1.2.1 From 12fa1304badde26c5cf495d5af48df22a31c3a52 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 29 Oct 2017 14:21:01 +0200 Subject: nvme-rdma: reuse nvme_delete_ctrl when reconnect attempts expire instead of just queueing delete work Reported-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ed6e05018a92..03644ecf68d2 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -912,7 +912,7 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) ctrl->ctrl.opts->reconnect_delay * HZ); } else { dev_info(ctrl->ctrl.device, "Removing controller...\n"); - queue_work(nvme_wq, &ctrl->ctrl.delete_work); + nvme_delete_ctrl(&ctrl->ctrl); } } -- cgit v1.2.1 From 4054637c9b4fbe9feef0cf6f2516ef00d8053560 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 29 Oct 2017 14:21:02 +0200 Subject: nvme: flush reset_work before safely continuing with delete operation Prevent racing controller reset and delete flows. reset_work must not ever self-requeue so flushing it suffices. Reported-by: Christoph Hellwig Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/fc.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4fa748c9a3f6..003314eb6341 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -102,6 +102,7 @@ static void nvme_delete_ctrl_work(struct work_struct *work) struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, delete_work); + flush_work(&ctrl->reset_work); nvme_stop_ctrl(ctrl); nvme_remove_namespaces(ctrl); ctrl->ops->delete_ctrl(ctrl); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e447b532b9ee..6a025a8d8c32 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2638,7 +2638,6 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); - cancel_work_sync(&ctrl->ctrl.reset_work); cancel_delayed_work_sync(&ctrl->connect_work); /* * kill the association on the link side. this will block -- cgit v1.2.1 From 44c6ec77e12c387aaba420b30a54b94966f0d9e8 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:14 -0700 Subject: nvme-fc: change ctlr state assignments during reset/reconnect Clean up some of the controller state checks and add the RESETTING->RECONNECTING state transition. Specifically: - the movement of the RESETTING state change and schedule of reset_work to core doesn't work wiht nvme_fc_error_recovery setting state to RECONNECTING before attempting to reset. Remove the state change as the reset request does it. - In the rare cases where an error occurs right as we're transitioning to LIVE, defer the controller start actions. - In error handling on teardown of associations while performing initial controller creation - avoid quiesce calls on the admin_q. They are unneeded. - Add the RESETTING->RECONNECTING transition in the reset handler. Signed-off-by: James Smart Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 6a025a8d8c32..e37c69f7921d 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1884,13 +1884,6 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: resetting controller\n", ctrl->cnum); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { - dev_err(ctrl->ctrl.device, - "NVME-FC{%d}: error_recovery: Couldn't change state " - "to RECONNECTING\n", ctrl->cnum); - return; - } - nvme_reset_ctrl(&ctrl->ctrl); } @@ -2528,11 +2521,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); - WARN_ON_ONCE(!changed); ctrl->ctrl.nr_reconnects = 0; - nvme_start_ctrl(&ctrl->ctrl); + if (changed) + nvme_start_ctrl(&ctrl->ctrl); return 0; /* Success */ @@ -2600,7 +2593,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) * use blk_mq_tagset_busy_itr() and the transport routine to * terminate the exchanges. */ - blk_mq_quiesce_queue(ctrl->ctrl.admin_q); + if (ctrl->ctrl.state != NVME_CTRL_NEW) + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); @@ -2649,12 +2643,8 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) static void nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) { - /* If we are resetting/deleting then do nothing */ - if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { - WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || - ctrl->ctrl.state == NVME_CTRL_LIVE); + if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) return; - } dev_info(ctrl->ctrl.device, "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", @@ -2683,9 +2673,17 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) int ret; nvme_stop_ctrl(&ctrl->ctrl); + /* will block will waiting for io to terminate */ nvme_fc_delete_association(ctrl); + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + dev_err(ctrl->ctrl.device, + "NVME-FC{%d}: error_recovery: Couldn't change state " + "to RECONNECTING\n", ctrl->cnum); + return; + } + ret = nvme_fc_create_association(ctrl); if (ret) nvme_fc_reconnect_or_delete(ctrl, ret); -- cgit v1.2.1 From ac7fe82b6fcf77e757e88005c33b8147c1b7b73f Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:15 -0700 Subject: nvme-fc: add a dev_loss_tmo field to the remoteport Add a dev_loss_tmo value, paralleling the SCSI FC transport, for device connectivity loss. The transport initializes the value in the nvme_fc_register_remoteport() call. If the value is not set, a default of 60s is set. Add a new routine to the api, nvme_fc_set_remoteport_devloss() routine, which allows the lldd to dynamically update the value on an existing remoteport. Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e37c69f7921d..25479d3031fa 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -45,6 +45,8 @@ enum nvme_fc_queue_flags { #define NVMEFC_QUEUE_DELAY 3 /* ms units */ +#define NVME_FC_DEFAULT_DEV_LOSS_TMO 60 /* seconds */ + struct nvme_fc_queue { struct nvme_fc_ctrl *ctrl; struct device *dev; @@ -585,6 +587,11 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, newrec->remoteport.port_id = pinfo->port_id; newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; newrec->remoteport.port_num = idx; + /* a registration value of dev_loss_tmo=0 results in the default */ + if (pinfo->dev_loss_tmo) + newrec->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; + else + newrec->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; spin_lock_irqsave(&nvme_fc_lock, flags); list_add_tail(&newrec->endp_list, &lport->endp_list); @@ -688,6 +695,30 @@ nvme_fc_rescan_remoteport(struct nvme_fc_remote_port *remoteport) } EXPORT_SYMBOL_GPL(nvme_fc_rescan_remoteport); +int +nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr, + u32 dev_loss_tmo) +{ + struct nvme_fc_rport *rport = remoteport_to_rport(portptr); + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + spin_lock_irqsave(&rport->lock, flags); + + if (portptr->port_state != FC_OBJSTATE_ONLINE) { + spin_unlock_irqrestore(&rport->lock, flags); + return -EINVAL; + } + + /* a dev_loss_tmo of 0 (immediate) is allowed to be set */ + rport->remoteport.dev_loss_tmo = dev_loss_tmo; + + spin_unlock_irqrestore(&rport->lock, flags); + + return 0; +} +EXPORT_SYMBOL_GPL(nvme_fc_set_remoteport_devloss); + /* *********************** FC-NVME DMA Handling **************************** */ -- cgit v1.2.1 From 96e24801056467c6483f68ef71d642fb925c3100 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:16 -0700 Subject: nvme-fc: check connectivity before initiating reconnects Check remoteport connectivity before initiating reconnects Signed-off-by: James Smart Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 25479d3031fa..b64cc1002452 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -808,7 +808,6 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, dma_unmap_sg(dev, sg, nents, dir); } - /* *********************** FC-NVME LS Handling **************************** */ static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *); @@ -2462,6 +2461,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) ++ctrl->ctrl.nr_reconnects; + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return -ENODEV; + /* * Create the admin queue */ @@ -2682,6 +2684,10 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) ctrl->cnum, status); if (nvmf_should_reconnect(&ctrl->ctrl)) { + /* Only schedule the reconnect if the remote port is online */ + if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) + return; + dev_info(ctrl->ctrl.device, "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); @@ -2715,12 +2721,15 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) return; } - ret = nvme_fc_create_association(ctrl); - if (ret) - nvme_fc_reconnect_or_delete(ctrl, ret); - else - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: controller reset complete\n", ctrl->cnum); + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { + ret = nvme_fc_create_association(ctrl); + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller reset complete\n", + ctrl->cnum); + } } static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { -- cgit v1.2.1 From 3cec7f9de448131778a1428100621fd371db75f4 Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:13 -0700 Subject: nvme: allow controller RESETTING to RECONNECTING transition Transport will typically transition from LIVE to RESETTING when initially performing a reset or recovering from an error. Adding this transition allows a transport to transition to RECONNECTING when it checks/waits for connectivity then creates new transport connections and reinits the controller. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 003314eb6341..07b9f4e1c283 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -241,6 +241,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_RECONNECTING: switch (old_state) { case NVME_CTRL_LIVE: + case NVME_CTRL_RESETTING: changed = true; /* FALLTHRU */ default: -- cgit v1.2.1 From 2b632970da4f288e6dbdc826c34fbf74f40ec94c Mon Sep 17 00:00:00 2001 From: James Smart Date: Wed, 25 Oct 2017 16:43:17 -0700 Subject: nvme-fc: add dev_loss_tmo timeout and remoteport resume support When a remoteport is unregistered (connectivity lost), the following actions are taken: - the remoteport is marked DELETED - the time when dev_loss_tmo would expire is set in the remoteport - all controllers on the remoteport are reset. After a controller resets, it will stall in a RECONNECTING state waiting for one of the following: - the controller will continue to attempt reconnect per max_retries and reconnect_delay. As no remoteport connectivity, the reconnect attempt will immediately fail. If max reconnects has not been reached, a new reconnect_delay timer will be schedule. If the current time plus another reconnect_delay exceeds when dev_loss_tmo expires on the remote port, then the reconnect_delay will be shortend to schedule no later than when dev_loss_tmo expires. If max reconnect attempts are reached (e.g. ctrl_loss_tmo reached) or dev_loss_tmo ix exceeded without connectivity, the controller is deleted. - the remoteport is re-registered prior to dev_loss_tmo expiring. The resume of the remoteport will immediately attempt to reconnect each of its suspended controllers. Signed-off-by: James Smart Reviewed-by: Hannes Reinecke [hch: updated to use nvme_delete_ctrl] Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 278 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 239 insertions(+), 39 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b64cc1002452..113c30be7276 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -138,6 +138,7 @@ struct nvme_fc_rport { struct nvme_fc_lport *lport; spinlock_t lock; struct kref ref; + unsigned long dev_loss_end; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ enum nvme_fcctrl_flags { @@ -528,6 +529,102 @@ nvme_fc_rport_get(struct nvme_fc_rport *rport) return kref_get_unless_zero(&rport->ref); } +static void +nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl) +{ + switch (ctrl->ctrl.state) { + case NVME_CTRL_NEW: + case NVME_CTRL_RECONNECTING: + /* + * As all reconnects were suppressed, schedule a + * connect. + */ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: connectivity re-established. " + "Attempting reconnect\n", ctrl->cnum); + + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0); + break; + + case NVME_CTRL_RESETTING: + /* + * Controller is already in the process of terminating the + * association. No need to do anything further. The reconnect + * step will naturally occur after the reset completes. + */ + break; + + default: + /* no action to take - let it delete */ + break; + } +} + +static struct nvme_fc_rport * +nvme_fc_attach_to_suspended_rport(struct nvme_fc_lport *lport, + struct nvme_fc_port_info *pinfo) +{ + struct nvme_fc_rport *rport; + struct nvme_fc_ctrl *ctrl; + unsigned long flags; + + spin_lock_irqsave(&nvme_fc_lock, flags); + + list_for_each_entry(rport, &lport->endp_list, endp_list) { + if (rport->remoteport.node_name != pinfo->node_name || + rport->remoteport.port_name != pinfo->port_name) + continue; + + if (!nvme_fc_rport_get(rport)) { + rport = ERR_PTR(-ENOLCK); + goto out_done; + } + + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + spin_lock_irqsave(&rport->lock, flags); + + /* has it been unregistered */ + if (rport->remoteport.port_state != FC_OBJSTATE_DELETED) { + /* means lldd called us twice */ + spin_unlock_irqrestore(&rport->lock, flags); + nvme_fc_rport_put(rport); + return ERR_PTR(-ESTALE); + } + + rport->remoteport.port_state = FC_OBJSTATE_ONLINE; + rport->dev_loss_end = 0; + + /* + * kick off a reconnect attempt on all associations to the + * remote port. A successful reconnects will resume i/o. + */ + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) + nvme_fc_resume_controller(ctrl); + + spin_unlock_irqrestore(&rport->lock, flags); + + return rport; + } + + rport = NULL; + +out_done: + spin_unlock_irqrestore(&nvme_fc_lock, flags); + + return rport; +} + +static inline void +__nvme_fc_set_dev_loss_tmo(struct nvme_fc_rport *rport, + struct nvme_fc_port_info *pinfo) +{ + if (pinfo->dev_loss_tmo) + rport->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; + else + rport->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; +} + /** * nvme_fc_register_remoteport - transport entry point called by an * LLDD to register the existence of a NVME @@ -554,22 +651,45 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, unsigned long flags; int ret, idx; + if (!nvme_fc_lport_get(lport)) { + ret = -ESHUTDOWN; + goto out_reghost_failed; + } + + /* + * look to see if there is already a remoteport that is waiting + * for a reconnect (within dev_loss_tmo) with the same WWN's. + * If so, transition to it and reconnect. + */ + newrec = nvme_fc_attach_to_suspended_rport(lport, pinfo); + + /* found an rport, but something about its state is bad */ + if (IS_ERR(newrec)) { + ret = PTR_ERR(newrec); + goto out_lport_put; + + /* found existing rport, which was resumed */ + } else if (newrec) { + nvme_fc_lport_put(lport); + __nvme_fc_set_dev_loss_tmo(newrec, pinfo); + nvme_fc_signal_discovery_scan(lport, newrec); + *portptr = &newrec->remoteport; + return 0; + } + + /* nothing found - allocate a new remoteport struct */ + newrec = kmalloc((sizeof(*newrec) + lport->ops->remote_priv_sz), GFP_KERNEL); if (!newrec) { ret = -ENOMEM; - goto out_reghost_failed; - } - - if (!nvme_fc_lport_get(lport)) { - ret = -ESHUTDOWN; - goto out_kfree_rport; + goto out_lport_put; } idx = ida_simple_get(&lport->endp_cnt, 0, 0, GFP_KERNEL); if (idx < 0) { ret = -ENOSPC; - goto out_lport_put; + goto out_kfree_rport; } INIT_LIST_HEAD(&newrec->endp_list); @@ -587,11 +707,7 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, newrec->remoteport.port_id = pinfo->port_id; newrec->remoteport.port_state = FC_OBJSTATE_ONLINE; newrec->remoteport.port_num = idx; - /* a registration value of dev_loss_tmo=0 results in the default */ - if (pinfo->dev_loss_tmo) - newrec->remoteport.dev_loss_tmo = pinfo->dev_loss_tmo; - else - newrec->remoteport.dev_loss_tmo = NVME_FC_DEFAULT_DEV_LOSS_TMO; + __nvme_fc_set_dev_loss_tmo(newrec, pinfo); spin_lock_irqsave(&nvme_fc_lock, flags); list_add_tail(&newrec->endp_list, &lport->endp_list); @@ -602,10 +718,10 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, *portptr = &newrec->remoteport; return 0; -out_lport_put: - nvme_fc_lport_put(lport); out_kfree_rport: kfree(newrec); +out_lport_put: + nvme_fc_lport_put(lport); out_reghost_failed: *portptr = NULL; return ret; @@ -636,6 +752,58 @@ restart: return 0; } +static void +nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) +{ + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller connectivity lost. Awaiting " + "Reconnect", ctrl->cnum); + + switch (ctrl->ctrl.state) { + case NVME_CTRL_NEW: + case NVME_CTRL_LIVE: + /* + * Schedule a controller reset. The reset will terminate the + * association and schedule the reconnect timer. Reconnects + * will be attempted until either the ctlr_loss_tmo + * (max_retries * connect_delay) expires or the remoteport's + * dev_loss_tmo expires. + */ + if (nvme_reset_ctrl(&ctrl->ctrl)) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: Couldn't schedule reset. " + "Deleting controller.\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } + break; + + case NVME_CTRL_RECONNECTING: + /* + * The association has already been terminated and the + * controller is attempting reconnects. No need to do anything + * futher. Reconnects will be attempted until either the + * ctlr_loss_tmo (max_retries * connect_delay) expires or the + * remoteport's dev_loss_tmo expires. + */ + break; + + case NVME_CTRL_RESETTING: + /* + * Controller is already in the process of terminating the + * association. No need to do anything further. The reconnect + * step will kick in naturally after the association is + * terminated. + */ + break; + + case NVME_CTRL_DELETING: + default: + /* no action to take - let it delete */ + break; + } +} + /** * nvme_fc_unregister_remoteport - transport entry point called by an * LLDD to deregister/remove a previously @@ -665,15 +833,31 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) } portptr->port_state = FC_OBJSTATE_DELETED; - /* tear down all associations to the remote port */ - list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) - nvme_delete_ctrl(&ctrl->ctrl); + rport->dev_loss_end = jiffies + (portptr->dev_loss_tmo * HZ); + + list_for_each_entry(ctrl, &rport->ctrl_list, ctrl_list) { + /* if dev_loss_tmo==0, dev loss is immediate */ + if (!portptr->dev_loss_tmo) { + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: controller connectivity lost. " + "Deleting controller.\n", + ctrl->cnum); + nvme_delete_ctrl(&ctrl->ctrl); + } else + nvme_fc_ctrl_connectivity_loss(ctrl); + } spin_unlock_irqrestore(&rport->lock, flags); nvme_fc_abort_lsops(rport); + /* + * release the reference, which will allow, if all controllers + * go away, which should only occur after dev_loss_tmo occurs, + * for the rport to be torn down. + */ nvme_fc_rport_put(rport); + return 0; } EXPORT_SYMBOL_GPL(nvme_fc_unregister_remoteport); @@ -700,7 +884,6 @@ nvme_fc_set_remoteport_devloss(struct nvme_fc_remote_port *portptr, u32 dev_loss_tmo) { struct nvme_fc_rport *rport = remoteport_to_rport(portptr); - struct nvme_fc_ctrl *ctrl; unsigned long flags; spin_lock_irqsave(&rport->lock, flags); @@ -2676,28 +2859,43 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) static void nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) { + struct nvme_fc_rport *rport = ctrl->rport; + struct nvme_fc_remote_port *portptr = &rport->remoteport; + unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ; + bool recon = true; + if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) return; - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", - ctrl->cnum, status); + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: reset: Reconnect attempt failed (%d)\n", + ctrl->cnum, status); + else if (time_after_eq(jiffies, rport->dev_loss_end)) + recon = false; - if (nvmf_should_reconnect(&ctrl->ctrl)) { - /* Only schedule the reconnect if the remote port is online */ - if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) - return; + if (recon && nvmf_should_reconnect(&ctrl->ctrl)) { + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: Reconnect attempt in %ld " + "seconds\n", + ctrl->cnum, recon_delay / HZ); + else if (time_after(jiffies + recon_delay, rport->dev_loss_end)) + recon_delay = rport->dev_loss_end - jiffies; - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: Reconnect attempt in %d seconds.\n", - ctrl->cnum, ctrl->ctrl.opts->reconnect_delay); - queue_delayed_work(nvme_wq, &ctrl->connect_work, - ctrl->ctrl.opts->reconnect_delay * HZ); + queue_delayed_work(nvme_wq, &ctrl->connect_work, recon_delay); } else { - dev_warn(ctrl->ctrl.device, + if (portptr->port_state == FC_OBJSTATE_ONLINE) + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: Max reconnect attempts (%d) " "reached. Removing controller\n", ctrl->cnum, ctrl->ctrl.nr_reconnects); + else + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: dev_loss_tmo (%d) expired " + "while waiting for remoteport connectivity. " + "Removing controller\n", ctrl->cnum, + portptr->dev_loss_tmo); WARN_ON(nvme_delete_ctrl(&ctrl->ctrl)); } } @@ -2721,15 +2919,17 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) return; } - if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) { + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) ret = nvme_fc_create_association(ctrl); - if (ret) - nvme_fc_reconnect_or_delete(ctrl, ret); - else - dev_info(ctrl->ctrl.device, - "NVME-FC{%d}: controller reset complete\n", - ctrl->cnum); - } + else + ret = -ENOTCONN; + + if (ret) + nvme_fc_reconnect_or_delete(ctrl, ret); + else + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: controller reset complete\n", + ctrl->cnum); } static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = { -- cgit v1.2.1 From a96d4bd867129e6124e3cd1006fcbcfea3341179 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 27 Oct 2017 13:12:49 -0700 Subject: nvmet: fix fatal_err_work deadlock Below is a stack trace for an issue that was reported. What's happening is that the nvmet layer had it's controller kato timeout fire, which causes it to schedule its fatal error handler via the fatal_err_work element. The error handler is invoked, which calls the transport delete_ctrl() entry point, and as the transport tears down the controller, nvmet_sq_destroy ends up doing the final put on the ctlr causing it to enter its free routine. The ctlr free routine does a cancel_work_sync() on fatal_err_work element, which then does a flush_work and wait_for_completion. But, as the wait is in the context of the work element being flushed, its in a catch-22 and the thread hangs. [ 326.903131] nvmet: ctrl 1 keep-alive timer (15 seconds) expired! [ 326.909832] nvmet: ctrl 1 fatal error occurred! [ 327.643100] lpfc 0000:04:00.0: 0:6313 NVMET Defer ctx release xri x114 flg x2 [ 494.582064] INFO: task kworker/0:2:243 blocked for more than 120 seconds. [ 494.589638] Not tainted 4.14.0-rc1.James+ #1 [ 494.594986] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 494.603718] kworker/0:2 D 0 243 2 0x80000000 [ 494.609839] Workqueue: events nvmet_fatal_error_handler [nvmet] [ 494.616447] Call Trace: [ 494.619177] __schedule+0x28d/0x890 [ 494.623070] schedule+0x36/0x80 [ 494.626571] schedule_timeout+0x1dd/0x300 [ 494.631044] ? dequeue_task_fair+0x592/0x840 [ 494.635810] ? pick_next_task_fair+0x23b/0x5c0 [ 494.640756] wait_for_completion+0x121/0x180 [ 494.645521] ? wake_up_q+0x80/0x80 [ 494.649315] flush_work+0x11d/0x1a0 [ 494.653206] ? wake_up_worker+0x30/0x30 [ 494.657484] __cancel_work_timer+0x10b/0x190 [ 494.662249] cancel_work_sync+0x10/0x20 [ 494.666525] nvmet_ctrl_put+0xa3/0x100 [nvmet] [ 494.671482] nvmet_sq_:q+0x64/0xd0 [nvmet] [ 494.676540] nvmet_fc_delete_target_queue+0x202/0x220 [nvmet_fc] [ 494.683245] nvmet_fc_delete_target_assoc+0x6d/0xc0 [nvmet_fc] [ 494.689743] nvmet_fc_delete_ctrl+0x137/0x1a0 [nvmet_fc] [ 494.695673] nvmet_fatal_error_handler+0x30/0x40 [nvmet] [ 494.701589] process_one_work+0x149/0x360 [ 494.706064] worker_thread+0x4d/0x3c0 [ 494.710148] kthread+0x109/0x140 [ 494.713751] ? rescuer_thread+0x380/0x380 [ 494.718214] ? kthread_park+0x60/0x60 Correct by having the fc transport convert to a different workq context for the actual controller teardown which may call the cancel_work_sync. Signed-off-by: James Smart Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fc.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 58e010bdda3e..5f2570f5dfa9 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -150,6 +150,7 @@ struct nvmet_fc_tgt_assoc { struct list_head a_list; struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1]; struct kref ref; + struct work_struct del_work; }; @@ -232,6 +233,7 @@ static void nvmet_fc_tgtport_put(struct nvmet_fc_tgtport *tgtport); static int nvmet_fc_tgtport_get(struct nvmet_fc_tgtport *tgtport); static void nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, struct nvmet_fc_fcp_iod *fod); +static void nvmet_fc_delete_target_assoc(struct nvmet_fc_tgt_assoc *assoc); /* *********************** FC-NVME DMA Handling **************************** */ @@ -802,6 +804,16 @@ nvmet_fc_find_target_queue(struct nvmet_fc_tgtport *tgtport, return NULL; } +static void +nvmet_fc_delete_assoc(struct work_struct *work) +{ + struct nvmet_fc_tgt_assoc *assoc = + container_of(work, struct nvmet_fc_tgt_assoc, del_work); + + nvmet_fc_delete_target_assoc(assoc); + nvmet_fc_tgt_a_put(assoc); +} + static struct nvmet_fc_tgt_assoc * nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) { @@ -826,6 +838,7 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport) assoc->a_id = idx; INIT_LIST_HEAD(&assoc->a_list); kref_init(&assoc->ref); + INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc); while (needrandom) { get_random_bytes(&ran, sizeof(ran) - BYTES_FOR_QID); @@ -1118,8 +1131,7 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl) nvmet_fc_tgtport_put(tgtport); if (found_ctrl) { - nvmet_fc_delete_target_assoc(assoc); - nvmet_fc_tgt_a_put(assoc); + schedule_work(&assoc->del_work); return; } -- cgit v1.2.1 From 3639efef8fb128f99ae38bf603189639efc3850d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 20 Oct 2017 16:18:03 -0600 Subject: nvme: Remove unused headers Signed-off-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7735571ffc9a..32c413ec818c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -13,7 +13,6 @@ */ #include -#include #include #include #include @@ -26,12 +25,9 @@ #include #include #include -#include #include -#include #include #include -#include #include #include "nvme.h" -- cgit v1.2.1 From c091fbe9a26ee116d99e2c0ed010afb957a10365 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Nov 2017 12:19:32 +0100 Subject: block: fix CDROM dependency on BLK_DEV After the cdrom cleanup, I get randconfig warnings for some configurations: warning: (BLK_DEV_IDECD && BLK_DEV_SR) selects CDROM which has unmet direct dependencies (BLK_DEV) This adds an explicit BLK_DEV dependency for both drivers. The other drivers that select 'CDROM' already have this and don't need a change. Fixes: 2a750166a5be ("block: Rework drivers/cdrom/Makefile") Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/ide/Kconfig | 1 + drivers/scsi/Kconfig | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/ide/Kconfig b/drivers/ide/Kconfig index 7b92c591e4c1..cf1fb3fb5d26 100644 --- a/drivers/ide/Kconfig +++ b/drivers/ide/Kconfig @@ -117,6 +117,7 @@ config BLK_DEV_DELKIN config BLK_DEV_IDECD tristate "Include IDE/ATAPI CDROM support" + depends on BLK_DEV select IDE_ATAPI select CDROM ---help--- diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index cca0b03c2ead..766955318005 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -130,7 +130,7 @@ config CHR_DEV_OSST config BLK_DEV_SR tristate "SCSI CDROM support" - depends on SCSI + depends on SCSI && BLK_DEV select CDROM ---help--- If you want to use a CD or DVD drive attached to your computer -- cgit v1.2.1 From 474f5da2354e9fd5376b968e4c5a0f1807dc19e8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Nov 2017 12:42:00 +0100 Subject: skd: use ktime_get_real_seconds() Like many storage drivers, skd uses an unsigned 32-bit number for interchanging the current time with the firmware. This will overflow in y2106 and is otherwise safe. However, the get_seconds() function is generally considered deprecated since the behavior is different between 32-bit and 64-bit architectures, and using it may indicate a bigger problem. To annotate that we've thought about this, let's add a comment here and migrate to the ktime_get_real_seconds() function that consistently returns a 64-bit number. Signed-off-by: Arnd Bergmann Signed-off-by: Jens Axboe --- drivers/block/skd_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 7cedb4295e9d..802ab9f7a8c1 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -1967,7 +1967,8 @@ static void skd_isr_msg_from_dev(struct skd_device *skdev) break; case FIT_MTD_CMD_LOG_HOST_ID: - skdev->connect_time_stamp = get_seconds(); + /* hardware interface overflows in y2106 */ + skdev->connect_time_stamp = (u32)ktime_get_real_seconds(); data = skdev->connect_time_stamp & 0xFFFF; mtd = FIT_MXD_CONS(FIT_MTD_CMD_LOG_TIME_STAMP_LO, 0, data); SKD_WRITEL(skdev, mtd, FIT_MSG_TO_DEVICE); -- cgit v1.2.1 From a806c6c81e6c0d07c8a8b2643bad4a37a196d681 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Thu, 2 Nov 2017 18:07:44 +0900 Subject: nvme: comment typo fixed in clearing AER a tiny typo fixed in a comment. Signed-off-by: Minwoo Im Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 07b9f4e1c283..64744355aa88 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2637,7 +2637,7 @@ static void nvme_fw_act_work(struct work_struct *work) return; nvme_start_queues(ctrl); - /* read FW slot informationi to clear the AER*/ + /* read FW slot information to clear the AER */ nvme_get_fw_slot_info(ctrl); } -- cgit v1.2.1 From ea435e1b9392a33deceaea2a16ebaa3397bead93 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:29:54 +0300 Subject: block: add a poll_fn callback to struct request_queue That we we can also poll non blk-mq queues. Mostly needed for the NVMe multipath code, but could also be useful elsewhere. Signed-off-by: Christoph Hellwig Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 0d4c23dc4532..db632818777d 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -94,7 +94,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) cookie = submit_bio(bio); - blk_mq_poll(bdev_get_queue(req->ns->bdev), cookie); + blk_poll(bdev_get_queue(req->ns->bdev), cookie); } static void nvmet_execute_flush(struct nvmet_req *req) -- cgit v1.2.1 From a116895fc7b61de4f26548c9b635a06515ec7563 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 3 Nov 2017 11:00:03 -0600 Subject: cdrom: hide CONFIG_CDROM menu selection We don't need to expose this. The point is that drivers select the uniform CDROM layer, if they need it, the user should not have to make a conscious decision on whether to include this separately or not. Fixes: 2a750166a5be ("block: Rework drivers/cdrom/Makefile") Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index b998b3c9ea86..95e678716b5e 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -68,14 +68,7 @@ config AMIGA_Z2RAM module will be called z2ram. config CDROM - tristate "CD-ROM driver" - help - A CD-ROM is a pre-pressed optical compact disc which contains - data. The name is an acronym which stands for "Compact Disc - Read-Only Memory". This driver implements functionality that is - shared by all CD-ROM drivers, e.g. the IDE CD-ROM driver and the - SCSI CD-ROM driver. For a list of the ioctls implemented by this - driver, see also Documentation/ioctl/cdrom.txt. + tristate config GDROM tristate "SEGA Dreamcast GD-ROM drive" -- cgit v1.2.1 From 826a70a08b1210bbfdbda812ab43eb986e25b5c2 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 4 Nov 2017 09:55:34 +0800 Subject: SCSI: don't get target/host busy_count in scsi_mq_get_budget() It is very expensive to atomic_inc/atomic_dec the host wide counter of host->busy_count, and it should have been avoided via blk-mq's mechanism of getting driver tag, which uses the more efficient way of sbitmap queue. Also we don't check atomic_read(&sdev->device_busy) in scsi_mq_get_budget() and don't run queue if the counter becomes zero, so IO hang may be caused if all requests are completed just before the current SCSI device is added to shost->starved_list. Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Reported-by: Bart Van Assche Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 6f10afaca25b..22a7e4c47207 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1950,11 +1950,7 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - struct Scsi_Host *shost = sdev->host; - atomic_dec(&shost->host_busy); - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); atomic_dec(&sdev->device_busy); put_device(&sdev->sdev_gendev); } @@ -1963,7 +1959,6 @@ static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - struct Scsi_Host *shost = sdev->host; blk_status_t ret; ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); @@ -1974,18 +1969,9 @@ static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) goto out; if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; - if (!scsi_target_queue_ready(shost, sdev)) - goto out_dec_device_busy; - if (!scsi_host_queue_ready(q, shost, sdev)) - goto out_dec_target_busy; return BLK_STS_OK; -out_dec_target_busy: - if (scsi_target(sdev)->can_queue > 0) - atomic_dec(&scsi_target(sdev)->target_busy); -out_dec_device_busy: - atomic_dec(&sdev->device_busy); out_put_device: put_device(&sdev->sdev_gendev); out: @@ -1998,6 +1984,7 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req = bd->rq; struct request_queue *q = req->q; struct scsi_device *sdev = q->queuedata; + struct Scsi_Host *shost = sdev->host; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); blk_status_t ret; int reason; @@ -2007,10 +1994,15 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, goto out_put_budget; ret = BLK_STS_RESOURCE; + if (!scsi_target_queue_ready(shost, sdev)) + goto out_put_budget; + if (!scsi_host_queue_ready(q, shost, sdev)) + goto out_dec_target_busy; + if (!(req->rq_flags & RQF_DONTPREP)) { ret = prep_to_mq(scsi_mq_prep_fn(req)); if (ret != BLK_STS_OK) - goto out_put_budget; + goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; } else { blk_mq_start_request(req); @@ -2028,11 +2020,16 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, if (reason) { scsi_set_blocked(cmd, reason); ret = BLK_STS_RESOURCE; - goto out_put_budget; + goto out_dec_host_busy; } return BLK_STS_OK; +out_dec_host_busy: + atomic_dec(&shost->host_busy); +out_dec_target_busy: + if (scsi_target(sdev)->can_queue > 0) + atomic_dec(&scsi_target(sdev)->target_busy); out_put_budget: scsi_mq_put_budget(hctx); switch (ret) { -- cgit v1.2.1 From 88022d7201e96b43f1754b0358fc6bcd8dbdcde1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 5 Nov 2017 02:21:12 +0800 Subject: blk-mq: don't handle failure in .get_budget It is enough to just check if we can get the budget via .get_budget(). And we don't need to deal with device state change in .get_budget(). For SCSI, one issue to be fixed is that we have to call scsi_mq_uninit_cmd() to free allocated ressources if SCSI device fails to handle the request. And it isn't enough to simply call blk_mq_end_request() to do that if this request is marked as RQF_DONTPREP. Fixes: 0df21c86bdbf(scsi: implement .get_budget and .put_budget for blk-mq) Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 22a7e4c47207..286ea983c9e3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1955,27 +1955,22 @@ static void scsi_mq_put_budget(struct blk_mq_hw_ctx *hctx) put_device(&sdev->sdev_gendev); } -static blk_status_t scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) +static bool scsi_mq_get_budget(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; - blk_status_t ret; - - ret = prep_to_mq(scsi_prep_state_check(sdev, NULL)); - if (ret == BLK_STS_RESOURCE || ret != BLK_STS_OK) - return ret; if (!get_device(&sdev->sdev_gendev)) goto out; if (!scsi_dev_queue_ready(q, sdev)) goto out_put_device; - return BLK_STS_OK; + return true; out_put_device: put_device(&sdev->sdev_gendev); out: - return BLK_STS_RESOURCE; + return false; } static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, -- cgit v1.2.1 From ff57dc94faec023abc267cdc45766fccff497557 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 6 Nov 2017 16:11:57 -0500 Subject: nbd: wait uninterruptible for the dead timeout If we have a pending signal or the user kills their application then it'll bring down the whole device, which is less than awesome. Instead wait uninterruptible for the dead timeout so we're sure we gave it our best shot. Fixes: 560bc4b39952 ("nbd: handle dead connections") Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 3684e21d543f..b94a735bfe3c 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -715,9 +715,9 @@ static int wait_for_reconnect(struct nbd_device *nbd) return 0; if (test_bit(NBD_DISCONNECTED, &config->runtime_flags)) return 0; - wait_event_interruptible_timeout(config->conn_wait, - atomic_read(&config->live_connections), - config->dead_conn_timeout); + wait_event_timeout(config->conn_wait, + atomic_read(&config->live_connections), + config->dead_conn_timeout); return atomic_read(&config->live_connections); } -- cgit v1.2.1 From 6a468d5990ecd1c2d07dd85f8633bbdd0ba61c40 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 6 Nov 2017 16:11:58 -0500 Subject: nbd: don't start req until after the dead connection logic We can end up sleeping for a while waiting for the dead timeout, which means we could get the per request timer to fire. We did handle this case, but if the dead timeout happened right after we submitted we'd either tear down the connection or possibly requeue as we're handling an error and race with the endio which can lead to panics and other hilarity. Fixes: 560bc4b39952 ("nbd: handle dead connections") Cc: stable@vger.kernel.org Signed-off-by: Josef Bacik Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index b94a735bfe3c..95cab69d9c8b 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -289,15 +289,6 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, cmd->status = BLK_STS_TIMEOUT; return BLK_EH_HANDLED; } - - /* If we are waiting on our dead timer then we could get timeout - * callbacks for our request. For this we just want to reset the timer - * and let the queue side take care of everything. - */ - if (!completion_done(&cmd->send_complete)) { - nbd_config_put(nbd); - return BLK_EH_RESET_TIMER; - } config = nbd->config; if (config->num_connections > 1) { @@ -732,6 +723,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) if (!refcount_inc_not_zero(&nbd->config_refs)) { dev_err_ratelimited(disk_to_dev(nbd->disk), "Socks array is empty\n"); + blk_mq_start_request(req); return -EINVAL; } config = nbd->config; @@ -740,6 +732,7 @@ static int nbd_handle_cmd(struct nbd_cmd *cmd, int index) dev_err_ratelimited(disk_to_dev(nbd->disk), "Attempted send on invalid socket\n"); nbd_config_put(nbd); + blk_mq_start_request(req); return -EINVAL; } cmd->status = BLK_STS_OK; @@ -763,6 +756,7 @@ again: */ sock_shutdown(nbd); nbd_config_put(nbd); + blk_mq_start_request(req); return -EIO; } goto again; @@ -773,6 +767,7 @@ again: * here so that it gets put _after_ the request that is already on the * dispatch list. */ + blk_mq_start_request(req); if (unlikely(nsock->pending && nsock->pending != req)) { blk_mq_requeue_request(req, true); ret = 0; @@ -785,10 +780,10 @@ again: ret = nbd_send_cmd(nbd, cmd, index); if (ret == -EAGAIN) { dev_err_ratelimited(disk_to_dev(nbd->disk), - "Request send failed trying another connection\n"); + "Request send failed, requeueing\n"); nbd_mark_nsock_dead(nbd, nsock, 1); - mutex_unlock(&nsock->tx_lock); - goto again; + blk_mq_requeue_request(req, true); + ret = 0; } out: mutex_unlock(&nsock->tx_lock); @@ -812,7 +807,6 @@ static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx, * done sending everything over the wire. */ init_completion(&cmd->send_complete); - blk_mq_start_request(bd->rq); /* We can be called directly from the user space process, which means we * could possibly have signals pending so our sendmsg will fail. In -- cgit v1.2.1 From e54b064cb24c8268252336ccdd6523e08c0dcbde Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:51 +0300 Subject: nvme: move the dying queue check from cancel to completion With multipath we don't want a hard DNR bit on a request that is cancelled by a controller reset, but instead want to be able to retry it on another patch. To archive this don't always set the DNR bit when the queue is dying in nvme_cancel_request, but defer that decision to nvme_req_needs_retry. Note that it applies to any command there and not just cancelled commands, but one the queue is dying that is the right thing to do anyway. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 64744355aa88..a9c7ad304ec5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -172,6 +172,8 @@ static inline bool nvme_req_needs_retry(struct request *req) return false; if (nvme_req(req)->retries >= nvme_max_retries) return false; + if (blk_queue_dying(req->q)) + return false; return true; } @@ -189,18 +191,13 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq); void nvme_cancel_request(struct request *req, void *data, bool reserved) { - int status; - if (!blk_mq_request_started(req)) return; dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); - status = NVME_SC_ABORT_REQ; - if (blk_queue_dying(req->q)) - status |= NVME_SC_DNR; - nvme_req(req)->status = status; + nvme_req(req)->status = NVME_SC_ABORT_REQ; blk_mq_complete_request(req); } -- cgit v1.2.1 From b5be3b392998a5f3763dfbdb1866f81d8ee631c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:52 +0300 Subject: nvme: always unregister the integrity profile in __nvme_revalidate_disk This is safe because the queue is always frozen when we revalidate, and it simplifies both the existing code as well as the multipath implementation. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a9c7ad304ec5..da5357c66a0f 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1081,29 +1081,6 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } #ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, - u16 bs) -{ - struct nvme_ns *ns = disk->private_data; - u16 old_ms = ns->ms; - u8 pi_type = 0; - - ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); - - /* PI implementation requires metadata equal t10 pi tuple size */ - if (ns->ms == sizeof(struct t10_pi_tuple)) - pi_type = id->dps & NVME_NS_DPS_PI_MASK; - - if (blk_get_integrity(disk) && - (ns->pi_type != pi_type || ns->ms != old_ms || - bs != queue_logical_block_size(disk->queue) || - (ns->ms && ns->ext))) - blk_integrity_unregister(disk); - - ns->pi_type = pi_type; -} - static void nvme_init_integrity(struct nvme_ns *ns) { struct blk_integrity integrity; @@ -1130,10 +1107,6 @@ static void nvme_init_integrity(struct nvme_ns *ns) blk_queue_max_integrity_segments(ns->queue, 1); } #else -static void nvme_prep_integrity(struct gendisk *disk, struct nvme_id_ns *id, - u16 bs) -{ -} static void nvme_init_integrity(struct nvme_ns *ns) { } @@ -1202,15 +1175,22 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) ns->lba_shift = 9; bs = 1 << ns->lba_shift; ns->noiob = le16_to_cpu(id->noiob); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); + /* the PI implementation requires metadata equal t10 pi tuple size */ + if (ns->ms == sizeof(struct t10_pi_tuple)) + ns->pi_type = id->dps & NVME_NS_DPS_PI_MASK; + else + ns->pi_type = 0; blk_mq_freeze_queue(disk->queue); + blk_integrity_unregister(disk); - if (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED) - nvme_prep_integrity(disk, id, bs); blk_queue_logical_block_size(ns->queue, bs); if (ns->noiob) nvme_set_chunk_size(ns); - if (ns->ms && !blk_get_integrity(disk) && !ns->ext) + if (ns->ms && !ns->ext && + (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(ns); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) set_capacity(disk, 0); -- cgit v1.2.1 From 39b7baa410fd00a0fe9da864400d95b3d5008de7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:53 +0300 Subject: nvme: don't pass struct nvme_ns to nvme_init_integrity To allow reusing this function for the multipath node. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index da5357c66a0f..88d886b390f4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1081,12 +1081,12 @@ static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) } #ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_init_integrity(struct nvme_ns *ns) +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) { struct blk_integrity integrity; memset(&integrity, 0, sizeof(integrity)); - switch (ns->pi_type) { + switch (pi_type) { case NVME_NS_DPS_PI_TYPE3: integrity.profile = &t10_pi_type3_crc; integrity.tag_size = sizeof(u16) + sizeof(u32); @@ -1102,12 +1102,12 @@ static void nvme_init_integrity(struct nvme_ns *ns) integrity.profile = NULL; break; } - integrity.tuple_size = ns->ms; - blk_integrity_register(ns->disk, &integrity); - blk_queue_max_integrity_segments(ns->queue, 1); + integrity.tuple_size = ms; + blk_integrity_register(disk, &integrity); + blk_queue_max_integrity_segments(disk->queue, 1); } #else -static void nvme_init_integrity(struct nvme_ns *ns) +static void nvme_init_integrity(struct gendisk *disk, u16 ms, u8 pi_type) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ @@ -1191,7 +1191,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) nvme_set_chunk_size(ns); if (ns->ms && !ns->ext && (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - nvme_init_integrity(ns); + nvme_init_integrity(disk, ns->ms, ns->pi_type); if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) set_capacity(disk, 0); else -- cgit v1.2.1 From 30e5e929c7bfaf87a65f926715773ef14339b79f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:54 +0300 Subject: nvme: don't pass struct nvme_ns to nvme_config_discard To allow reusing this function for the multipath node. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 88d886b390f4..551ec5df6a97 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1118,29 +1118,26 @@ static void nvme_set_chunk_size(struct nvme_ns *ns) blk_queue_chunk_sectors(ns->queue, rounddown_pow_of_two(chunk_size)); } -static void nvme_config_discard(struct nvme_ns *ns) +static void nvme_config_discard(struct nvme_ctrl *ctrl, + unsigned stream_alignment, struct request_queue *queue) { - struct nvme_ctrl *ctrl = ns->ctrl; - u32 logical_block_size = queue_logical_block_size(ns->queue); + u32 size = queue_logical_block_size(queue); + + if (stream_alignment) + size *= stream_alignment; BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - if (ctrl->nr_streams && ns->sws && ns->sgs) { - unsigned int sz = logical_block_size * ns->sws * ns->sgs; + queue->limits.discard_alignment = size; + queue->limits.discard_granularity = size; - ns->queue->limits.discard_alignment = sz; - ns->queue->limits.discard_granularity = sz; - } else { - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - } - blk_queue_max_discard_sectors(ns->queue, UINT_MAX); - blk_queue_max_discard_segments(ns->queue, NVME_DSM_MAX_RANGES); - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); + blk_queue_max_discard_sectors(queue, UINT_MAX); + blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, queue); if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) - blk_queue_max_write_zeroes_sectors(ns->queue, UINT_MAX); + blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); } static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, @@ -1164,6 +1161,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; + unsigned stream_alignment = 0; u16 bs; /* @@ -1183,6 +1181,9 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) else ns->pi_type = 0; + if (ctrl->nr_streams && ns->sws && ns->sgs) + stream_alignment = ns->sws * ns->sgs; + blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); @@ -1198,7 +1199,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); if (ctrl->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); + nvme_config_discard(ctrl, stream_alignment, disk->queue); blk_mq_unfreeze_queue(disk->queue); } -- cgit v1.2.1 From 6e78f21ae4488cdab4318f781de05c8ffc4de951 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:55 +0300 Subject: nvme: set the chunk size before freezing the queue We don't need a frozen queue to update the chunk_size, which just is a hint, and moving it a little earlier will allow for some better code reuse with the multipath code. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 551ec5df6a97..a4be843334ba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1184,12 +1184,13 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ctrl->nr_streams && ns->sws && ns->sgs) stream_alignment = ns->sws * ns->sgs; + if (ns->noiob) + nvme_set_chunk_size(ns); + blk_mq_freeze_queue(disk->queue); blk_integrity_unregister(disk); blk_queue_logical_block_size(ns->queue, bs); - if (ns->noiob) - nvme_set_chunk_size(ns); if (ns->ms && !ns->ext && (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(disk, ns->ms, ns->pi_type); -- cgit v1.2.1 From 24b0b58c5b4a9f6f4bbd1c1efd5afb9a565da3ea Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 21:28:56 +0300 Subject: nvme: split __nvme_revalidate_disk Split out the code that applies the calculate value to a given disk/queue into new helper that can be reused by the multipath code. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 49 +++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index a4be843334ba..28d58d353a42 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1157,12 +1157,34 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, } } +static void nvme_update_disk_info(struct gendisk *disk, + struct nvme_ns *ns, struct nvme_id_ns *id) +{ + sector_t capacity = le64_to_cpup(&id->nsze) << (ns->lba_shift - 9); + unsigned stream_alignment = 0; + + if (ns->ctrl->nr_streams && ns->sws && ns->sgs) + stream_alignment = ns->sws * ns->sgs; + + blk_mq_freeze_queue(disk->queue); + blk_integrity_unregister(disk); + + blk_queue_logical_block_size(disk->queue, 1 << ns->lba_shift); + if (ns->ms && !ns->ext && + (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) + nvme_init_integrity(disk, ns->ms, ns->pi_type); + if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + capacity = 0; + set_capacity(disk, capacity); + + if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns->ctrl, stream_alignment, disk->queue); + blk_mq_unfreeze_queue(disk->queue); +} + static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) { struct nvme_ns *ns = disk->private_data; - struct nvme_ctrl *ctrl = ns->ctrl; - unsigned stream_alignment = 0; - u16 bs; /* * If identify namespace failed, use default 512 byte block size so @@ -1171,7 +1193,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) ns->lba_shift = id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ds; if (ns->lba_shift == 0) ns->lba_shift = 9; - bs = 1 << ns->lba_shift; ns->noiob = le16_to_cpu(id->noiob); ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); ns->ms = le16_to_cpu(id->lbaf[id->flbas & NVME_NS_FLBAS_LBA_MASK].ms); @@ -1181,27 +1202,9 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) else ns->pi_type = 0; - if (ctrl->nr_streams && ns->sws && ns->sgs) - stream_alignment = ns->sws * ns->sgs; - if (ns->noiob) nvme_set_chunk_size(ns); - - blk_mq_freeze_queue(disk->queue); - blk_integrity_unregister(disk); - - blk_queue_logical_block_size(ns->queue, bs); - if (ns->ms && !ns->ext && - (ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - nvme_init_integrity(disk, ns->ms, ns->pi_type); - if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) - set_capacity(disk, 0); - else - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (ctrl->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ctrl, stream_alignment, disk->queue); - blk_mq_unfreeze_queue(disk->queue); + nvme_update_disk_info(disk, ns, id); } static int nvme_revalidate_disk(struct gendisk *disk) -- cgit v1.2.1 From 715ea9e09dc81f18cad5ed64800f49b1c4c444df Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 7 Nov 2017 17:27:34 +0100 Subject: nvme: fix and clarify the check for missing metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update the check in nvme_setup_rw for missing metadata so that it is together with the other metadata handling, does not contain impossible to reach conditions and warns if we get an impossible requests for a (non-PI) metadata-enabled namespace when CONFIG_BLK_DEV_INTEGRITY is not set. Also add a little helper that checks if a given metadata configuration contains protection information Signed-off-by: Christoph Hellwig Reported-by: Javier González Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 28d58d353a42..3aabaa103e10 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -137,6 +137,11 @@ int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_delete_ctrl_sync); +static inline bool nvme_ns_has_pi(struct nvme_ns *ns) +{ + return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); +} + static blk_status_t nvme_error_status(struct request *req) { switch (nvme_req(req)->status & 0x7ff) { @@ -472,16 +477,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, u16 control = 0; u32 dsmgmt = 0; - /* - * If formated with metadata, require the block layer provide a buffer - * unless this namespace is formated such that the metadata can be - * stripped/generated by the controller with PRACT=1. - */ - if (ns && ns->ms && - (!ns->pi_type || ns->ms != sizeof(struct t10_pi_tuple)) && - !blk_integrity_rq(req) && !blk_rq_is_passthrough(req)) - return BLK_STS_NOTSUPP; - if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) @@ -500,6 +495,18 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, nvme_assign_write_stream(ctrl, req, &control, &dsmgmt); if (ns->ms) { + /* + * If formated with metadata, the block layer always provides a + * metadata buffer if CONFIG_BLK_DEV_INTEGRITY is enabled. Else + * we enable the PRACT bit for protection information or set the + * namespace capacity to zero to prevent any I/O. + */ + if (!blk_integrity_rq(req)) { + if (WARN_ON_ONCE(!nvme_ns_has_pi(ns))) + return BLK_STS_NOTSUPP; + control |= NVME_RW_PRINFO_PRACT; + } + switch (ns->pi_type) { case NVME_NS_DPS_PI_TYPE3: control |= NVME_RW_PRINFO_PRCHK_GUARD; @@ -512,8 +519,6 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, nvme_block_nr(ns, blk_rq_pos(req))); break; } - if (!blk_integrity_rq(req)) - control |= NVME_RW_PRINFO_PRACT; } cmnd->rw.control = cpu_to_le16(control); @@ -1173,7 +1178,7 @@ static void nvme_update_disk_info(struct gendisk *disk, if (ns->ms && !ns->ext && (ns->ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) nvme_init_integrity(disk, ns->ms, ns->pi_type); - if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + if (ns->ms && !nvme_ns_has_pi(ns) && !blk_get_integrity(disk)) capacity = 0; set_capacity(disk, capacity); -- cgit v1.2.1 From c627c487ec727350c6aaa1cf217b666099e83ee2 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 10:28:31 -0700 Subject: nvme: factor get log into a helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit And fix the warning on a successful firmware log. Reviewed-by: Javier González Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 3aabaa103e10..90b6375a9da5 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1747,6 +1747,18 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); } +static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, + size_t size) +{ + struct nvme_command c = { }; + + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(NVME_NSID_ALL); + c.common.cdw10[0] = nvme_get_log_dw10(log_page, size); + + return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -2579,18 +2591,13 @@ static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl) { - struct nvme_command c = { }; struct nvme_fw_slot_info_log *log; log = kmalloc(sizeof(*log), GFP_KERNEL); if (!log) return; - c.common.opcode = nvme_admin_get_log_page; - c.common.nsid = cpu_to_le32(NVME_NSID_ALL); - c.common.cdw10[0] = nvme_get_log_dw10(NVME_LOG_FW_SLOT, sizeof(*log)); - - if (!nvme_submit_sync_cmd(ctrl->admin_q, &c, log, sizeof(*log))) + if (nvme_get_log(ctrl, NVME_LOG_FW_SLOT, log, sizeof(*log))) dev_warn(ctrl->device, "Get FW SLOT INFO log error\n"); kfree(log); -- cgit v1.2.1 From 84fef62d135b6e47b52f4e9280b5dbc5bb0050ba Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 10:28:32 -0700 Subject: nvme: check admin passthru command effects The NVMe standard provides a command effects log page so the host may be aware of special requirements it may need to do for a particular command. For example, the command may need to run with IO quiesced to prevent timeouts or undefined behavior, or it may change the logical block formats that determine how the host needs to construct future commands. This patch saves the nvme command effects log page if the controller supports it, and performs appropriate actions before and after an admin passthrough command is completed. If the controller does not support the command effects log page, the driver will define the effects for known opcodes. The nvme format and santize are the only commands in this patch with known effects. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 107 +++++++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + 2 files changed, 108 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 90b6375a9da5..65fd2fc1ae3c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -72,6 +72,9 @@ static DEFINE_IDA(nvme_instance_ida); static dev_t nvme_chr_devt; static struct class *nvme_class; +static void nvme_ns_remove(struct nvme_ns *ns); +static int nvme_revalidate_disk(struct gendisk *disk); + static __le32 nvme_get_log_dw10(u8 lid, size_t size) { return cpu_to_le32((((size / 4) - 1) << 16) | lid); @@ -992,12 +995,87 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) metadata, meta_len, io.slba, NULL, 0); } +static u32 nvme_known_admin_effects(u8 opcode) +{ + switch (opcode) { + case nvme_admin_format_nvm: + return NVME_CMD_EFFECTS_CSUPP | NVME_CMD_EFFECTS_LBCC | + NVME_CMD_EFFECTS_CSE_MASK; + case nvme_admin_sanitize_nvm: + return NVME_CMD_EFFECTS_CSE_MASK; + default: + break; + } + return 0; +} + +static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + u8 opcode) +{ + u32 effects = 0; + + if (ns) { + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + if (effects & ~NVME_CMD_EFFECTS_CSUPP) + dev_warn(ctrl->device, + "IO command:%02x has unhandled effects:%08x\n", + opcode, effects); + return 0; + } + + if (ctrl->effects) + effects = le32_to_cpu(ctrl->effects->iocs[opcode]); + else + effects = nvme_known_admin_effects(opcode); + + /* + * For simplicity, IO to all namespaces is quiesced even if the command + * effects say only one namespace is affected. + */ + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) { + nvme_start_freeze(ctrl); + nvme_wait_freeze(ctrl); + } + return effects; +} + +static void nvme_update_formats(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->disk && nvme_revalidate_disk(ns->disk)) + nvme_ns_remove(ns); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) +{ + /* + * Revalidate LBA changes prior to unfreezing. This is necessary to + * prevent memory corruption if a logical block size was changed by + * this command. + */ + if (effects & NVME_CMD_EFFECTS_LBCC) + nvme_update_formats(ctrl); + if (effects & (NVME_CMD_EFFECTS_LBCC | NVME_CMD_EFFECTS_CSE_MASK)) + nvme_unfreeze(ctrl); + if (effects & NVME_CMD_EFFECTS_CCC) + nvme_init_identify(ctrl); + if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) + nvme_queue_scan(ctrl); +} + static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; struct nvme_command c; unsigned timeout = 0; + u32 effects; int status; if (!capable(CAP_SYS_ADMIN)) @@ -1023,10 +1101,13 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, if (cmd.timeout_ms) timeout = msecs_to_jiffies(cmd.timeout_ms); + effects = nvme_passthru_start(ctrl, ns, cmd.opcode); status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, (void __user *)(uintptr_t)cmd.metadata, cmd.metadata, 0, &cmd.result, timeout); + nvme_passthru_end(ctrl, effects); + if (status >= 0) { if (put_user(cmd.result, &ucmd->result)) return -EFAULT; @@ -1759,6 +1840,25 @@ static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, return nvme_submit_sync_cmd(ctrl->admin_q, &c, log, size); } +static int nvme_get_effects_log(struct nvme_ctrl *ctrl) +{ + int ret; + + if (!ctrl->effects) + ctrl->effects = kzalloc(sizeof(*ctrl->effects), GFP_KERNEL); + + if (!ctrl->effects) + return 0; + + ret = nvme_get_log(ctrl, NVME_LOG_CMD_EFFECTS, ctrl->effects, + sizeof(*ctrl->effects)); + if (ret) { + kfree(ctrl->effects); + ctrl->effects = NULL; + } + return ret; +} + /* * Initialize the cached copies of the Identify data and various controller * register in our nvme_ctrl structure. This should be called as soon as @@ -1794,6 +1894,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return -EIO; } + if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) { + ret = nvme_get_effects_log(ctrl); + if (ret < 0) + return ret; + } + nvme_init_subnqn(ctrl, id); if (!ctrl->identified) { @@ -2713,6 +2819,7 @@ static void nvme_free_ctrl(struct device *dev) ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); + kfree(ctrl->effects); ctrl->ops->free_ctrl(ctrl); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 35d9cee515f1..f7c21cea9485 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -174,6 +174,7 @@ struct nvme_ctrl { bool subsystem; unsigned long quirks; struct nvme_id_power_state psd[32]; + struct nvme_effects_log *effects; struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; -- cgit v1.2.1 From c5760f300e25f7d2cea9a9002b1e058a5c18b73f Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Nov 2017 08:13:16 -0700 Subject: nvme-fc: fix localport resume using stale values The localport resume was not updating the lldd ops structure. If the lldd is unloaded and reloaded, the ops pointers will differ. Additionally, as there are device references taken by the localport, ensure that resume only resumes if the device matches as well. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 113c30be7276..c49971d1a051 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -268,7 +268,9 @@ nvme_fc_lport_get(struct nvme_fc_lport *lport) static struct nvme_fc_lport * -nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) +nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo, + struct nvme_fc_port_template *ops, + struct device *dev) { struct nvme_fc_lport *lport; unsigned long flags; @@ -280,6 +282,11 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) lport->localport.port_name != pinfo->port_name) continue; + if (lport->dev != dev) { + lport = ERR_PTR(-EXDEV); + goto out_done; + } + if (lport->localport.port_state != FC_OBJSTATE_DELETED) { lport = ERR_PTR(-EEXIST); goto out_done; @@ -296,6 +303,7 @@ nvme_fc_attach_to_unreg_lport(struct nvme_fc_port_info *pinfo) /* resume the lport */ + lport->ops = ops; lport->localport.port_role = pinfo->port_role; lport->localport.port_id = pinfo->port_id; lport->localport.port_state = FC_OBJSTATE_ONLINE; @@ -356,7 +364,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, * expired, we can simply re-enable the localport. Remoteports * and controller reconnections should resume naturally. */ - newrec = nvme_fc_attach_to_unreg_lport(pinfo); + newrec = nvme_fc_attach_to_unreg_lport(pinfo, template, dev); /* found an lport, but something about its state is bad */ if (IS_ERR(newrec)) { -- cgit v1.2.1 From 158bfb8888c3f2beab747ab4f7a8197639f03e54 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Nov 2017 08:13:17 -0700 Subject: nvme-fc: decouple ns references from lldd references In the lldd api, a lldd may unregister a remoteport (loss of connectivity or driver unload) or localport (driver unload). The lldd must wait for the remoteport_delete or localport_delete before completing its actions post the unregister. The xxx_deletes currently occur only when the xxxport structure is fully freed after all references are removed. Thus the lldd may be held hostage until an app or in-kernel entity that has a namespace open finally closes so the namespace can be removed, the controller removed, thus the transport objects, thus the lldd. This patch decouples the transport and os-facing objects from the lldd and the remoteport and localport. There is a point in all deletions where the transport will no longer interact with the lldd on behalf of a controller. That point centers around the association established with the target/subsystem. It will access the lldd whenever it attempts to create an association and while the association is active. New associations may only be created if the remoteport is live (thus the localport is live). It will not access the lldd after deleting the association. Therefore, the patch tracks the count of active controllers - those with associations being created or that are active - on a remoteport. It also tracks the number of remoteports that have active controllers, on a a localport. When a remoteport is unregistered, as soon as there are no active controllers, the lldd's remoteport_delete may be called and the lldd may continue. Similarly, when a localport is unregistered, as soon as there are no remoteports with active controllers, the localport_delete callback may be made. This significantly speeds up unregistration with the lldd. The transport objects continue in suspended status with reconnect timers running, and upon expiration, normal ref-counting will occur and the objects will be freed. The transport object may still be held hostage by the application/kernel module, but that is acceptable. With this change, the lldd may be fully unloaded and reloaded, and if registrations occur prior to the timeouts, the nvme controller and namespaces will resume normally as if a link bounce. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 78 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index c49971d1a051..da9296a1eb26 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -126,6 +126,7 @@ struct nvme_fc_lport { struct device *dev; /* physical device for dma */ struct nvme_fc_port_template *ops; struct kref ref; + atomic_t act_rport_cnt; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ struct nvme_fc_rport { @@ -138,6 +139,7 @@ struct nvme_fc_rport { struct nvme_fc_lport *lport; spinlock_t lock; struct kref ref; + atomic_t act_ctrl_cnt; unsigned long dev_loss_end; } __aligned(sizeof(u64)); /* alignment for other things alloc'd with */ @@ -153,6 +155,7 @@ struct nvme_fc_ctrl { struct nvme_fc_rport *rport; u32 cnum; + bool assoc_active; u64 association_id; struct list_head ctrl_list; /* rport->ctrl_list */ @@ -243,9 +246,6 @@ nvme_fc_free_lport(struct kref *ref) list_del(&lport->port_list); spin_unlock_irqrestore(&nvme_fc_lock, flags); - /* let the LLDD know we've finished tearing it down */ - lport->ops->localport_delete(&lport->localport); - ida_simple_remove(&nvme_fc_local_port_cnt, lport->localport.port_num); ida_destroy(&lport->endp_cnt); @@ -400,6 +400,7 @@ nvme_fc_register_localport(struct nvme_fc_port_info *pinfo, INIT_LIST_HEAD(&newrec->port_list); INIT_LIST_HEAD(&newrec->endp_list); kref_init(&newrec->ref); + atomic_set(&newrec->act_rport_cnt, 0); newrec->ops = template; newrec->dev = dev; ida_init(&newrec->endp_cnt); @@ -462,6 +463,9 @@ nvme_fc_unregister_localport(struct nvme_fc_local_port *portptr) spin_unlock_irqrestore(&nvme_fc_lock, flags); + if (atomic_read(&lport->act_rport_cnt) == 0) + lport->ops->localport_delete(&lport->localport); + nvme_fc_lport_put(lport); return 0; @@ -515,9 +519,6 @@ nvme_fc_free_rport(struct kref *ref) list_del(&rport->endp_list); spin_unlock_irqrestore(&nvme_fc_lock, flags); - /* let the LLDD know we've finished tearing it down */ - lport->ops->remoteport_delete(&rport->remoteport); - ida_simple_remove(&lport->endp_cnt, rport->remoteport.port_num); kfree(rport); @@ -704,6 +705,7 @@ nvme_fc_register_remoteport(struct nvme_fc_local_port *localport, INIT_LIST_HEAD(&newrec->ctrl_list); INIT_LIST_HEAD(&newrec->ls_req_list); kref_init(&newrec->ref); + atomic_set(&newrec->act_ctrl_cnt, 0); spin_lock_init(&newrec->lock); newrec->remoteport.localport = &lport->localport; newrec->dev = lport->dev; @@ -859,6 +861,9 @@ nvme_fc_unregister_remoteport(struct nvme_fc_remote_port *portptr) nvme_fc_abort_lsops(rport); + if (atomic_read(&rport->act_ctrl_cnt) == 0) + rport->lport->ops->remoteport_delete(portptr); + /* * release the reference, which will allow, if all controllers * go away, which should only occur after dev_loss_tmo occurs, @@ -2639,6 +2644,61 @@ out_free_io_queues: return ret; } +static void +nvme_fc_rport_active_on_lport(struct nvme_fc_rport *rport) +{ + struct nvme_fc_lport *lport = rport->lport; + + atomic_inc(&lport->act_rport_cnt); +} + +static void +nvme_fc_rport_inactive_on_lport(struct nvme_fc_rport *rport) +{ + struct nvme_fc_lport *lport = rport->lport; + u32 cnt; + + cnt = atomic_dec_return(&lport->act_rport_cnt); + if (cnt == 0 && lport->localport.port_state == FC_OBJSTATE_DELETED) + lport->ops->localport_delete(&lport->localport); +} + +static int +nvme_fc_ctlr_active_on_rport(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_rport *rport = ctrl->rport; + u32 cnt; + + if (ctrl->assoc_active) + return 1; + + ctrl->assoc_active = true; + cnt = atomic_inc_return(&rport->act_ctrl_cnt); + if (cnt == 1) + nvme_fc_rport_active_on_lport(rport); + + return 0; +} + +static int +nvme_fc_ctlr_inactive_on_rport(struct nvme_fc_ctrl *ctrl) +{ + struct nvme_fc_rport *rport = ctrl->rport; + struct nvme_fc_lport *lport = rport->lport; + u32 cnt; + + /* ctrl->assoc_active=false will be set independently */ + + cnt = atomic_dec_return(&rport->act_ctrl_cnt); + if (cnt == 0) { + if (rport->remoteport.port_state == FC_OBJSTATE_DELETED) + lport->ops->remoteport_delete(&rport->remoteport); + nvme_fc_rport_inactive_on_lport(rport); + } + + return 0; +} + /* * This routine restarts the controller on the host side, and * on the link side, recreates the controller association. @@ -2655,6 +2715,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ctrl->rport->remoteport.port_state != FC_OBJSTATE_ONLINE) return -ENODEV; + if (nvme_fc_ctlr_active_on_rport(ctrl)) + return -ENOTUNIQ; + /* * Create the admin queue */ @@ -2762,6 +2825,8 @@ out_delete_hw_queue: __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); out_free_queue: nvme_fc_free_queue(&ctrl->queues[0]); + ctrl->assoc_active = false; + nvme_fc_ctlr_inactive_on_rport(ctrl); return ret; } @@ -2777,6 +2842,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) { unsigned long flags; + if (!ctrl->assoc_active) + return; + ctrl->assoc_active = false; + spin_lock_irqsave(&ctrl->lock, flags); ctrl->flags |= FCCTRL_TERMIO; ctrl->iocnt = 0; @@ -2849,6 +2918,8 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); nvme_fc_free_queue(&ctrl->queues[0]); + + nvme_fc_ctlr_inactive_on_rport(ctrl); } static void @@ -3048,6 +3119,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, ctrl->rport = rport; ctrl->dev = lport->dev; ctrl->cnum = idx; + ctrl->assoc_active = false; get_device(ctrl->dev); kref_init(&ctrl->ref); -- cgit v1.2.1 From 6ddcf0a30adc5080504ca66f474101e7ad247dd7 Mon Sep 17 00:00:00 2001 From: James Smart Date: Fri, 3 Nov 2017 09:33:30 -0700 Subject: lpfc: tie in to new dev_loss_tmo interface in nvme transport This patch calls the new nvme transport routine for dev_loss_tmo whenever the SCSI fc transport calls the lldd to make a dynamic change to a remote ports dev_loss_tmo. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/lpfc/lpfc_attr.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'drivers') diff --git a/drivers/scsi/lpfc/lpfc_attr.c b/drivers/scsi/lpfc/lpfc_attr.c index c17677f494af..3e02bc3a7c3f 100644 --- a/drivers/scsi/lpfc/lpfc_attr.c +++ b/drivers/scsi/lpfc/lpfc_attr.c @@ -3246,6 +3246,11 @@ lpfc_update_rport_devloss_tmo(struct lpfc_vport *vport) continue; if (ndlp->rport) ndlp->rport->dev_loss_tmo = vport->cfg_devloss_tmo; +#if (IS_ENABLED(CONFIG_NVME_FC)) + if (ndlp->nrport) + nvme_fc_set_remoteport_devloss(ndlp->nrport->remoteport, + vport->cfg_devloss_tmo); +#endif } spin_unlock_irq(shost->host_lock); } -- cgit v1.2.1 From 43b92fd27aaef0f529c9321cfebbaec1d7b8f503 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 5 Nov 2017 08:43:01 +0000 Subject: nvmet-rdma: update queue list during ib_device removal A NULL deref happens when nvmet_rdma_remove_one() is called more than once (e.g. while connected via 2 ports). The first call frees the queues related to the first ib_device but doesn't remove them from the queue list. While calling nvmet_rdma_remove_one() for the second ib_device it goes over the full queue list again and we get the NULL deref. Fixes: f1d4ef7d ("nvmet-rdma: register ib_client to not deadlock in device removal") Signed-off-by: Israel Rukshin Reviewed-by: Max Gurtovoy Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/rdma.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 76d2bb793afe..3333d417b248 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1512,15 +1512,17 @@ static struct nvmet_fabrics_ops nvmet_rdma_ops = { static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data) { - struct nvmet_rdma_queue *queue; + struct nvmet_rdma_queue *queue, *tmp; /* Device is being removed, delete all queues using this device */ mutex_lock(&nvmet_rdma_queue_mutex); - list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list, + queue_list) { if (queue->dev->device != ib_device) continue; pr_info("Removing queue %d\n", queue->idx); + list_del_init(&queue->queue_list); __nvmet_rdma_queue_disconnect(queue); } mutex_unlock(&nvmet_rdma_queue_mutex); -- cgit v1.2.1 From 1f61def985d896da5935ae85a69c79fc26c7952f Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Mon, 6 Nov 2017 16:18:51 +0200 Subject: nvme-rdma: fix nvme_rdma_create_queue_ib error flow QP object is created using rdma_cm api, therefore the destruction should use the same api for symmetry. Signed-off-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 03644ecf68d2..32e21ab1ae52 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -493,7 +493,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue) return 0; out_destroy_qp: - ib_destroy_qp(queue->qp); + rdma_destroy_qp(queue->cm_id); out_destroy_ib_cq: ib_free_cq(queue->ib_cq); out_put_dev: -- cgit v1.2.1 From 18c53e40487f56369c3ba9331ec3597d9b48d97c Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Tue, 7 Nov 2017 21:10:22 +0900 Subject: nvmet: fix comment typos in admin-cmd.c small typos fixed in admin-cmd.c Signed-off-by: Minwoo Im Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index c4a0bf36e752..9a7d6917e190 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -300,7 +300,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req) } /* - * nuse = ncap = nsze isn't aways true, but we have no way to find + * nuse = ncap = nsze isn't always true, but we have no way to find * that out from the underlying device. */ id->ncap = id->nuse = id->nsze = @@ -424,7 +424,7 @@ out: } /* - * A "mimimum viable" abort implementation: the command is mandatory in the + * A "minimum viable" abort implementation: the command is mandatory in the * spec, but we are not required to do any useful work. We couldn't really * do a useful abort, so don't bother even with waiting for the command * to be exectuted and return immediately telling the command to abort -- cgit v1.2.1 From f00c4d80ffdac9e3a64947ebd57489f3232d5a74 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 5 Nov 2017 10:36:31 +0300 Subject: block: pass full fmode_t to blk_verify_command Use the obvious calling convention. Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/scsi/sg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0419c2298eab..92fd870e1315 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -217,7 +217,7 @@ static int sg_allow_access(struct file *filp, unsigned char *cmd) if (sfp->parentdp->device->type == TYPE_SCANNER) return 0; - return blk_verify_command(cmd, filp->f_mode & FMODE_WRITE); + return blk_verify_command(cmd, filp->f_mode); } static int -- cgit v1.2.1 From a47619b5c6ead6c2e10b8114a5e9291a1679a7c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Wed, 8 Nov 2017 10:59:03 +0100 Subject: nvme: compare NQN string with right size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Copy subnqns using NVMF_NQN_SIZE as it is < 256 Signed-off-by: Javier González Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 65fd2fc1ae3c..8a60cc786220 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1810,7 +1810,7 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { - strcpy(ctrl->subnqn, id->subnqn); + strncpy(ctrl->subnqn, id->subnqn, NVMF_NQN_SIZE); return; } -- cgit v1.2.1 From ab083b11f6f43af071f9b507e48d212bfbb2beb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Wed, 8 Nov 2017 10:59:04 +0100 Subject: nvme: fix eui_show() print format MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix print formatting, but keep the original output to prevent user breakage as suggested by Joe Perches. Signed-off-by: Javier González Reviewed-by: Sagi Grimberg Reviewed-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8a60cc786220..6c79b73fe2c3 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2190,7 +2190,7 @@ static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8phd\n", ns->eui); + return sprintf(buf, "%8ph\n", ns->eui); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); -- cgit v1.2.1 From dafc040ba6d813df225a25f8b4bc3eae2207a6be Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 8 Nov 2017 12:00:29 +0200 Subject: nvmet: remove redundant memset if failed to get_smart_log failed We already allocated the buffer with kzalloc. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 9a7d6917e190..dd087be395b9 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -144,10 +144,8 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) } smart_log = buf; status = nvmet_get_smart_log(req, smart_log); - if (status) { - memset(buf, '\0', data_len); + if (status) goto err; - } break; case NVME_LOG_FW_SLOT: /* -- cgit v1.2.1 From 4185f25acbc2a191341c7d8cf51e449698eceb06 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 8 Nov 2017 12:00:30 +0200 Subject: nvmet: remove redundant local variable the status is either success or some status id and we don't need a local variable for it. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/admin-cmd.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index dd087be395b9..90dcdc40ac71 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -35,17 +35,14 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd) static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, struct nvme_smart_log *slog) { - u16 status; struct nvmet_ns *ns; u64 host_reads, host_writes, data_units_read, data_units_written; - status = NVME_SC_SUCCESS; ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); if (!ns) { - status = NVME_SC_INVALID_NS; pr_err("nvmet : Could not find namespace id : %d\n", le32_to_cpu(req->cmd->get_log_page.nsid)); - goto out; + return NVME_SC_INVALID_NS; } host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); @@ -58,20 +55,18 @@ static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, put_unaligned_le64(host_writes, &slog->host_writes[0]); put_unaligned_le64(data_units_written, &slog->data_units_written[0]); nvmet_put_namespace(ns); -out: - return status; + + return NVME_SC_SUCCESS; } static u16 nvmet_get_smart_log_all(struct nvmet_req *req, struct nvme_smart_log *slog) { - u16 status; u64 host_reads = 0, host_writes = 0; u64 data_units_read = 0, data_units_written = 0; struct nvmet_ns *ns; struct nvmet_ctrl *ctrl; - status = NVME_SC_SUCCESS; ctrl = req->sq->ctrl; rcu_read_lock(); @@ -91,7 +86,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req, put_unaligned_le64(host_writes, &slog->host_writes[0]); put_unaligned_le64(data_units_written, &slog->data_units_written[0]); - return status; + return NVME_SC_SUCCESS; } static u16 nvmet_get_smart_log(struct nvmet_req *req, -- cgit v1.2.1 From 38dabe210fbab4e7e8a03670ab3ba42f247ea08f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:10 -0700 Subject: nvme: centralize AEN defines All the transports were unnecessarilly duplicating the AEN request accounting. This patch defines everything in one place. Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fc.c | 33 ++++++++++++--------------------- drivers/nvme/host/nvme.h | 1 - drivers/nvme/host/pci.c | 16 +++------------- drivers/nvme/host/rdma.c | 14 +++----------- drivers/nvme/target/loop.c | 14 +++----------- 6 files changed, 22 insertions(+), 58 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6c79b73fe2c3..315087dfcd8d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2779,7 +2779,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_queue_async_events(struct nvme_ctrl *ctrl) { - ctrl->event_limit = NVME_NR_AERS; + ctrl->event_limit = NVME_NR_AEN_COMMANDS; queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_queue_async_events); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index da9296a1eb26..f3bfad47a95f 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -30,15 +30,6 @@ /* *************************** Data Structures/Defines ****************** */ -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_FC_NR_AEN_COMMANDS 1 -#define NVME_FC_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_FC_NR_AEN_COMMANDS) -#define AEN_CMDID_BASE (NVME_FC_AQ_BLKMQ_DEPTH + 1) - enum nvme_fc_queue_flags { NVME_FC_Q_CONNECTED = (1 << 0), }; @@ -170,7 +161,7 @@ struct nvme_fc_ctrl { u32 iocnt; wait_queue_head_t ioabort_wait; - struct nvme_fc_fcp_op aen_ops[NVME_FC_NR_AEN_COMMANDS]; + struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS]; struct nvme_ctrl ctrl; }; @@ -1546,7 +1537,7 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) unsigned long flags; int i, ret; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE) continue; @@ -1816,7 +1807,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) int i, ret; aen_op = ctrl->aen_ops; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { private = kzalloc(ctrl->lport->ops->fcprqst_priv_sz, GFP_KERNEL); if (!private) @@ -1826,7 +1817,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) sqe = &cmdiu->sqe; ret = __nvme_fc_init_request(ctrl, &ctrl->queues[0], aen_op, (struct request *)NULL, - (AEN_CMDID_BASE + i)); + (NVME_AQ_BLK_MQ_DEPTH + i)); if (ret) { kfree(private); return ret; @@ -1839,7 +1830,7 @@ nvme_fc_init_aen_ops(struct nvme_fc_ctrl *ctrl) memset(sqe, 0, sizeof(*sqe)); sqe->common.opcode = nvme_admin_async_event; /* Note: core layer may overwrite the sqe.command_id value */ - sqe->common.command_id = AEN_CMDID_BASE + i; + sqe->common.command_id = NVME_AQ_BLK_MQ_DEPTH + i; } return 0; } @@ -1851,7 +1842,7 @@ nvme_fc_term_aen_ops(struct nvme_fc_ctrl *ctrl) int i; aen_op = ctrl->aen_ops; - for (i = 0; i < NVME_FC_NR_AEN_COMMANDS; i++, aen_op++) { + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { if (!aen_op->fcp_req.private) continue; @@ -2402,7 +2393,7 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) bool terminating = false; blk_status_t ret; - if (aer_idx > NVME_FC_NR_AEN_COMMANDS) + if (aer_idx > NVME_NR_AEN_COMMANDS) return; spin_lock_irqsave(&ctrl->lock, flags); @@ -2722,16 +2713,16 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the admin queue */ - nvme_fc_init_queue(ctrl, 0, NVME_FC_AQ_BLKMQ_DEPTH); + nvme_fc_init_queue(ctrl, 0, NVME_AQ_BLK_MQ_DEPTH); ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, - NVME_FC_AQ_BLKMQ_DEPTH); + NVME_AQ_BLK_MQ_DEPTH); if (ret) goto out_free_queue; ret = nvme_fc_connect_admin_queue(ctrl, &ctrl->queues[0], - NVME_FC_AQ_BLKMQ_DEPTH, - (NVME_FC_AQ_BLKMQ_DEPTH / 4)); + NVME_AQ_BLK_MQ_DEPTH, + (NVME_AQ_BLK_MQ_DEPTH / 4)); if (ret) goto out_delete_hw_queue; @@ -3145,7 +3136,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_fc_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_FC_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* fabric connect + Keep-Alive */ ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_fc_fcp_op) + diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index f7c21cea9485..a6d750cfa6b2 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -313,7 +313,6 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl); int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, bool send); -#define NVME_NR_AERS 1 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res); void nvme_queue_async_events(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 32c413ec818c..c3dfd84feef7 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -35,12 +35,6 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AERS) - #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) static int use_threaded_interrupts; @@ -956,7 +950,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, * for them but rather special case them here. */ if (unlikely(nvmeq->qid == 0 && - cqe->command_id >= NVME_AQ_BLKMQ_DEPTH)) { + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { nvme_complete_async_event(&nvmeq->dev->ctrl, cqe->status, &cqe->result); return; @@ -1057,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = NVME_AQ_BLKMQ_DEPTH + aer_idx; + c.common.command_id = NVME_AQ_BLK_MQ_DEPTH + aer_idx; spin_lock_irq(&nvmeq->q_lock); __nvme_submit_cmd(nvmeq, &c); @@ -1524,11 +1518,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; - /* - * Subtract one to leave an empty queue entry for 'Full Queue' - * condition. See NVM-Express 1.2 specification, section 4.1.2. - */ - dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; + dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 32e21ab1ae52..008791bbdfb3 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -41,14 +41,6 @@ #define NVME_RDMA_MAX_INLINE_SEGMENTS 1 -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_RDMA_NR_AEN_COMMANDS 1 -#define NVME_RDMA_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS) - struct nvme_rdma_device { struct ib_device *dev; struct ib_pd *pd; @@ -690,7 +682,7 @@ static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl, set = &ctrl->admin_tag_set; memset(set, 0, sizeof(*set)); set->ops = &nvme_rdma_admin_mq_ops; - set->queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH; + set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; set->reserved_tags = 2; /* connect + keep-alive */ set->numa_node = NUMA_NO_NODE; set->cmd_size = sizeof(struct nvme_rdma_request) + @@ -1318,7 +1310,7 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) memset(cmd, 0, sizeof(*cmd)); cmd->common.opcode = nvme_admin_async_event; - cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH; + cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH; cmd->common.flags |= NVME_CMD_SGL_METABUF; nvme_rdma_set_sg_null(cmd); @@ -1380,7 +1372,7 @@ static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag) * for them but rather special case them here. */ if (unlikely(nvme_rdma_queue_idx(queue) == 0 && - cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH)) + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index bc95c6ed531a..7258b796f209 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -23,14 +23,6 @@ #define NVME_LOOP_MAX_SEGMENTS 256 -/* - * We handle AEN commands ourselves and don't even let the - * block layer know about them. - */ -#define NVME_LOOP_NR_AEN_COMMANDS 1 -#define NVME_LOOP_AQ_BLKMQ_DEPTH \ - (NVME_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) - struct nvme_loop_iod { struct nvme_request nvme_req; struct nvme_command cmd; @@ -112,7 +104,7 @@ static void nvme_loop_queue_response(struct nvmet_req *req) * for them but rather special case them here. */ if (unlikely(nvme_loop_queue_idx(queue) == 0 && - cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { + cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) { nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); } else { @@ -200,7 +192,7 @@ static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) memset(&iod->cmd, 0, sizeof(iod->cmd)); iod->cmd.common.opcode = nvme_admin_async_event; - iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH; + iod->cmd.common.command_id = NVME_AQ_BLK_MQ_DEPTH; iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, @@ -356,7 +348,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; - ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.queue_depth = NVME_AQ_MQ_TAG_DEPTH; ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + -- cgit v1.2.1 From 08e1507544839b98fc3732aea935e70ed9c209ec Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:11 -0700 Subject: nvme-fc: remove unused "queue_size" field This was being saved in a structure, but never used anywhere. The queue size is obtained through other means, so there's no reason to duplicate this without a user for it. Reviewed-by: Sagi Grimberg Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Reviewed-by: James Smart Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/fc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index f3bfad47a95f..3aa595029192 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -43,7 +43,6 @@ struct nvme_fc_queue { struct device *dev; struct blk_mq_hw_ctx *hctx; void *lldd_handle; - int queue_size; size_t cmnd_capsule_len; u32 qnum; u32 rqcnt; @@ -1886,7 +1885,7 @@ nvme_fc_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, } static void -nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size) +nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx) { struct nvme_fc_queue *queue; @@ -1902,8 +1901,6 @@ nvme_fc_init_queue(struct nvme_fc_ctrl *ctrl, int idx, size_t queue_size) else queue->cmnd_capsule_len = sizeof(struct nvme_command); - queue->queue_size = queue_size; - /* * Considered whether we should allocate buffers for all SQEs * and CQEs and dma map them - mapping their respective entries @@ -2027,7 +2024,7 @@ nvme_fc_init_io_queues(struct nvme_fc_ctrl *ctrl) int i; for (i = 1; i < ctrl->ctrl.queue_count; i++) - nvme_fc_init_queue(ctrl, i, ctrl->ctrl.sqsize); + nvme_fc_init_queue(ctrl, i); } static void @@ -2713,7 +2710,7 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * Create the admin queue */ - nvme_fc_init_queue(ctrl, 0, NVME_AQ_BLK_MQ_DEPTH); + nvme_fc_init_queue(ctrl, 0); ret = __nvme_fc_create_hw_queue(ctrl, &ctrl->queues[0], 0, NVME_AQ_BLK_MQ_DEPTH); -- cgit v1.2.1 From ad22c355b707a8d8d48e282aadc01c0b0604b2e9 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:12 -0700 Subject: nvme: remove handling of multiple AEN requests The driver can handle tracking only one AEN request, so this patch removes handling for multiple ones. Reviewed-by: Christoph Hellwig Reviewed-by: James Smart Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 28 +++------------------------- drivers/nvme/host/fc.c | 9 +++------ drivers/nvme/host/nvme.h | 3 +-- drivers/nvme/host/pci.c | 4 ++-- drivers/nvme/host/rdma.c | 5 +---- drivers/nvme/target/loop.c | 2 +- 6 files changed, 11 insertions(+), 40 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 315087dfcd8d..dedbf12847b6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2670,15 +2670,7 @@ static void nvme_async_event_work(struct work_struct *work) struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, async_event_work); - spin_lock_irq(&ctrl->lock); - while (ctrl->state == NVME_CTRL_LIVE && ctrl->event_limit > 0) { - int aer_idx = --ctrl->event_limit; - - spin_unlock_irq(&ctrl->lock); - ctrl->ops->submit_async_event(ctrl, aer_idx); - spin_lock_irq(&ctrl->lock); - } - spin_unlock_irq(&ctrl->lock); + ctrl->ops->submit_async_event(ctrl); } static bool nvme_ctrl_pp_status(struct nvme_ctrl *ctrl) @@ -2745,22 +2737,8 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res) { u32 result = le32_to_cpu(res->u32); - bool done = true; - switch (le16_to_cpu(status) >> 1) { - case NVME_SC_SUCCESS: - done = false; - /*FALLTHRU*/ - case NVME_SC_ABORT_REQ: - ++ctrl->event_limit; - if (ctrl->state == NVME_CTRL_LIVE) - queue_work(nvme_wq, &ctrl->async_event_work); - break; - default: - break; - } - - if (done) + if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) return; switch (result & 0xff07) { @@ -2774,12 +2752,12 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, default: dev_warn(ctrl->device, "async event result %08x\n", result); } + queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_complete_async_event); void nvme_queue_async_events(struct nvme_ctrl *ctrl) { - ctrl->event_limit = NVME_NR_AEN_COMMANDS; queue_work(nvme_wq, &ctrl->async_event_work); } EXPORT_SYMBOL_GPL(nvme_queue_async_events); diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 3aa595029192..6eb460b117d6 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2382,7 +2382,7 @@ nvme_fc_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) } static void -nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +nvme_fc_submit_async_event(struct nvme_ctrl *arg) { struct nvme_fc_ctrl *ctrl = to_fc_ctrl(arg); struct nvme_fc_fcp_op *aen_op; @@ -2390,9 +2390,6 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) bool terminating = false; blk_status_t ret; - if (aer_idx > NVME_NR_AEN_COMMANDS) - return; - spin_lock_irqsave(&ctrl->lock, flags); if (ctrl->flags & FCCTRL_TERMIO) terminating = true; @@ -2401,13 +2398,13 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg, int aer_idx) if (terminating) return; - aen_op = &ctrl->aen_ops[aer_idx]; + aen_op = &ctrl->aen_ops[0]; ret = nvme_fc_start_fcp_op(ctrl, aen_op->queue, aen_op, 0, NVMEFC_FCP_NODATA); if (ret) dev_err(ctrl->ctrl.device, - "failed async event work [%d]\n", aer_idx); + "failed async event work\n"); } static void diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index a6d750cfa6b2..b55c97ecea31 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -162,7 +162,6 @@ struct nvme_ctrl { u16 nssa; u16 nr_streams; atomic_t abort_limit; - u8 event_limit; u8 vwc; u32 vs; u32 sgls; @@ -237,7 +236,7 @@ struct nvme_ctrl_ops { int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); void (*free_ctrl)(struct nvme_ctrl *ctrl); - void (*submit_async_event)(struct nvme_ctrl *ctrl, int aer_idx); + void (*submit_async_event)(struct nvme_ctrl *ctrl); void (*delete_ctrl)(struct nvme_ctrl *ctrl); int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size); int (*reinit_request)(void *data, struct request *rq); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index c3dfd84feef7..429d56f1a19e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1043,7 +1043,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return __nvme_poll(nvmeq, tag); } -static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) +static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); struct nvme_queue *nvmeq = dev->queues[0]; @@ -1051,7 +1051,7 @@ static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl, int aer_idx) memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = NVME_AQ_BLK_MQ_DEPTH + aer_idx; + c.common.command_id = NVME_AQ_BLK_MQ_DEPTH; spin_lock_irq(&nvmeq->q_lock); __nvme_submit_cmd(nvmeq, &c); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 008791bbdfb3..c8d854474a5b 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1293,7 +1293,7 @@ static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue) return queue->ctrl->tag_set.tags[queue_idx - 1]; } -static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg) { struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg); struct nvme_rdma_queue *queue = &ctrl->queues[0]; @@ -1303,9 +1303,6 @@ static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx) struct ib_sge sge; int ret; - if (WARN_ON_ONCE(aer_idx != 0)) - return; - ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE); memset(cmd, 0, sizeof(*cmd)); diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 7258b796f209..f40e70eb4a38 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -184,7 +184,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_STS_OK; } -static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +static void nvme_loop_submit_async_event(struct nvme_ctrl *arg) { struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); struct nvme_loop_queue *queue = &ctrl->queues[0]; -- cgit v1.2.1 From d99ca609a1b55f87a5e62a11ed70e4d091d815f0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:13 -0700 Subject: nvme: unexport starting async event work Async event work is for core use only and should not be called directly from drivers. Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 8 +------- drivers/nvme/host/nvme.h | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index dedbf12847b6..c135d0aeebd7 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2756,12 +2756,6 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, } EXPORT_SYMBOL_GPL(nvme_complete_async_event); -void nvme_queue_async_events(struct nvme_ctrl *ctrl) -{ - queue_work(nvme_wq, &ctrl->async_event_work); -} -EXPORT_SYMBOL_GPL(nvme_queue_async_events); - void nvme_stop_ctrl(struct nvme_ctrl *ctrl) { nvme_stop_keep_alive(ctrl); @@ -2778,7 +2772,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_queue_async_events(ctrl); + queue_work(nvme_wq, &ctrl->async_event_work); nvme_start_queues(ctrl); } } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b55c97ecea31..151062573ece 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -314,7 +314,6 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len, void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, union nvme_result *res); -void nvme_queue_async_events(struct nvme_ctrl *ctrl); void nvme_stop_queues(struct nvme_ctrl *ctrl); void nvme_start_queues(struct nvme_ctrl *ctrl); -- cgit v1.2.1 From e3d7874dcf175cca2dca7795d6453f637ad8ba9b Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 7 Nov 2017 15:13:14 -0700 Subject: nvme: send uevent for some asynchronous events This will give udev a chance to observe and handle asynchronous event notifications and clear the log to unmask future events of the same type. The driver will create a change uevent of the asyncronuos event result before submitting the next AEN request to the device if a completed AEN event is of type error, smart, command set or vendor specific, Signed-off-by: Keith Busch Reviewed-by: Guan Junxiong Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 28 ++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 1 + 2 files changed, 29 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c135d0aeebd7..683d890d73fa 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2665,11 +2665,28 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); +static void nvme_aen_uevent(struct nvme_ctrl *ctrl) +{ + char *envp[2] = { NULL, NULL }; + u32 aen_result = ctrl->aen_result; + + ctrl->aen_result = 0; + if (!aen_result) + return; + + envp[0] = kasprintf(GFP_KERNEL, "NVME_AEN=%#08x", aen_result); + if (!envp[0]) + return; + kobject_uevent_env(&ctrl->device->kobj, KOBJ_CHANGE, envp); + kfree(envp[0]); +} + static void nvme_async_event_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, async_event_work); + nvme_aen_uevent(ctrl); ctrl->ops->submit_async_event(ctrl); } @@ -2741,6 +2758,17 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status, if (le16_to_cpu(status) >> 1 != NVME_SC_SUCCESS) return; + switch (result & 0x7) { + case NVME_AER_ERROR: + case NVME_AER_SMART: + case NVME_AER_CSS: + case NVME_AER_VS: + ctrl->aen_result = result; + break; + default: + break; + } + switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: dev_info(ctrl->device, "rescanning\n"); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 151062573ece..7b9cc7d616b7 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -168,6 +168,7 @@ struct nvme_ctrl { u16 kas; u8 npss; u8 apsta; + u32 aen_result; unsigned int shutdown_timeout; unsigned int kato; bool subsystem; -- cgit v1.2.1 From 03e0f3a65e4da497c3b7b213c68943cbc73a2e34 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 9 Nov 2017 19:32:07 +0800 Subject: nvme-pci: avoid dereference of symbol from unloaded module The 'remove_work' may be scheduled to run after nvme_remove() returns since we can't simply cancel it in nvme_remove() for avoiding deadlock. Once nvme_remove() returns, this module(nvme) can be unloaded. On the other hand, nvme_put_ctrl() calls ctr->ops->free_ctrl which may point to nvme_pci_free_ctrl() in unloaded module. This patch avoids this issue by queuing 'remove_work' via 'nvme_wq', and flush this worqueue in nvme_exit() as suggested by Sagi. Suggested-by: Sagi Grimberg Signed-off-by: Ming Lei Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/pci.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 429d56f1a19e..762b8402e04c 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2280,7 +2280,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) nvme_get_ctrl(&dev->ctrl); nvme_dev_disable(dev, false); - if (!schedule_work(&dev->remove_work)) + if (!queue_work(nvme_wq, &dev->remove_work)) nvme_put_ctrl(&dev->ctrl); } @@ -2703,6 +2703,7 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); + flush_workqueue(nvme_wq); _nvme_check_size(); } -- cgit v1.2.1 From 5e62d5c993e6889cd314d5b5de6b670152109a0e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 14:29:58 +0100 Subject: nvmet: better data length validation Currently the NVMe target stores the expexted data length in req->data_len and uses that for data transfer decisions, but that does not take the actual transfer length in the SGLs into account. So this adds a new transfer_len field, into which the transport drivers store the actual transfer length. We then check the two match before actually executing the command. The FC transport driver already had such a field, which is removed in favour of the common one. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/core.c | 10 ++++++++++ drivers/nvme/target/fc.c | 32 ++++++++++++-------------------- drivers/nvme/target/loop.c | 3 ++- drivers/nvme/target/nvmet.h | 4 ++++ drivers/nvme/target/rdma.c | 10 ++++++---- 5 files changed, 34 insertions(+), 25 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index da088293eafc..22a2a2bb40f9 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -501,6 +501,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->ops = ops; req->sg = NULL; req->sg_cnt = 0; + req->transfer_len = 0; req->rsp->status = 0; /* no support for fused commands yet */ @@ -550,6 +551,15 @@ void nvmet_req_uninit(struct nvmet_req *req) } EXPORT_SYMBOL_GPL(nvmet_req_uninit); +void nvmet_req_execute(struct nvmet_req *req) +{ + if (unlikely(req->data_len != req->transfer_len)) + nvmet_req_complete(req, NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR); + else + req->execute(req); +} +EXPORT_SYMBOL_GPL(nvmet_req_execute); + static inline bool nvmet_cc_en(u32 cc) { return (cc >> NVME_CC_EN_SHIFT) & 0x1; diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c index 5f2570f5dfa9..739b8feadc7d 100644 --- a/drivers/nvme/target/fc.c +++ b/drivers/nvme/target/fc.c @@ -76,7 +76,6 @@ struct nvmet_fc_fcp_iod { dma_addr_t rspdma; struct scatterlist *data_sg; int data_sg_cnt; - u32 total_length; u32 offset; enum nvmet_fcp_datadir io_dir; bool active; @@ -1700,7 +1699,7 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod) u32 page_len, length; int i = 0; - length = fod->total_length; + length = fod->req.transfer_len; nent = DIV_ROUND_UP(length, PAGE_SIZE); sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); if (!sg) @@ -1789,7 +1788,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, u32 rsn, rspcnt, xfr_length; if (fod->fcpreq->op == NVMET_FCOP_READDATA_RSP) - xfr_length = fod->total_length; + xfr_length = fod->req.transfer_len; else xfr_length = fod->offset; @@ -1815,7 +1814,7 @@ nvmet_fc_prep_fcp_rsp(struct nvmet_fc_tgtport *tgtport, rspcnt = atomic_inc_return(&fod->queue->zrspcnt); if (!(rspcnt % fod->queue->ersp_ratio) || sqe->opcode == nvme_fabrics_command || - xfr_length != fod->total_length || + xfr_length != fod->req.transfer_len || (le16_to_cpu(cqe->status) & 0xFFFE) || cqewd[0] || cqewd[1] || (sqe->flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND)) || queue_90percent_full(fod->queue, le16_to_cpu(cqe->sq_head))) @@ -1892,7 +1891,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, fcpreq->timeout = NVME_FC_TGTOP_TIMEOUT_SEC; tlen = min_t(u32, tgtport->max_sg_cnt * PAGE_SIZE, - (fod->total_length - fod->offset)); + (fod->req.transfer_len - fod->offset)); fcpreq->transfer_length = tlen; fcpreq->transferred_length = 0; fcpreq->fcp_error = 0; @@ -1906,7 +1905,7 @@ nvmet_fc_transfer_fcp_data(struct nvmet_fc_tgtport *tgtport, * combined xfr with response. */ if ((op == NVMET_FCOP_READDATA) && - ((fod->offset + fcpreq->transfer_length) == fod->total_length) && + ((fod->offset + fcpreq->transfer_length) == fod->req.transfer_len) && (tgtport->ops->target_features & NVMET_FCTGTFEAT_READDATA_RSP)) { fcpreq->op = NVMET_FCOP_READDATA_RSP; nvmet_fc_prep_fcp_rsp(tgtport, fod); @@ -1986,7 +1985,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } fod->offset += fcpreq->transferred_length; - if (fod->offset != fod->total_length) { + if (fod->offset != fod->req.transfer_len) { spin_lock_irqsave(&fod->flock, flags); fod->writedataactive = true; spin_unlock_irqrestore(&fod->flock, flags); @@ -1998,9 +1997,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } /* data transfer complete, resume with nvmet layer */ - - fod->req.execute(&fod->req); - + nvmet_req_execute(&fod->req); break; case NVMET_FCOP_READDATA: @@ -2023,7 +2020,7 @@ nvmet_fc_fod_op_done(struct nvmet_fc_fcp_iod *fod) } fod->offset += fcpreq->transferred_length; - if (fod->offset != fod->total_length) { + if (fod->offset != fod->req.transfer_len) { /* transfer the next chunk */ nvmet_fc_transfer_fcp_data(tgtport, fod, NVMET_FCOP_READDATA); @@ -2160,7 +2157,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->fcpreq->done = nvmet_fc_xmt_fcp_op_done; - fod->total_length = be32_to_cpu(cmdiu->data_len); + fod->req.transfer_len = be32_to_cpu(cmdiu->data_len); if (cmdiu->flags & FCNVME_CMD_FLAGS_WRITE) { fod->io_dir = NVMET_FCP_WRITE; if (!nvme_is_write(&cmdiu->sqe)) @@ -2171,7 +2168,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, goto transport_error; } else { fod->io_dir = NVMET_FCP_NODATA; - if (fod->total_length) + if (fod->req.transfer_len) goto transport_error; } @@ -2179,9 +2176,6 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, fod->req.rsp = &fod->rspiubuf.cqe; fod->req.port = fod->queue->port; - /* ensure nvmet handlers will set cmd handler callback */ - fod->req.execute = NULL; - /* clear any response payload */ memset(&fod->rspiubuf, 0, sizeof(fod->rspiubuf)); @@ -2201,7 +2195,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, /* keep a running counter of tail position */ atomic_inc(&fod->queue->sqtail); - if (fod->total_length) { + if (fod->req.transfer_len) { ret = nvmet_fc_alloc_tgt_pgs(fod); if (ret) { nvmet_req_complete(&fod->req, ret); @@ -2224,9 +2218,7 @@ nvmet_fc_handle_fcp_rqst(struct nvmet_fc_tgtport *tgtport, * can invoke the nvmet_layer now. If read data, cmd completion will * push the data */ - - fod->req.execute(&fod->req); - + nvmet_req_execute(&fod->req); return; transport_error: diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f40e70eb4a38..96d390416789 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -127,7 +127,7 @@ static void nvme_loop_execute_work(struct work_struct *work) struct nvme_loop_iod *iod = container_of(work, struct nvme_loop_iod, work); - iod->req.execute(&iod->req); + nvmet_req_execute(&iod->req); } static enum blk_eh_timer_return @@ -176,6 +176,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, iod->req.sg = iod->sg_table.sgl; iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + iod->req.transfer_len = blk_rq_bytes(req); } blk_mq_start_request(req); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index e342f02845c1..194ebffc688c 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -223,7 +223,10 @@ struct nvmet_req { struct bio inline_bio; struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; int sg_cnt; + /* data length as parsed from the command: */ size_t data_len; + /* data length as parsed from the SGL descriptor: */ + size_t transfer_len; struct nvmet_port *port; @@ -266,6 +269,7 @@ u16 nvmet_parse_fabrics_cmd(struct nvmet_req *req); bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); void nvmet_req_uninit(struct nvmet_req *req); +void nvmet_req_execute(struct nvmet_req *req); void nvmet_req_complete(struct nvmet_req *req, u16 status); void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3333d417b248..49912909c298 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -148,14 +148,14 @@ static inline u32 get_unaligned_le24(const u8 *p) static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) { return nvme_is_write(rsp->req.cmd) && - rsp->req.data_len && + rsp->req.transfer_len && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) { return !nvme_is_write(rsp->req.cmd) && - rsp->req.data_len && + rsp->req.transfer_len && !rsp->req.rsp->status && !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); } @@ -577,7 +577,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) return; } - rsp->req.execute(&rsp->req); + nvmet_req_execute(&rsp->req); } static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, @@ -609,6 +609,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) nvmet_rdma_use_inline_sg(rsp, len, off); rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; + rsp->req.transfer_len += len; return 0; } @@ -636,6 +637,7 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, nvmet_data_dir(&rsp->req)); if (ret < 0) return NVME_SC_INTERNAL; + rsp->req.transfer_len += len; rsp->n_rdma += ret; if (invalidate) { @@ -693,7 +695,7 @@ static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) queue->cm_id->port_num, &rsp->read_cqe, NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); } else { - rsp->req.execute(&rsp->req); + nvmet_req_execute(&rsp->req); } return true; -- cgit v1.2.1 From e454d122e22826589cfa08788a802ab09d4fae24 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 14:29:27 +0100 Subject: nvmet: kill nvmet_inline_bio_init Much easier to just opencode this helper. Also use ARRAY_SIZE instead of passing the inline bvec array size manually. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/target/io-cmd.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index db632818777d..0a4372a016f2 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -33,18 +33,11 @@ static inline u32 nvmet_rw_len(struct nvmet_req *req) req->ns->blksize_shift; } -static void nvmet_inline_bio_init(struct nvmet_req *req) -{ - struct bio *bio = &req->inline_bio; - - bio_init(bio, req->inline_bvec, NVMET_MAX_INLINE_BIOVEC); -} - static void nvmet_execute_rw(struct nvmet_req *req) { int sg_cnt = req->sg_cnt; + struct bio *bio = &req->inline_bio; struct scatterlist *sg; - struct bio *bio; sector_t sector; blk_qc_t cookie; int op, op_flags = 0, i; @@ -66,8 +59,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) sector = le64_to_cpu(req->cmd->rw.slba); sector <<= (req->ns->blksize_shift - 9); - nvmet_inline_bio_init(req); - bio = &req->inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); bio_set_dev(bio, req->ns->bdev); bio->bi_iter.bi_sector = sector; bio->bi_private = req; @@ -99,11 +91,9 @@ static void nvmet_execute_rw(struct nvmet_req *req) static void nvmet_execute_flush(struct nvmet_req *req) { - struct bio *bio; - - nvmet_inline_bio_init(req); - bio = &req->inline_bio; + struct bio *bio = &req->inline_bio; + bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec)); bio_set_dev(bio, req->ns->bdev); bio->bi_private = req; bio->bi_end_io = nvmet_bio_done; -- cgit v1.2.1 From 039c635f4e666b647df2100038de276a83fb3fca Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:56 -0800 Subject: ide, scsi: Tell the block layer at request allocation time about preempt requests Convert blk_get_request(q, op, __GFP_RECLAIM) into blk_get_request_flags(q, op, BLK_MQ_PREEMPT). This patch does not change any functionality. Signed-off-by: Bart Van Assche Tested-by: Martin Steigerwald Acked-by: David S. Miller [ for IDE ] Acked-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Tested-by: Oleksandr Natalenko Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/ide/ide-pm.c | 4 ++-- drivers/scsi/scsi_lib.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 544f02d673ca..f56d742908df 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -89,9 +89,9 @@ int generic_ide_resume(struct device *dev) } memset(&rqpm, 0, sizeof(rqpm)); - rq = blk_get_request(drive->queue, REQ_OP_DRV_IN, __GFP_RECLAIM); + rq = blk_get_request_flags(drive->queue, REQ_OP_DRV_IN, + BLK_MQ_REQ_PREEMPT); ide_req(rq)->type = ATA_PRIV_PM_RESUME; - rq->rq_flags |= RQF_PREEMPT; rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 286ea983c9e3..eb129dfc2ebe 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -252,9 +252,9 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, struct scsi_request *rq; int ret = DRIVER_ERROR << 24; - req = blk_get_request(sdev->request_queue, + req = blk_get_request_flags(sdev->request_queue, data_direction == DMA_TO_DEVICE ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, __GFP_RECLAIM); + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, BLK_MQ_REQ_PREEMPT); if (IS_ERR(req)) return ret; rq = scsi_req(req); @@ -268,7 +268,7 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, rq->retries = retries; req->timeout = timeout; req->cmd_flags |= flags; - req->rq_flags |= rq_flags | RQF_QUIET | RQF_PREEMPT; + req->rq_flags |= rq_flags | RQF_QUIET; /* * head injection *required* here otherwise quiesce won't work -- cgit v1.2.1 From 3a0a529971ec4e2d933e9c7798db101dfb6b1aec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:58 -0800 Subject: block, scsi: Make SCSI quiesce and resume work reliably The contexts from which a SCSI device can be quiesced or resumed are: * Writing into /sys/class/scsi_device/*/device/state. * SCSI parallel (SPI) domain validation. * The SCSI device power management methods. See also scsi_bus_pm_ops. It is essential during suspend and resume that neither the filesystem state nor the filesystem metadata in RAM changes. This is why while the hibernation image is being written or restored that SCSI devices are quiesced. The SCSI core quiesces devices through scsi_device_quiesce() and scsi_device_resume(). In the SDEV_QUIESCE state execution of non-preempt requests is deferred. This is realized by returning BLKPREP_DEFER from inside scsi_prep_state_check() for quiesced SCSI devices. Avoid that a full queue prevents power management requests to be submitted by deferring allocation of non-preempt requests for devices in the quiesced state. This patch has been tested by running the following commands and by verifying that after each resume the fio job was still running: for ((i=0; i<10; i++)); do ( cd /sys/block/md0/md && while true; do [ "$( sync_action sleep 1 done ) & pids=($!) for d in /sys/class/block/sd*[a-z]; do bdev=${d#/sys/class/block/} hcil=$(readlink "$d/device") hcil=${hcil#../../../} echo 4 > "$d/queue/nr_requests" echo 1 > "/sys/class/scsi_device/$hcil/device/queue_depth" fio --name="$bdev" --filename="/dev/$bdev" --buffered=0 --bs=512 \ --rw=randread --ioengine=libaio --numjobs=4 --iodepth=16 \ --iodepth_batch=1 --thread --loops=$((2**31)) & pids+=($!) done sleep 1 echo "$(date) Hibernating ..." >>hibernate-test-log.txt systemctl hibernate sleep 10 kill "${pids[@]}" echo idle > /sys/block/md0/md/sync_action wait echo "$(date) Done." >>hibernate-test-log.txt done Reported-by: Oleksandr Natalenko References: "I/O hangs after resuming from suspend-to-ram" (https://marc.info/?l=linux-block&m=150340235201348). Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Martin Steigerwald Tested-by: Oleksandr Natalenko Cc: Martin K. Petersen Cc: Ming Lei Cc: Christoph Hellwig Cc: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index eb129dfc2ebe..f907e2f8c1dd 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2947,21 +2947,37 @@ static void scsi_wait_for_queuecommand(struct scsi_device *sdev) int scsi_device_quiesce(struct scsi_device *sdev) { + struct request_queue *q = sdev->request_queue; int err; + /* + * It is allowed to call scsi_device_quiesce() multiple times from + * the same context but concurrent scsi_device_quiesce() calls are + * not allowed. + */ + WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); + + blk_set_preempt_only(q); + + blk_mq_freeze_queue(q); + /* + * Ensure that the effect of blk_set_preempt_only() will be visible + * for percpu_ref_tryget() callers that occur after the queue + * unfreeze even if the queue was already frozen before this function + * was called. See also https://lwn.net/Articles/573497/. + */ + synchronize_rcu(); + blk_mq_unfreeze_queue(q); + mutex_lock(&sdev->state_mutex); err = scsi_device_set_state(sdev, SDEV_QUIESCE); + if (err == 0) + sdev->quiesced_by = current; + else + blk_clear_preempt_only(q); mutex_unlock(&sdev->state_mutex); - if (err) - return err; - - scsi_run_queue(sdev->request_queue); - while (atomic_read(&sdev->device_busy)) { - msleep_interruptible(200); - scsi_run_queue(sdev->request_queue); - } - return 0; + return err; } EXPORT_SYMBOL(scsi_device_quiesce); @@ -2981,9 +2997,11 @@ void scsi_device_resume(struct scsi_device *sdev) * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); - if (sdev->sdev_state == SDEV_QUIESCE && - scsi_device_set_state(sdev, SDEV_RUNNING) == 0) - scsi_run_queue(sdev->request_queue); + WARN_ON_ONCE(!sdev->quiesced_by); + sdev->quiesced_by = NULL; + blk_clear_preempt_only(sdev->request_queue); + if (sdev->sdev_state == SDEV_QUIESCE) + scsi_device_set_state(sdev, SDEV_RUNNING); mutex_unlock(&sdev->state_mutex); } EXPORT_SYMBOL(scsi_device_resume); -- cgit v1.2.1 From 9a95e4ef709533efac4aafcb8bddf73f96db50ed Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 9 Nov 2017 10:49:59 -0800 Subject: block, nvme: Introduce blk_mq_req_flags_t Several block layer and NVMe core functions accept a combination of BLK_MQ_REQ_* flags through the 'flags' argument but there is no verification at compile time whether the right type of block layer flags is passed. Make it possible for sparse to verify this. This patch does not change any functionality. Signed-off-by: Bart Van Assche Reviewed-by: Hannes Reinecke Tested-by: Oleksandr Natalenko Cc: linux-nvme@lists.infradead.org Cc: Christoph Hellwig Cc: Johannes Thumshirn Cc: Ming Lei Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 5 +++-- drivers/nvme/host/nvme.h | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 683d890d73fa..878d5c09d15b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -305,7 +305,7 @@ static void nvme_put_ns(struct nvme_ns *ns) } struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid) + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid) { unsigned op = nvme_is_write(cmd) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN; struct request *req; @@ -573,7 +573,8 @@ EXPORT_SYMBOL_GPL(nvme_setup_cmd); */ int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags) + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags) { struct request *req; int ret; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7b9cc7d616b7..23b8504ace7f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -327,14 +327,15 @@ int nvme_reinit_tagset(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set); #define NVME_QID_ANY -1 struct request *nvme_alloc_request(struct request_queue *q, - struct nvme_command *cmd, unsigned int flags, int qid); + struct nvme_command *cmd, blk_mq_req_flags_t flags, int qid); blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req, struct nvme_command *cmd); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, union nvme_result *result, void *buffer, unsigned bufflen, - unsigned timeout, int qid, int at_head, int flags); + unsigned timeout, int qid, int at_head, + blk_mq_req_flags_t flags); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); -- cgit v1.2.1 From ab9e00cc72fa4c6eb6370757b9e94c4645e91d5d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:48:55 +0100 Subject: nvme: track subsystems This adds a new nvme_subsystem structure so that we can track multiple controllers that belong to a single subsystem. For now we only use it to store the NQN, and to check that we don't have duplicate NQNs unless the involved subsystems support multiple controllers. Includes code originally from Hannes Reinecke to expose the subsystems in sysfs. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 210 ++++++++++++++++++++++++++++++++++++++------ drivers/nvme/host/fabrics.c | 4 +- drivers/nvme/host/nvme.h | 27 ++++-- 3 files changed, 205 insertions(+), 36 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 878d5c09d15b..78dc6f624d52 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -68,9 +68,14 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); struct workqueue_struct *nvme_wq; EXPORT_SYMBOL_GPL(nvme_wq); +static DEFINE_IDA(nvme_subsystems_ida); +static LIST_HEAD(nvme_subsystems); +static DEFINE_MUTEX(nvme_subsystems_lock); + static DEFINE_IDA(nvme_instance_ida); static dev_t nvme_chr_devt; static struct class *nvme_class; +static struct class *nvme_subsys_class; static void nvme_ns_remove(struct nvme_ns *ns); static int nvme_revalidate_disk(struct gendisk *disk); @@ -1804,14 +1809,15 @@ static bool quirk_matches(const struct nvme_id_ctrl *id, string_matches(id->fr, q->fr, sizeof(id->fr)); } -static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, + struct nvme_id_ctrl *id) { size_t nqnlen; int off; nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { - strncpy(ctrl->subnqn, id->subnqn, NVMF_NQN_SIZE); + strncpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); return; } @@ -1819,14 +1825,140 @@ static void nvme_init_subnqn(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ - off = snprintf(ctrl->subnqn, NVMF_NQN_SIZE, + off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, "nqn.2014.08.org.nvmexpress:%4x%4x", le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); - memcpy(ctrl->subnqn + off, id->sn, sizeof(id->sn)); + memcpy(subsys->subnqn + off, id->sn, sizeof(id->sn)); off += sizeof(id->sn); - memcpy(ctrl->subnqn + off, id->mn, sizeof(id->mn)); + memcpy(subsys->subnqn + off, id->mn, sizeof(id->mn)); off += sizeof(id->mn); - memset(ctrl->subnqn + off, 0, sizeof(ctrl->subnqn) - off); + memset(subsys->subnqn + off, 0, sizeof(subsys->subnqn) - off); +} + +static void __nvme_release_subsystem(struct nvme_subsystem *subsys) +{ + ida_simple_remove(&nvme_subsystems_ida, subsys->instance); + kfree(subsys); +} + +static void nvme_release_subsystem(struct device *dev) +{ + __nvme_release_subsystem(container_of(dev, struct nvme_subsystem, dev)); +} + +static void nvme_destroy_subsystem(struct kref *ref) +{ + struct nvme_subsystem *subsys = + container_of(ref, struct nvme_subsystem, ref); + + mutex_lock(&nvme_subsystems_lock); + list_del(&subsys->entry); + mutex_unlock(&nvme_subsystems_lock); + + device_del(&subsys->dev); + put_device(&subsys->dev); +} + +static void nvme_put_subsystem(struct nvme_subsystem *subsys) +{ + kref_put(&subsys->ref, nvme_destroy_subsystem); +} + +static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) +{ + struct nvme_subsystem *subsys; + + lockdep_assert_held(&nvme_subsystems_lock); + + list_for_each_entry(subsys, &nvme_subsystems, entry) { + if (strcmp(subsys->subnqn, subsysnqn)) + continue; + if (!kref_get_unless_zero(&subsys->ref)) + continue; + return subsys; + } + + return NULL; +} + +static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) +{ + struct nvme_subsystem *subsys, *found; + int ret; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return -ENOMEM; + ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL); + if (ret < 0) { + kfree(subsys); + return ret; + } + subsys->instance = ret; + mutex_init(&subsys->lock); + kref_init(&subsys->ref); + INIT_LIST_HEAD(&subsys->ctrls); + nvme_init_subnqn(subsys, ctrl, id); + memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); + memcpy(subsys->model, id->mn, sizeof(subsys->model)); + memcpy(subsys->firmware_rev, id->fr, sizeof(subsys->firmware_rev)); + subsys->vendor_id = le16_to_cpu(id->vid); + subsys->cmic = id->cmic; + + subsys->dev.class = nvme_subsys_class; + subsys->dev.release = nvme_release_subsystem; + dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); + device_initialize(&subsys->dev); + + mutex_lock(&nvme_subsystems_lock); + found = __nvme_find_get_subsystem(subsys->subnqn); + if (found) { + /* + * Verify that the subsystem actually supports multiple + * controllers, else bail out. + */ + if (!(id->cmic & (1 << 1))) { + dev_err(ctrl->device, + "ignoring ctrl due to duplicate subnqn (%s).\n", + found->subnqn); + nvme_put_subsystem(found); + ret = -EINVAL; + goto out_unlock; + } + + __nvme_release_subsystem(subsys); + subsys = found; + } else { + ret = device_add(&subsys->dev); + if (ret) { + dev_err(ctrl->device, + "failed to register subsystem device.\n"); + goto out_unlock; + } + list_add_tail(&subsys->entry, &nvme_subsystems); + } + + ctrl->subsys = subsys; + mutex_unlock(&nvme_subsystems_lock); + + if (sysfs_create_link(&subsys->dev.kobj, &ctrl->device->kobj, + dev_name(ctrl->device))) { + dev_err(ctrl->device, + "failed to create sysfs link from subsystem.\n"); + /* the transport driver will eventually put the subsystem */ + return -EINVAL; + } + + mutex_lock(&subsys->lock); + list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + mutex_unlock(&subsys->lock); + + return 0; + +out_unlock: + mutex_unlock(&nvme_subsystems_lock); + put_device(&subsys->dev); + return ret; } static int nvme_get_log(struct nvme_ctrl *ctrl, u8 log_page, void *log, @@ -1901,9 +2033,13 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) return ret; } - nvme_init_subnqn(ctrl, id); - if (!ctrl->identified) { + int i; + + ret = nvme_init_subsystem(ctrl, id); + if (ret) + goto out_free; + /* * Check for quirks. Quirk can depend on firmware version, * so, in principle, the set of quirks present can change @@ -1912,9 +2048,6 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) * the device, but we'd have to make sure that the driver * behaves intelligently if the quirks change. */ - - int i; - for (i = 0; i < ARRAY_SIZE(core_quirks); i++) { if (quirk_matches(id, &core_quirks[i])) ctrl->quirks |= core_quirks[i].quirks; @@ -1927,14 +2060,10 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) } ctrl->oacs = le16_to_cpu(id->oacs); - ctrl->vid = le16_to_cpu(id->vid); ctrl->oncs = le16_to_cpup(&id->oncs); atomic_set(&ctrl->abort_limit, id->acl + 1); ctrl->vwc = id->vwc; ctrl->cntlid = le16_to_cpup(&id->cntlid); - memcpy(ctrl->serial, id->sn, sizeof(id->sn)); - memcpy(ctrl->model, id->mn, sizeof(id->mn)); - memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); if (id->mdts) max_hw_sectors = 1 << (id->mdts + page_shift - 9); else @@ -2137,9 +2266,9 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ctrl *ctrl = ns->ctrl; - int serial_len = sizeof(ctrl->serial); - int model_len = sizeof(ctrl->model); + struct nvme_subsystem *subsys = ns->ctrl->subsys; + int serial_len = sizeof(subsys->serial); + int model_len = sizeof(subsys->model); if (!uuid_is_null(&ns->uuid)) return sprintf(buf, "uuid.%pU\n", &ns->uuid); @@ -2150,15 +2279,16 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) return sprintf(buf, "eui.%8phN\n", ns->eui); - while (serial_len > 0 && (ctrl->serial[serial_len - 1] == ' ' || - ctrl->serial[serial_len - 1] == '\0')) + while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || + subsys->serial[serial_len - 1] == '\0')) serial_len--; - while (model_len > 0 && (ctrl->model[model_len - 1] == ' ' || - ctrl->model[model_len - 1] == '\0')) + while (model_len > 0 && (subsys->model[model_len - 1] == ' ' || + subsys->model[model_len - 1] == '\0')) model_len--; - return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", ctrl->vid, - serial_len, ctrl->serial, model_len, ctrl->model, ns->ns_id); + return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, + serial_len, subsys->serial, model_len, subsys->model, + ns->ns_id); } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); @@ -2244,10 +2374,15 @@ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ - return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ + return sprintf(buf, "%.*s\n", \ + (int)sizeof(ctrl->subsys->field), ctrl->subsys->field); \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); +nvme_show_str_function(model); +nvme_show_str_function(serial); +nvme_show_str_function(firmware_rev); + #define nvme_show_int_function(field) \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -2257,9 +2392,6 @@ static ssize_t field##_show(struct device *dev, \ } \ static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); -nvme_show_str_function(model); -nvme_show_str_function(serial); -nvme_show_str_function(firmware_rev); nvme_show_int_function(cntlid); static ssize_t nvme_sysfs_delete(struct device *dev, @@ -2313,7 +2445,7 @@ static ssize_t nvme_sysfs_show_subsysnqn(struct device *dev, { struct nvme_ctrl *ctrl = dev_get_drvdata(dev); - return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subnqn); + return snprintf(buf, PAGE_SIZE, "%s\n", ctrl->subsys->subnqn); } static DEVICE_ATTR(subsysnqn, S_IRUGO, nvme_sysfs_show_subsysnqn, NULL); @@ -2817,12 +2949,23 @@ static void nvme_free_ctrl(struct device *dev) { struct nvme_ctrl *ctrl = container_of(dev, struct nvme_ctrl, ctrl_device); + struct nvme_subsystem *subsys = ctrl->subsys; ida_simple_remove(&nvme_instance_ida, ctrl->instance); ida_destroy(&ctrl->ns_ida); kfree(ctrl->effects); + if (subsys) { + mutex_lock(&subsys->lock); + list_del(&ctrl->subsys_entry); + mutex_unlock(&subsys->lock); + sysfs_remove_link(&subsys->dev.kobj, dev_name(ctrl->device)); + } + ctrl->ops->free_ctrl(ctrl); + + if (subsys) + nvme_put_subsystem(subsys); } /* @@ -3022,8 +3165,15 @@ int __init nvme_core_init(void) goto unregister_chrdev; } + nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); + if (IS_ERR(nvme_subsys_class)) { + result = PTR_ERR(nvme_subsys_class); + goto destroy_class; + } return 0; +destroy_class: + class_destroy(nvme_class); unregister_chrdev: unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_wq: @@ -3033,6 +3183,8 @@ destroy_wq: void nvme_core_exit(void) { + ida_destroy(&nvme_subsystems_ida); + class_destroy(nvme_subsys_class); class_destroy(nvme_class); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); destroy_workqueue(nvme_wq); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index a4967de5bb25..76b4fe6816a0 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -882,10 +882,10 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count) goto out_unlock; } - if (strcmp(ctrl->subnqn, opts->subsysnqn)) { + if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) { dev_warn(ctrl->device, "controller returned incorrect NQN: \"%s\".\n", - ctrl->subnqn); + ctrl->subsys->subnqn); up_read(&nvmf_transports_rwsem); nvme_delete_ctrl_sync(ctrl); return ERR_PTR(-EINVAL); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 23b8504ace7f..6ae50979c3aa 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -140,13 +140,12 @@ struct nvme_ctrl { struct work_struct reset_work; struct work_struct delete_work; + struct nvme_subsystem *subsys; + struct list_head subsys_entry; + struct opal_dev *opal_dev; char name[12]; - char serial[20]; - char model[40]; - char firmware_rev[8]; - char subnqn[NVMF_NQN_SIZE]; u16 cntlid; u32 ctrl_config; @@ -157,7 +156,6 @@ struct nvme_ctrl { u32 page_size; u32 max_hw_sectors; u16 oncs; - u16 vid; u16 oacs; u16 nssa; u16 nr_streams; @@ -200,6 +198,25 @@ struct nvme_ctrl { struct nvmf_ctrl_options *opts; }; +struct nvme_subsystem { + int instance; + struct device dev; + /* + * Because we unregister the device on the last put we need + * a separate refcount. + */ + struct kref ref; + struct list_head entry; + struct mutex lock; + struct list_head ctrls; + char subnqn[NVMF_NQN_SIZE]; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u8 cmic; + u16 vendor_id; +}; + struct nvme_ns { struct list_head list; -- cgit v1.2.1 From 002fab040468f5b279f8d8d49d487e73b0c3f98a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:50:16 +0100 Subject: nvme: introduce a nvme_ns_ids structure This allows us to manage the various uniqueue namespace identifiers together instead needing various variables and arguments. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 69 +++++++++++++++++++++++++++--------------------- drivers/nvme/host/nvme.h | 14 +++++++--- 2 files changed, 49 insertions(+), 34 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 78dc6f624d52..04c0949c754a 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -797,7 +797,7 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) } static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, - u8 *eui64, u8 *nguid, uuid_t *uuid) + struct nvme_ns_ids *ids) { struct nvme_command c = { }; int status; @@ -833,7 +833,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, goto free_data; } len = NVME_NIDT_EUI64_LEN; - memcpy(eui64, data + pos + sizeof(*cur), len); + memcpy(ids->eui64, data + pos + sizeof(*cur), len); break; case NVME_NIDT_NGUID: if (cur->nidl != NVME_NIDT_NGUID_LEN) { @@ -843,7 +843,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, goto free_data; } len = NVME_NIDT_NGUID_LEN; - memcpy(nguid, data + pos + sizeof(*cur), len); + memcpy(ids->nguid, data + pos + sizeof(*cur), len); break; case NVME_NIDT_UUID: if (cur->nidl != NVME_NIDT_UUID_LEN) { @@ -853,7 +853,7 @@ static int nvme_identify_ns_descs(struct nvme_ctrl *ctrl, unsigned nsid, goto free_data; } len = NVME_NIDT_UUID_LEN; - uuid_copy(uuid, data + pos + sizeof(*cur)); + uuid_copy(&ids->uuid, data + pos + sizeof(*cur)); break; default: /* Skip unnkown types */ @@ -1233,22 +1233,31 @@ static void nvme_config_discard(struct nvme_ctrl *ctrl, } static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, - struct nvme_id_ns *id, u8 *eui64, u8 *nguid, uuid_t *uuid) + struct nvme_id_ns *id, struct nvme_ns_ids *ids) { + memset(ids, 0, sizeof(*ids)); + if (ctrl->vs >= NVME_VS(1, 1, 0)) - memcpy(eui64, id->eui64, sizeof(id->eui64)); + memcpy(ids->eui64, id->eui64, sizeof(id->eui64)); if (ctrl->vs >= NVME_VS(1, 2, 0)) - memcpy(nguid, id->nguid, sizeof(id->nguid)); + memcpy(ids->nguid, id->nguid, sizeof(id->nguid)); if (ctrl->vs >= NVME_VS(1, 3, 0)) { /* Don't treat error as fatal we potentially * already have a NGUID or EUI-64 */ - if (nvme_identify_ns_descs(ctrl, nsid, eui64, nguid, uuid)) + if (nvme_identify_ns_descs(ctrl, nsid, ids)) dev_warn(ctrl->device, "%s: Identify Descriptors failed\n", __func__); } } +static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) +{ + return uuid_equal(&a->uuid, &b->uuid) && + memcmp(&a->nguid, &b->nguid, sizeof(a->nguid)) == 0 && + memcmp(&a->eui64, &b->eui64, sizeof(a->eui64)) == 0; +} + static void nvme_update_disk_info(struct gendisk *disk, struct nvme_ns *ns, struct nvme_id_ns *id) { @@ -1304,8 +1313,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) struct nvme_ns *ns = disk->private_data; struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ns *id; - u8 eui64[8] = { 0 }, nguid[16] = { 0 }; - uuid_t uuid = uuid_null; + struct nvme_ns_ids ids; int ret = 0; if (test_bit(NVME_NS_DEAD, &ns->flags)) { @@ -1322,10 +1330,8 @@ static int nvme_revalidate_disk(struct gendisk *disk) goto out; } - nvme_report_ns_ids(ctrl, ns->ns_id, id, eui64, nguid, &uuid); - if (!uuid_equal(&ns->uuid, &uuid) || - memcmp(&ns->nguid, &nguid, sizeof(ns->nguid)) || - memcmp(&ns->eui, &eui64, sizeof(ns->eui))) { + nvme_report_ns_ids(ctrl, ns->ns_id, id, &ids); + if (!nvme_ns_ids_equal(&ns->ids, &ids)) { dev_err(ctrl->device, "identifiers changed for nsid %d\n", ns->ns_id); ret = -ENODEV; @@ -2266,18 +2272,19 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_ids *ids = &ns->ids; struct nvme_subsystem *subsys = ns->ctrl->subsys; int serial_len = sizeof(subsys->serial); int model_len = sizeof(subsys->model); - if (!uuid_is_null(&ns->uuid)) - return sprintf(buf, "uuid.%pU\n", &ns->uuid); + if (!uuid_is_null(&ids->uuid)) + return sprintf(buf, "uuid.%pU\n", &ids->uuid); - if (memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) - return sprintf(buf, "eui.%16phN\n", ns->nguid); + if (memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) + return sprintf(buf, "eui.%16phN\n", ids->nguid); - if (memchr_inv(ns->eui, 0, sizeof(ns->eui))) - return sprintf(buf, "eui.%8phN\n", ns->eui); + if (memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) + return sprintf(buf, "eui.%8phN\n", ids->eui64); while (serial_len > 0 && (subsys->serial[serial_len - 1] == ' ' || subsys->serial[serial_len - 1] == '\0')) @@ -2296,7 +2303,7 @@ static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->nguid); + return sprintf(buf, "%pU\n", ns->ids.nguid); } static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); @@ -2304,16 +2311,17 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_ids *ids = &ns->ids; /* For backward compatibility expose the NGUID to userspace if * we have no UUID set */ - if (uuid_is_null(&ns->uuid)) { + if (uuid_is_null(&ids->uuid)) { printk_ratelimited(KERN_WARNING "No UUID available providing old NGUID\n"); - return sprintf(buf, "%pU\n", ns->nguid); + return sprintf(buf, "%pU\n", ids->nguid); } - return sprintf(buf, "%pU\n", &ns->uuid); + return sprintf(buf, "%pU\n", &ids->uuid); } static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); @@ -2321,7 +2329,7 @@ static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8ph\n", ns->eui); + return sprintf(buf, "%8ph\n", ns->ids.eui64); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); @@ -2347,18 +2355,19 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, { struct device *dev = container_of(kobj, struct device, kobj); struct nvme_ns *ns = nvme_get_ns_from_dev(dev); + struct nvme_ns_ids *ids = &ns->ids; if (a == &dev_attr_uuid.attr) { - if (uuid_is_null(&ns->uuid) || - !memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + if (uuid_is_null(&ids->uuid) || + !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) return 0; } if (a == &dev_attr_nguid.attr) { - if (!memchr_inv(ns->nguid, 0, sizeof(ns->nguid))) + if (!memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) return 0; } if (a == &dev_attr_eui.attr) { - if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) + if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64))) return 0; } return a->mode; @@ -2591,7 +2600,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (id->ncap == 0) goto out_free_id; - nvme_report_ns_ids(ctrl, ns->ns_id, id, ns->eui, ns->nguid, &ns->uuid); + nvme_report_ns_ids(ctrl, ns->ns_id, id, &ns->ids); if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk_name, node)) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6ae50979c3aa..2f1af915f93a 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -217,6 +217,15 @@ struct nvme_subsystem { u16 vendor_id; }; +/* + * Container structure for uniqueue namespace identifiers. + */ +struct nvme_ns_ids { + u8 eui64[8]; + u8 nguid[16]; + uuid_t uuid; +}; + struct nvme_ns { struct list_head list; @@ -227,11 +236,8 @@ struct nvme_ns { struct kref kref; int instance; - u8 eui[8]; - u8 nguid[16]; - uuid_t uuid; - unsigned ns_id; + struct nvme_ns_ids ids; int lba_shift; u16 ms; u16 sgs; -- cgit v1.2.1 From ed754e5deeb17f4e675c84e4b6c640cc7344e498 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:50:43 +0100 Subject: nvme: track shared namespaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a new struct nvme_ns_head that holds information about an actual namespace, unlike struct nvme_ns, which only holds the per-controller namespace information. For private namespaces there is a 1:1 relation of the two, but for shared namespaces this lets us discover all the paths to it. For now only the identifiers are moved to the new structure, but most of the information in struct nvme_ns should eventually move over. To allow lockless path lookup the list of nvme_ns structures per nvme_ns_head is protected by SRCU, which requires freeing the nvme_ns structure through call_srcu. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Javier González Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 220 +++++++++++++++++++++++++++++++++++-------- drivers/nvme/host/lightnvm.c | 14 +-- drivers/nvme/host/nvme.h | 26 ++++- 3 files changed, 210 insertions(+), 50 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 04c0949c754a..13676f6cd4f6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -291,6 +291,22 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); +static void nvme_free_ns_head(struct kref *ref) +{ + struct nvme_ns_head *head = + container_of(ref, struct nvme_ns_head, ref); + + ida_simple_remove(&head->subsys->ns_ida, head->instance); + list_del_init(&head->entry); + cleanup_srcu_struct(&head->srcu); + kfree(head); +} + +static void nvme_put_ns_head(struct nvme_ns_head *head) +{ + kref_put(&head->ref, nvme_free_ns_head); +} + static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); @@ -299,7 +315,7 @@ static void nvme_free_ns(struct kref *kref) nvme_nvm_unregister(ns); put_disk(ns->disk); - ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); + nvme_put_ns_head(ns->head); nvme_put_ctrl(ns->ctrl); kfree(ns); } @@ -435,7 +451,7 @@ static inline void nvme_setup_flush(struct nvme_ns *ns, { memset(cmnd, 0, sizeof(*cmnd)); cmnd->common.opcode = nvme_cmd_flush; - cmnd->common.nsid = cpu_to_le32(ns->ns_id); + cmnd->common.nsid = cpu_to_le32(ns->head->ns_id); } static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, @@ -466,7 +482,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.nsid = cpu_to_le32(ns->head->ns_id); cmnd->dsm.nr = cpu_to_le32(segments - 1); cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); @@ -495,7 +511,7 @@ static inline blk_status_t nvme_setup_rw(struct nvme_ns *ns, memset(cmnd, 0, sizeof(*cmnd)); cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); - cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.nsid = cpu_to_le32(ns->head->ns_id); cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); @@ -987,7 +1003,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) memset(&c, 0, sizeof(c)); c.rw.opcode = io.opcode; c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.nsid = cpu_to_le32(ns->head->ns_id); c.rw.slba = cpu_to_le64(io.slba); c.rw.length = cpu_to_le16(io.nblocks); c.rw.control = cpu_to_le16(io.control); @@ -1130,7 +1146,7 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); - return ns->ns_id; + return ns->head->ns_id; case NVME_IOCTL_ADMIN_CMD: return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: @@ -1251,6 +1267,13 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, } } +static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) +{ + return !uuid_is_null(&ids->uuid) || + memchr_inv(ids->nguid, 0, sizeof(ids->nguid)) || + memchr_inv(ids->eui64, 0, sizeof(ids->eui64)); +} + static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) { return uuid_equal(&a->uuid, &b->uuid) && @@ -1321,7 +1344,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) return -ENODEV; } - id = nvme_identify_ns(ctrl, ns->ns_id); + id = nvme_identify_ns(ctrl, ns->head->ns_id); if (!id) return -ENODEV; @@ -1330,10 +1353,10 @@ static int nvme_revalidate_disk(struct gendisk *disk) goto out; } - nvme_report_ns_ids(ctrl, ns->ns_id, id, &ids); - if (!nvme_ns_ids_equal(&ns->ids, &ids)) { + nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { dev_err(ctrl->device, - "identifiers changed for nsid %d\n", ns->ns_id); + "identifiers changed for nsid %d\n", ns->head->ns_id); ret = -ENODEV; } @@ -1374,7 +1397,7 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10, memset(&c, 0, sizeof(c)); c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.nsid = cpu_to_le32(ns->head->ns_id); c.common.cdw10[0] = cpu_to_le32(cdw10); return nvme_submit_sync_cmd(ns->queue, &c, data, 16); @@ -1861,6 +1884,7 @@ static void nvme_destroy_subsystem(struct kref *ref) list_del(&subsys->entry); mutex_unlock(&nvme_subsystems_lock); + ida_destroy(&subsys->ns_ida); device_del(&subsys->dev); put_device(&subsys->dev); } @@ -1904,6 +1928,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) mutex_init(&subsys->lock); kref_init(&subsys->ref); INIT_LIST_HEAD(&subsys->ctrls); + INIT_LIST_HEAD(&subsys->nsheads); nvme_init_subnqn(subsys, ctrl, id); memcpy(subsys->serial, id->sn, sizeof(subsys->serial)); memcpy(subsys->model, id->mn, sizeof(subsys->model)); @@ -1941,6 +1966,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) "failed to register subsystem device.\n"); goto out_unlock; } + ida_init(&subsys->ns_ida); list_add_tail(&subsys->entry, &nvme_subsystems); } @@ -2272,7 +2298,7 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->ids; + struct nvme_ns_ids *ids = &ns->head->ids; struct nvme_subsystem *subsys = ns->ctrl->subsys; int serial_len = sizeof(subsys->serial); int model_len = sizeof(subsys->model); @@ -2295,7 +2321,7 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, serial_len, subsys->serial, model_len, subsys->model, - ns->ns_id); + ns->head->ns_id); } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); @@ -2303,7 +2329,7 @@ static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->ids.nguid); + return sprintf(buf, "%pU\n", ns->head->ids.nguid); } static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); @@ -2311,7 +2337,7 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->ids; + struct nvme_ns_ids *ids = &ns->head->ids; /* For backward compatibility expose the NGUID to userspace if * we have no UUID set @@ -2329,7 +2355,7 @@ static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8ph\n", ns->ids.eui64); + return sprintf(buf, "%8ph\n", ns->head->ids.eui64); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); @@ -2337,7 +2363,7 @@ static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, char *buf) { struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%d\n", ns->ns_id); + return sprintf(buf, "%d\n", ns->head->ns_id); } static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); @@ -2355,7 +2381,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, { struct device *dev = container_of(kobj, struct device, kobj); struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->ids; + struct nvme_ns_ids *ids = &ns->head->ids; if (a == &dev_attr_uuid.attr) { if (uuid_is_null(&ids->uuid) || @@ -2507,12 +2533,124 @@ static const struct attribute_group *nvme_dev_attr_groups[] = { NULL, }; +static struct nvme_ns_head *__nvme_find_ns_head(struct nvme_subsystem *subsys, + unsigned nsid) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(h, &subsys->nsheads, entry) { + if (h->ns_id == nsid && kref_get_unless_zero(&h->ref)) + return h; + } + + return NULL; +} + +static int __nvme_check_ids(struct nvme_subsystem *subsys, + struct nvme_ns_head *new) +{ + struct nvme_ns_head *h; + + lockdep_assert_held(&subsys->lock); + + list_for_each_entry(h, &subsys->nsheads, entry) { + if (nvme_ns_ids_valid(&new->ids) && + nvme_ns_ids_equal(&new->ids, &h->ids)) + return -EINVAL; + } + + return 0; +} + +static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_id_ns *id) +{ + struct nvme_ns_head *head; + int ret = -ENOMEM; + + head = kzalloc(sizeof(*head), GFP_KERNEL); + if (!head) + goto out; + ret = ida_simple_get(&ctrl->subsys->ns_ida, 1, 0, GFP_KERNEL); + if (ret < 0) + goto out_free_head; + head->instance = ret; + INIT_LIST_HEAD(&head->list); + init_srcu_struct(&head->srcu); + head->subsys = ctrl->subsys; + head->ns_id = nsid; + kref_init(&head->ref); + + nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + + ret = __nvme_check_ids(ctrl->subsys, head); + if (ret) { + dev_err(ctrl->device, + "duplicate IDs for nsid %d\n", nsid); + goto out_cleanup_srcu; + } + + list_add_tail(&head->entry, &ctrl->subsys->nsheads); + return head; +out_cleanup_srcu: + cleanup_srcu_struct(&head->srcu); + ida_simple_remove(&ctrl->subsys->ns_ida, head->instance); +out_free_head: + kfree(head); +out: + return ERR_PTR(ret); +} + +static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, + struct nvme_id_ns *id, bool *new) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + bool is_shared = id->nmic & (1 << 0); + struct nvme_ns_head *head = NULL; + int ret = 0; + + mutex_lock(&ctrl->subsys->lock); + if (is_shared) + head = __nvme_find_ns_head(ctrl->subsys, nsid); + if (!head) { + head = nvme_alloc_ns_head(ctrl, nsid, id); + if (IS_ERR(head)) { + ret = PTR_ERR(head); + goto out_unlock; + } + + *new = true; + } else { + struct nvme_ns_ids ids; + + nvme_report_ns_ids(ctrl, nsid, id, &ids); + if (!nvme_ns_ids_equal(&head->ids, &ids)) { + dev_err(ctrl->device, + "IDs don't match for shared namespace %d\n", + nsid); + ret = -EINVAL; + goto out_unlock; + } + + *new = false; + } + + list_add_tail(&ns->siblings, &head->list); + ns->head = head; + +out_unlock: + mutex_unlock(&ctrl->subsys->lock); + return ret; +} + static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) { struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); - return nsa->ns_id - nsb->ns_id; + return nsa->head->ns_id - nsb->head->ns_id; } static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) @@ -2521,13 +2659,13 @@ static struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid) mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->ns_id == nsid) { + if (ns->head->ns_id == nsid) { if (!kref_get_unless_zero(&ns->kref)) continue; ret = ns; break; } - if (ns->ns_id > nsid) + if (ns->head->ns_id > nsid) break; } mutex_unlock(&ctrl->namespaces_mutex); @@ -2542,7 +2680,7 @@ static int nvme_setup_streams_ns(struct nvme_ctrl *ctrl, struct nvme_ns *ns) if (!ctrl->nr_streams) return 0; - ret = nvme_get_stream_params(ctrl, &s, ns->ns_id); + ret = nvme_get_stream_params(ctrl, &s, ns->head->ns_id); if (ret) return ret; @@ -2567,32 +2705,26 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; int node = dev_to_node(ctrl->dev); + bool new = true; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) return; - ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL); - if (ns->instance < 0) - goto out_free_ns; - ns->queue = blk_mq_init_queue(ctrl->tagset); if (IS_ERR(ns->queue)) - goto out_release_instance; + goto out_free_ns; queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); ns->queue->queuedata = ns; ns->ctrl = ctrl; kref_init(&ns->kref); - ns->ns_id = nsid; ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); nvme_setup_streams_ns(ctrl, ns); - sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); - id = nvme_identify_ns(ctrl, nsid); if (!id) goto out_free_queue; @@ -2600,18 +2732,21 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (id->ncap == 0) goto out_free_id; - nvme_report_ns_ids(ctrl, ns->ns_id, id, &ns->ids); + if (nvme_init_ns_head(ns, nsid, id, &new)) + goto out_free_id; + + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk_name, node)) { dev_warn(ctrl->device, "LightNVM init failure\n"); - goto out_free_id; + goto out_unlink_ns; } } disk = alloc_disk_node(0, node); if (!disk) - goto out_free_id; + goto out_unlink_ns; disk->fops = &nvme_fops; disk->private_data = ns; @@ -2639,18 +2774,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) pr_warn("%s: failed to register lightnvm sysfs group for identification\n", ns->disk->disk_name); return; + out_unlink_ns: + mutex_lock(&ctrl->subsys->lock); + list_del_rcu(&ns->siblings); + mutex_unlock(&ctrl->subsys->lock); out_free_id: kfree(id); out_free_queue: blk_cleanup_queue(ns->queue); - out_release_instance: - ida_simple_remove(&ctrl->ns_ida, ns->instance); out_free_ns: kfree(ns); } static void nvme_ns_remove(struct nvme_ns *ns) { + struct nvme_ns_head *head = ns->head; + if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; @@ -2665,10 +2804,16 @@ static void nvme_ns_remove(struct nvme_ns *ns) blk_cleanup_queue(ns->queue); } + mutex_lock(&ns->ctrl->subsys->lock); + if (head) + list_del_rcu(&ns->siblings); + mutex_unlock(&ns->ctrl->subsys->lock); + mutex_lock(&ns->ctrl->namespaces_mutex); list_del_init(&ns->list); mutex_unlock(&ns->ctrl->namespaces_mutex); + synchronize_srcu(&head->srcu); nvme_put_ns(ns); } @@ -2691,7 +2836,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, struct nvme_ns *ns, *next; list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { - if (ns->ns_id > nsid) + if (ns->head->ns_id > nsid) nvme_ns_remove(ns); } } @@ -2961,7 +3106,6 @@ static void nvme_free_ctrl(struct device *dev) struct nvme_subsystem *subsys = ctrl->subsys; ida_simple_remove(&nvme_instance_ida, ctrl->instance); - ida_destroy(&ctrl->ns_ida); kfree(ctrl->effects); if (subsys) { @@ -3022,8 +3166,6 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, if (ret) goto out_free_name; - ida_init(&ctrl->ns_ida); - /* * Initialize latency tolerance controls. The sysfs files won't * be visible to userspace unless the device actually supports APST. diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 8fc949c5b49b..ba3d7f3349e5 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -305,7 +305,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) int ret; c.identity.opcode = nvme_nvm_admin_identity; - c.identity.nsid = cpu_to_le32(ns->ns_id); + c.identity.nsid = cpu_to_le32(ns->head->ns_id); c.identity.chnl_off = 0; nvme_nvm_id = kmalloc(sizeof(struct nvme_nvm_id), GFP_KERNEL); @@ -344,7 +344,7 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, int ret = 0; c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl; - c.l2p.nsid = cpu_to_le32(ns->ns_id); + c.l2p.nsid = cpu_to_le32(ns->head->ns_id); entries = kmalloc(len, GFP_KERNEL); if (!entries) return -ENOMEM; @@ -402,7 +402,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, int ret = 0; c.get_bb.opcode = nvme_nvm_admin_get_bb_tbl; - c.get_bb.nsid = cpu_to_le32(ns->ns_id); + c.get_bb.nsid = cpu_to_le32(ns->head->ns_id); c.get_bb.spba = cpu_to_le64(ppa.ppa); bb_tbl = kzalloc(tblsz, GFP_KERNEL); @@ -452,7 +452,7 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr *ppas, int ret = 0; c.set_bb.opcode = nvme_nvm_admin_set_bb_tbl; - c.set_bb.nsid = cpu_to_le32(ns->ns_id); + c.set_bb.nsid = cpu_to_le32(ns->head->ns_id); c.set_bb.spba = cpu_to_le64(ppas->ppa); c.set_bb.nlb = cpu_to_le16(nr_ppas - 1); c.set_bb.value = type; @@ -469,7 +469,7 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns, struct nvme_nvm_command *c) { c->ph_rw.opcode = rqd->opcode; - c->ph_rw.nsid = cpu_to_le32(ns->ns_id); + c->ph_rw.nsid = cpu_to_le32(ns->head->ns_id); c->ph_rw.spba = cpu_to_le64(rqd->ppa_addr.ppa); c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list); c->ph_rw.control = cpu_to_le16(rqd->flags); @@ -731,7 +731,7 @@ static int nvme_nvm_submit_vio(struct nvme_ns *ns, memset(&c, 0, sizeof(c)); c.ph_rw.opcode = vio.opcode; - c.ph_rw.nsid = cpu_to_le32(ns->ns_id); + c.ph_rw.nsid = cpu_to_le32(ns->head->ns_id); c.ph_rw.control = cpu_to_le16(vio.control); c.ph_rw.length = cpu_to_le16(vio.nppas); @@ -768,7 +768,7 @@ static int nvme_nvm_user_vcmd(struct nvme_ns *ns, int admin, memset(&c, 0, sizeof(c)); c.common.opcode = vcmd.opcode; - c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.nsid = cpu_to_le32(ns->head->ns_id); c.common.cdw2[0] = cpu_to_le32(vcmd.cdw2); c.common.cdw2[1] = cpu_to_le32(vcmd.cdw3); /* cdw11-12 */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2f1af915f93a..6e5004b00975 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -136,7 +136,6 @@ struct nvme_ctrl { struct device ctrl_device; struct device *device; /* char device */ struct cdev cdev; - struct ida ns_ida; struct work_struct reset_work; struct work_struct delete_work; @@ -209,12 +208,14 @@ struct nvme_subsystem { struct list_head entry; struct mutex lock; struct list_head ctrls; + struct list_head nsheads; char subnqn[NVMF_NQN_SIZE]; char serial[20]; char model[40]; char firmware_rev[8]; u8 cmic; u16 vendor_id; + struct ida ns_ida; }; /* @@ -226,18 +227,35 @@ struct nvme_ns_ids { uuid_t uuid; }; +/* + * Anchor structure for namespaces. There is one for each namespace in a + * NVMe subsystem that any of our controllers can see, and the namespace + * structure for each controller is chained of it. For private namespaces + * there is a 1:1 relation to our namespace structures, that is ->list + * only ever has a single entry for private namespaces. + */ +struct nvme_ns_head { + struct list_head list; + struct srcu_struct srcu; + struct nvme_subsystem *subsys; + unsigned ns_id; + struct nvme_ns_ids ids; + struct list_head entry; + struct kref ref; + int instance; +}; + struct nvme_ns { struct list_head list; struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; + struct list_head siblings; struct nvm_dev *ndev; struct kref kref; - int instance; + struct nvme_ns_head *head; - unsigned ns_id; - struct nvme_ns_ids ids; int lba_shift; u16 ms; u16 sgs; -- cgit v1.2.1 From 32acab3181c7053c775ca128c3a5c6ce50197d7f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 2 Nov 2017 12:59:30 +0100 Subject: nvme: implement multipath access to nvme subsystems This patch adds native multipath support to the nvme driver. For each namespace we create only single block device node, which can be used to access that namespace through any of the controllers that refer to it. The gendisk for each controllers path to the name space still exists inside the kernel, but is hidden from userspace. The character device nodes are still available on a per-controller basis. A new link from the sysfs directory for the subsystem allows to find all controllers for a given subsystem. Currently we will always send I/O to the first available path, this will be changed once the NVMe Asynchronous Namespace Access (ANA) TP is ratified and implemented, at which point we will look at the ANA state for each namespace. Another possibility that was prototyped is to use the path that is closes to the submitting NUMA code, which will be mostly interesting for PCI, but might also be useful for RDMA or FC transports in the future. There is not plan to implement round robin or I/O service time path selectors, as those are not scalable with the performance rates provided by NVMe. The multipath device will go away once all paths to it disappear, any delay to keep it alive needs to be implemented at the controller level. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- drivers/nvme/host/Kconfig | 9 ++ drivers/nvme/host/Makefile | 1 + drivers/nvme/host/core.c | 147 +++++++++++++++++++++--- drivers/nvme/host/multipath.c | 255 ++++++++++++++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 57 ++++++++++ 5 files changed, 455 insertions(+), 14 deletions(-) create mode 100644 drivers/nvme/host/multipath.c (limited to 'drivers') diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 46d6cb1e03bd..b979cf3bce65 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -13,6 +13,15 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. +config NVME_MULTIPATH + bool "NVMe multipath support" + depends on NVME_CORE + ---help--- + This option enables support for multipath access to NVMe + subsystems. If this option is enabled only a single + /dev/nvmeXnY device will show up for each NVMe namespaces, + even if it is accessible through multiple controllers. + config NVME_FABRICS tristate diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index cc0aacb4c8b4..b856f2f549cd 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o obj-$(CONFIG_NVME_FC) += nvme-fc.o nvme-core-y := core.o +nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o nvme-core-$(CONFIG_NVM) += lightnvm.o nvme-y += pci.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 13676f6cd4f6..59f80a613fd8 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -185,17 +185,22 @@ static inline bool nvme_req_needs_retry(struct request *req) return false; if (nvme_req(req)->retries >= nvme_max_retries) return false; - if (blk_queue_dying(req->q)) - return false; return true; } void nvme_complete_rq(struct request *req) { if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) { - nvme_req(req)->retries++; - blk_mq_requeue_request(req, true); - return; + if (nvme_req_needs_failover(req)) { + nvme_failover_req(req); + return; + } + + if (!blk_queue_dying(req->q)) { + nvme_req(req)->retries++; + blk_mq_requeue_request(req, true); + return; + } } blk_mq_end_request(req, nvme_error_status(req)); @@ -286,7 +291,8 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, ctrl->state = new_state; spin_unlock_irqrestore(&ctrl->lock, flags); - + if (changed && ctrl->state == NVME_CTRL_LIVE) + nvme_kick_requeue_lists(ctrl); return changed; } EXPORT_SYMBOL_GPL(nvme_change_ctrl_state); @@ -296,6 +302,7 @@ static void nvme_free_ns_head(struct kref *ref) struct nvme_ns_head *head = container_of(ref, struct nvme_ns_head, ref); + nvme_mpath_remove_disk(head); ida_simple_remove(&head->subsys->ns_ida, head->instance); list_del_init(&head->entry); cleanup_srcu_struct(&head->srcu); @@ -1138,11 +1145,33 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, return status; } -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +/* + * Issue ioctl requests on the first available path. Note that unlike normal + * block layer requests we will not retry failed request on another controller. + */ +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk, + struct nvme_ns_head **head, int *srcu_idx) { - struct nvme_ns *ns = bdev->bd_disk->private_data; +#ifdef CONFIG_NVME_MULTIPATH + if (disk->fops == &nvme_ns_head_ops) { + *head = disk->private_data; + *srcu_idx = srcu_read_lock(&(*head)->srcu); + return nvme_find_path(*head); + } +#endif + *head = NULL; + *srcu_idx = -1; + return disk->private_data; +} + +static void nvme_put_ns_from_disk(struct nvme_ns_head *head, int idx) +{ + if (head) + srcu_read_unlock(&head->srcu, idx); +} +static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned cmd, unsigned long arg) +{ switch (cmd) { case NVME_IOCTL_ID: force_successful_syscall_return(); @@ -1165,10 +1194,31 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, } } +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; + int srcu_idx, ret; + + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + ret = -EWOULDBLOCK; + else + ret = nvme_ns_ioctl(ns, cmd, arg); + nvme_put_ns_from_disk(head, srcu_idx); + return ret; +} + static int nvme_open(struct block_device *bdev, fmode_t mode) { struct nvme_ns *ns = bdev->bd_disk->private_data; +#ifdef CONFIG_NVME_MULTIPATH + /* should never be called due to GENHD_FL_HIDDEN */ + if (WARN_ON_ONCE(ns->head->disk)) + return -ENXIO; +#endif if (!kref_get_unless_zero(&ns->kref)) return -ENXIO; return 0; @@ -1329,6 +1379,10 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->noiob) nvme_set_chunk_size(ns); nvme_update_disk_info(disk, ns, id); +#ifdef CONFIG_NVME_MULTIPATH + if (ns->head->disk) + nvme_update_disk_info(ns->head->disk, ns, id); +#endif } static int nvme_revalidate_disk(struct gendisk *disk) @@ -1388,8 +1442,10 @@ static char nvme_pr_type(enum pr_type type) static int nvme_pr_command(struct block_device *bdev, u32 cdw10, u64 key, u64 sa_key, u8 op) { - struct nvme_ns *ns = bdev->bd_disk->private_data; + struct nvme_ns_head *head = NULL; + struct nvme_ns *ns; struct nvme_command c; + int srcu_idx, ret; u8 data[16] = { 0, }; put_unaligned_le64(key, &data[0]); @@ -1397,10 +1453,16 @@ static int nvme_pr_command(struct block_device *bdev, u32 cdw10, memset(&c, 0, sizeof(c)); c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->head->ns_id); + c.common.nsid = cpu_to_le32(head->ns_id); c.common.cdw10[0] = cpu_to_le32(cdw10); - return nvme_submit_sync_cmd(ns->queue, &c, data, 16); + ns = nvme_get_ns_from_disk(bdev->bd_disk, &head, &srcu_idx); + if (unlikely(!ns)) + ret = -EWOULDBLOCK; + else + ret = nvme_submit_sync_cmd(ns->queue, &c, data, 16); + nvme_put_ns_from_disk(head, srcu_idx); + return ret; } static int nvme_pr_register(struct block_device *bdev, u64 old, @@ -1490,6 +1552,32 @@ static const struct block_device_operations nvme_fops = { .pr_ops = &nvme_pr_ops, }; +#ifdef CONFIG_NVME_MULTIPATH +static int nvme_ns_head_open(struct block_device *bdev, fmode_t mode) +{ + struct nvme_ns_head *head = bdev->bd_disk->private_data; + + if (!kref_get_unless_zero(&head->ref)) + return -ENXIO; + return 0; +} + +static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns_head(disk->private_data); +} + +const struct block_device_operations nvme_ns_head_ops = { + .owner = THIS_MODULE, + .open = nvme_ns_head_open, + .release = nvme_ns_head_release, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_ioctl, + .getgeo = nvme_getgeo, + .pr_ops = &nvme_pr_ops, +}; +#endif /* CONFIG_NVME_MULTIPATH */ + static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) { unsigned long timeout = @@ -2592,6 +2680,10 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, goto out_cleanup_srcu; } + ret = nvme_mpath_alloc_disk(ctrl, head); + if (ret) + goto out_cleanup_srcu; + list_add_tail(&head->entry, &ctrl->subsys->nsheads); return head; out_cleanup_srcu: @@ -2704,7 +2796,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) struct gendisk *disk; struct nvme_id_ns *id; char disk_name[DISK_NAME_LEN]; - int node = dev_to_node(ctrl->dev); + int node = dev_to_node(ctrl->dev), flags = GENHD_FL_EXT_DEVT; bool new = true; ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); @@ -2735,7 +2827,30 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (nvme_init_ns_head(ns, nsid, id, &new)) goto out_free_id; +#ifdef CONFIG_NVME_MULTIPATH + /* + * If multipathing is enabled we need to always use the subsystem + * instance number for numbering our devices to avoid conflicts + * between subsystems that have multiple controllers and thus use + * the multipath-aware subsystem node and those that have a single + * controller and use the controller node directly. + */ + if (ns->head->disk) { + sprintf(disk_name, "nvme%dc%dn%d", ctrl->subsys->instance, + ctrl->cntlid, ns->head->instance); + flags = GENHD_FL_HIDDEN; + } else { + sprintf(disk_name, "nvme%dn%d", ctrl->subsys->instance, + ns->head->instance); + } +#else + /* + * But without the multipath code enabled, multiple controller per + * subsystems are visible as devices and thus we cannot use the + * subsystem instance. + */ sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); +#endif if ((ctrl->quirks & NVME_QUIRK_LIGHTNVM) && id->vs[0] == 0x1) { if (nvme_nvm_register(ns, disk_name, node)) { @@ -2751,7 +2866,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) disk->fops = &nvme_fops; disk->private_data = ns; disk->queue = ns->queue; - disk->flags = GENHD_FL_EXT_DEVT; + disk->flags = flags; memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); ns->disk = disk; @@ -2773,6 +2888,9 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (ns->ndev && nvme_nvm_register_sysfs(ns)) pr_warn("%s: failed to register lightnvm sysfs group for identification\n", ns->disk->disk_name); + + if (new) + nvme_mpath_add_disk(ns->head); return; out_unlink_ns: mutex_lock(&ctrl->subsys->lock); @@ -2805,6 +2923,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) } mutex_lock(&ns->ctrl->subsys->lock); + nvme_mpath_clear_current_path(ns); if (head) list_del_rcu(&ns->siblings); mutex_unlock(&ns->ctrl->subsys->lock); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c new file mode 100644 index 000000000000..062754ebebfd --- /dev/null +++ b/drivers/nvme/host/multipath.c @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2017 Christoph Hellwig. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include +#include "nvme.h" + +static bool multipath = true; +module_param(multipath, bool, 0644); +MODULE_PARM_DESC(multipath, + "turn on native support for multiple controllers per subsystem"); + +void nvme_failover_req(struct request *req) +{ + struct nvme_ns *ns = req->q->queuedata; + unsigned long flags; + + spin_lock_irqsave(&ns->head->requeue_lock, flags); + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + blk_mq_end_request(req, 0); + + nvme_reset_ctrl(ns->ctrl); + kblockd_schedule_work(&ns->head->requeue_work); +} + +bool nvme_req_needs_failover(struct request *req) +{ + if (!(req->cmd_flags & REQ_NVME_MPATH)) + return false; + + switch (nvme_req(req)->status & 0x7ff) { + /* + * Generic command status: + */ + case NVME_SC_INVALID_OPCODE: + case NVME_SC_INVALID_FIELD: + case NVME_SC_INVALID_NS: + case NVME_SC_LBA_RANGE: + case NVME_SC_CAP_EXCEEDED: + case NVME_SC_RESERVATION_CONFLICT: + return false; + + /* + * I/O command set specific error. Unfortunately these values are + * reused for fabrics commands, but those should never get here. + */ + case NVME_SC_BAD_ATTRIBUTES: + case NVME_SC_INVALID_PI: + case NVME_SC_READ_ONLY: + case NVME_SC_ONCS_NOT_SUPPORTED: + WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode == + nvme_fabrics_command); + return false; + + /* + * Media and Data Integrity Errors: + */ + case NVME_SC_WRITE_FAULT: + case NVME_SC_READ_ERROR: + case NVME_SC_GUARD_CHECK: + case NVME_SC_APPTAG_CHECK: + case NVME_SC_REFTAG_CHECK: + case NVME_SC_COMPARE_FAILED: + case NVME_SC_ACCESS_DENIED: + case NVME_SC_UNWRITTEN_BLOCK: + return false; + } + + /* Everything else could be a path failure, so should be retried */ + return true; +} + +void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->head->disk) + kblockd_schedule_work(&ns->head->requeue_work); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head) +{ + struct nvme_ns *ns; + + list_for_each_entry_rcu(ns, &head->list, siblings) { + if (ns->ctrl->state == NVME_CTRL_LIVE) { + rcu_assign_pointer(head->current_path, ns); + return ns; + } + } + + return NULL; +} + +inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head) +{ + struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu); + + if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE)) + ns = __nvme_find_path(head); + return ns; +} + +static blk_qc_t nvme_ns_head_make_request(struct request_queue *q, + struct bio *bio) +{ + struct nvme_ns_head *head = q->queuedata; + struct device *dev = disk_to_dev(head->disk); + struct nvme_ns *ns; + blk_qc_t ret = BLK_QC_T_NONE; + int srcu_idx; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = nvme_find_path(head); + if (likely(ns)) { + bio->bi_disk = ns->disk; + bio->bi_opf |= REQ_NVME_MPATH; + ret = direct_make_request(bio); + } else if (!list_empty_careful(&head->list)) { + dev_warn_ratelimited(dev, "no path available - requeing I/O\n"); + + spin_lock_irq(&head->requeue_lock); + bio_list_add(&head->requeue_list, bio); + spin_unlock_irq(&head->requeue_lock); + } else { + dev_warn_ratelimited(dev, "no path - failing I/O\n"); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + } + + srcu_read_unlock(&head->srcu, srcu_idx); + return ret; +} + +static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc) +{ + struct nvme_ns_head *head = q->queuedata; + struct nvme_ns *ns; + bool found = false; + int srcu_idx; + + srcu_idx = srcu_read_lock(&head->srcu); + ns = srcu_dereference(head->current_path, &head->srcu); + if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE)) + found = ns->queue->poll_fn(q, qc); + srcu_read_unlock(&head->srcu, srcu_idx); + return found; +} + +static void nvme_requeue_work(struct work_struct *work) +{ + struct nvme_ns_head *head = + container_of(work, struct nvme_ns_head, requeue_work); + struct bio *bio, *next; + + spin_lock_irq(&head->requeue_lock); + next = bio_list_get(&head->requeue_list); + spin_unlock_irq(&head->requeue_lock); + + while ((bio = next) != NULL) { + next = bio->bi_next; + bio->bi_next = NULL; + + /* + * Reset disk to the mpath node and resubmit to select a new + * path. + */ + bio->bi_disk = head->disk; + generic_make_request(bio); + } +} + +int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) +{ + struct request_queue *q; + bool vwc = false; + + bio_list_init(&head->requeue_list); + spin_lock_init(&head->requeue_lock); + INIT_WORK(&head->requeue_work, nvme_requeue_work); + + /* + * Add a multipath node if the subsystems supports multiple controllers. + * We also do this for private namespaces as the namespace sharing data could + * change after a rescan. + */ + if (!(ctrl->subsys->cmic & (1 << 1)) || !multipath) + return 0; + + q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE); + if (!q) + goto out; + q->queuedata = head; + blk_queue_make_request(q, nvme_ns_head_make_request); + q->poll_fn = nvme_ns_head_poll; + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); + /* set to a default value for 512 until disk is validated */ + blk_queue_logical_block_size(q, 512); + + /* we need to propagate up the VMC settings */ + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + vwc = true; + blk_queue_write_cache(q, vwc, vwc); + + head->disk = alloc_disk(0); + if (!head->disk) + goto out_cleanup_queue; + head->disk->fops = &nvme_ns_head_ops; + head->disk->private_data = head; + head->disk->queue = q; + head->disk->flags = GENHD_FL_EXT_DEVT; + sprintf(head->disk->disk_name, "nvme%dn%d", + ctrl->subsys->instance, head->instance); + return 0; + +out_cleanup_queue: + blk_cleanup_queue(q); +out: + return -ENOMEM; +} + +void nvme_mpath_add_disk(struct nvme_ns_head *head) +{ + if (!head->disk) + return; + device_add_disk(&head->subsys->dev, head->disk); +} + +void nvme_mpath_remove_disk(struct nvme_ns_head *head) +{ + if (!head->disk) + return; + del_gendisk(head->disk); + blk_set_queue_dying(head->disk->queue); + /* make sure all pending bios are cleaned up */ + kblockd_schedule_work(&head->requeue_work); + flush_work(&head->requeue_work); + blk_cleanup_queue(head->disk->queue); + put_disk(head->disk); +} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 6e5004b00975..dd6ced664b45 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -95,6 +95,11 @@ struct nvme_request { u16 status; }; +/* + * Mark a bio as coming in through the mpath node. + */ +#define REQ_NVME_MPATH REQ_DRV + enum { NVME_REQ_CANCELLED = (1 << 0), }; @@ -235,6 +240,13 @@ struct nvme_ns_ids { * only ever has a single entry for private namespaces. */ struct nvme_ns_head { +#ifdef CONFIG_NVME_MULTIPATH + struct gendisk *disk; + struct nvme_ns __rcu *current_path; + struct bio_list requeue_list; + spinlock_t requeue_lock; + struct work_struct requeue_work; +#endif struct list_head list; struct srcu_struct srcu; struct nvme_subsystem *subsys; @@ -384,6 +396,51 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); +extern const struct block_device_operations nvme_ns_head_ops; + +#ifdef CONFIG_NVME_MULTIPATH +void nvme_failover_req(struct request *req); +bool nvme_req_needs_failover(struct request *req); +void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); +int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); +void nvme_mpath_add_disk(struct nvme_ns_head *head); +void nvme_mpath_remove_disk(struct nvme_ns_head *head); + +static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ + struct nvme_ns_head *head = ns->head; + + if (head && ns == srcu_dereference(head->current_path, &head->srcu)) + rcu_assign_pointer(head->current_path, NULL); +} +struct nvme_ns *nvme_find_path(struct nvme_ns_head *head); +#else +static inline void nvme_failover_req(struct request *req) +{ +} +static inline bool nvme_req_needs_failover(struct request *req) +{ + return false; +} +static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) +{ +} +static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, + struct nvme_ns_head *head) +{ + return 0; +} +static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) +{ +} +static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) +{ +} +static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) +{ +} +#endif /* CONFIG_NVME_MULTIPATH */ + #ifdef CONFIG_NVM int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node); void nvme_nvm_unregister(struct nvme_ns *ns); -- cgit v1.2.1 From 5b85b826b8925b3f1910b6613eb82c4df240a5b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 9 Nov 2017 13:51:03 +0100 Subject: nvme: also expose the namespace identification sysfs files for mpath nodes We do this by adding a helper that returns the ns_head for a device that can belong to either the per-controller or per-subsystem block device nodes, and otherwise reuse all the existing code. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Martin K. Petersen Reviewed-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 57 +++++++++++++++++++++++-------------------- drivers/nvme/host/multipath.c | 6 +++++ drivers/nvme/host/nvme.h | 1 + 3 files changed, 38 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 59f80a613fd8..438d3fa12608 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2382,12 +2382,22 @@ static ssize_t nvme_sysfs_rescan(struct device *dev, } static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); +static inline struct nvme_ns_head *dev_to_ns_head(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (disk->fops == &nvme_fops) + return nvme_get_ns_from_dev(dev)->head; + else + return disk->private_data; +} + static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->head->ids; - struct nvme_subsystem *subsys = ns->ctrl->subsys; + struct nvme_ns_head *head = dev_to_ns_head(dev); + struct nvme_ns_ids *ids = &head->ids; + struct nvme_subsystem *subsys = head->subsys; int serial_len = sizeof(subsys->serial); int model_len = sizeof(subsys->model); @@ -2409,23 +2419,21 @@ static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, return sprintf(buf, "nvme.%04x-%*phN-%*phN-%08x\n", subsys->vendor_id, serial_len, subsys->serial, model_len, subsys->model, - ns->head->ns_id); + head->ns_id); } static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); static ssize_t nguid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%pU\n", ns->head->ids.nguid); + return sprintf(buf, "%pU\n", dev_to_ns_head(dev)->ids.nguid); } static DEVICE_ATTR(nguid, S_IRUGO, nguid_show, NULL); static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->head->ids; + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; /* For backward compatibility expose the NGUID to userspace if * we have no UUID set @@ -2440,22 +2448,20 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); static ssize_t eui_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%8ph\n", ns->head->ids.eui64); + return sprintf(buf, "%8ph\n", dev_to_ns_head(dev)->ids.eui64); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, - char *buf) + char *buf) { - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - return sprintf(buf, "%d\n", ns->head->ns_id); + return sprintf(buf, "%d\n", dev_to_ns_head(dev)->ns_id); } static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); -static struct attribute *nvme_ns_attrs[] = { +static struct attribute *nvme_ns_id_attrs[] = { &dev_attr_wwid.attr, &dev_attr_uuid.attr, &dev_attr_nguid.attr, @@ -2464,12 +2470,11 @@ static struct attribute *nvme_ns_attrs[] = { NULL, }; -static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, +static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); - struct nvme_ns *ns = nvme_get_ns_from_dev(dev); - struct nvme_ns_ids *ids = &ns->head->ids; + struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; if (a == &dev_attr_uuid.attr) { if (uuid_is_null(&ids->uuid) || @@ -2487,9 +2492,9 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, return a->mode; } -static const struct attribute_group nvme_ns_attr_group = { - .attrs = nvme_ns_attrs, - .is_visible = nvme_ns_attrs_are_visible, +const struct attribute_group nvme_ns_id_attr_group = { + .attrs = nvme_ns_id_attrs, + .is_visible = nvme_ns_id_attrs_are_visible, }; #define nvme_show_str_function(field) \ @@ -2882,7 +2887,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) device_add_disk(ctrl->device, ns->disk); if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, - &nvme_ns_attr_group)) + &nvme_ns_id_attr_group)) pr_warn("%s: failed to create sysfs group for identification\n", ns->disk->disk_name); if (ns->ndev && nvme_nvm_register_sysfs(ns)) @@ -2915,7 +2920,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, - &nvme_ns_attr_group); + &nvme_ns_id_attr_group); if (ns->ndev) nvme_nvm_unregister_sysfs(ns); del_gendisk(ns->disk); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 062754ebebfd..850275896e49 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -239,12 +239,18 @@ void nvme_mpath_add_disk(struct nvme_ns_head *head) if (!head->disk) return; device_add_disk(&head->subsys->dev, head->disk); + if (sysfs_create_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group)) + pr_warn("%s: failed to create sysfs group for identification\n", + head->disk->disk_name); } void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) return; + sysfs_remove_group(&disk_to_dev(head->disk)->kobj, + &nvme_ns_id_attr_group); del_gendisk(head->disk); blk_set_queue_dying(head->disk->queue); /* make sure all pending bios are cleaned up */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index dd6ced664b45..895a4330dda0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -396,6 +396,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); +extern const struct attribute_group nvme_ns_id_attr_group; extern const struct block_device_operations nvme_ns_head_ops; #ifdef CONFIG_NVME_MULTIPATH -- cgit v1.2.1 From e9a48034d7d1318ece7d4a235838a86c94db9d68 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Thu, 9 Nov 2017 17:57:06 +0100 Subject: nvme: create 'slaves' and 'holders' entries for hidden controllers When creating nvme multipath devices we should populate the 'slaves' and 'holders' directorys properly to aid userspace topology detection. Signed-off-by: Hannes Reinecke [hch: split from a larger patch, compile fix for CONFIG_NVME_MULTIPATH=n] Reviewed-by: Keith Busch Reviewed-by: Martin K. Petersen Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 ++ drivers/nvme/host/multipath.c | 30 ++++++++++++++++++++++++++++++ drivers/nvme/host/nvme.h | 8 ++++++++ 3 files changed, 40 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 438d3fa12608..c3182aa654fd 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2896,6 +2896,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) if (new) nvme_mpath_add_disk(ns->head); + nvme_mpath_add_disk_links(ns); return; out_unlink_ns: mutex_lock(&ctrl->subsys->lock); @@ -2919,6 +2920,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (ns->disk && ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); + nvme_mpath_remove_disk_links(ns); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, &nvme_ns_id_attr_group); if (ns->ndev) diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 850275896e49..78d92151a904 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -245,6 +245,25 @@ void nvme_mpath_add_disk(struct nvme_ns_head *head) head->disk->disk_name); } +void nvme_mpath_add_disk_links(struct nvme_ns *ns) +{ + struct kobject *slave_disk_kobj, *holder_disk_kobj; + + if (!ns->head->disk) + return; + + slave_disk_kobj = &disk_to_dev(ns->disk)->kobj; + if (sysfs_create_link(ns->head->disk->slave_dir, slave_disk_kobj, + kobject_name(slave_disk_kobj))) + return; + + holder_disk_kobj = &disk_to_dev(ns->head->disk)->kobj; + if (sysfs_create_link(ns->disk->part0.holder_dir, holder_disk_kobj, + kobject_name(holder_disk_kobj))) + sysfs_remove_link(ns->head->disk->slave_dir, + kobject_name(slave_disk_kobj)); +} + void nvme_mpath_remove_disk(struct nvme_ns_head *head) { if (!head->disk) @@ -259,3 +278,14 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head) blk_cleanup_queue(head->disk->queue); put_disk(head->disk); } + +void nvme_mpath_remove_disk_links(struct nvme_ns *ns) +{ + if (!ns->head->disk) + return; + + sysfs_remove_link(ns->disk->part0.holder_dir, + kobject_name(&disk_to_dev(ns->head->disk)->kobj)); + sysfs_remove_link(ns->head->disk->slave_dir, + kobject_name(&disk_to_dev(ns->disk)->kobj)); +} diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 895a4330dda0..c0873a68872f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -405,7 +405,9 @@ bool nvme_req_needs_failover(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns_head *head); +void nvme_mpath_add_disk_links(struct nvme_ns *ns); void nvme_mpath_remove_disk(struct nvme_ns_head *head); +void nvme_mpath_remove_disk_links(struct nvme_ns *ns); static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { @@ -437,6 +439,12 @@ static inline void nvme_mpath_add_disk(struct nvme_ns_head *head) static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head) { } +static inline void nvme_mpath_add_disk_links(struct nvme_ns *ns) +{ +} +static inline void nvme_mpath_remove_disk_links(struct nvme_ns *ns) +{ +} static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns) { } -- cgit v1.2.1 From 1e496938b6aec67b7cfd06614164fee0fe9f13e3 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 10 Nov 2017 10:58:23 +0100 Subject: nvme: expose subsys attribute to sysfs We should be exposing the subsystem attributes like 'model' and 'subsysnqn' to sysfs to allow for easier identification of the subsystem. Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c3182aa654fd..7b3bbc1a9ac4 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1999,6 +1999,53 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) return NULL; } +#define SUBSYS_ATTR_RO(_name, _mode, _show) \ + struct device_attribute subsys_attr_##_name = \ + __ATTR(_name, _mode, _show, NULL) + +static ssize_t nvme_subsys_show_nqn(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct nvme_subsystem *subsys = + container_of(dev, struct nvme_subsystem, dev); + + return snprintf(buf, PAGE_SIZE, "%s\n", subsys->subnqn); +} +static SUBSYS_ATTR_RO(subsysnqn, S_IRUGO, nvme_subsys_show_nqn); + +#define nvme_subsys_show_str_function(field) \ +static ssize_t subsys_##field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_subsystem *subsys = \ + container_of(dev, struct nvme_subsystem, dev); \ + return sprintf(buf, "%.*s\n", \ + (int)sizeof(subsys->field), subsys->field); \ +} \ +static SUBSYS_ATTR_RO(field, S_IRUGO, subsys_##field##_show); + +nvme_subsys_show_str_function(model); +nvme_subsys_show_str_function(serial); +nvme_subsys_show_str_function(firmware_rev); + +static struct attribute *nvme_subsys_attrs[] = { + &subsys_attr_model.attr, + &subsys_attr_serial.attr, + &subsys_attr_firmware_rev.attr, + &subsys_attr_subsysnqn.attr, + NULL, +}; + +static struct attribute_group nvme_subsys_attrs_group = { + .attrs = nvme_subsys_attrs, +}; + +static const struct attribute_group *nvme_subsys_attrs_groups[] = { + &nvme_subsys_attrs_group, + NULL, +}; + static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { struct nvme_subsystem *subsys, *found; @@ -2026,6 +2073,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys->dev.class = nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; + subsys->dev.groups = nvme_subsys_attrs_groups; dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); device_initialize(&subsys->dev); -- cgit v1.2.1 From 15f7b41f70ddcca3b555bd0fdc7c8da7466b517e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 10 Nov 2017 12:29:34 -0500 Subject: brd: remove unused brd_mutex Remove unused mutex brd_mutex. It is unused since the commit ff26956875c2 ("brd: remove support for BLKFLSBUF"). Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- drivers/block/brd.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 2d7178f7754e..c1cf87718c2e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -60,7 +60,6 @@ struct brd_device { /* * Look up and return a brd's page for a given sector. */ -static DEFINE_MUTEX(brd_mutex); static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) { pgoff_t idx; -- cgit v1.2.1 From 8dc7a31fbce5e2dbbacd83d910da37105181b054 Mon Sep 17 00:00:00 2001 From: Hongxu Jia Date: Fri, 10 Nov 2017 15:59:17 +0800 Subject: ide: ide-atapi: fix compile error with defining macro DEBUG Compile ide-atapi failed with defining macro "DEBUG" ... |drivers/ide/ide-atapi.c:285:52: error: 'struct request' has no member named 'cmd'; did you mean 'csd'? | debug_log("%s: rq->cmd[0]: 0x%x\n", __func__, rq->cmd[0]); ... Since we split the scsi_request out of struct request, it missed do the same thing on debug_log Fixes: 82ed4db499b8 ("block: split scsi_request out of struct request") Signed-off-by: Hongxu Jia Signed-off-by: Jens Axboe --- drivers/ide/ide-atapi.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 14d1e7d9a1d6..0e6bc631a1ca 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -282,7 +282,7 @@ int ide_cd_expiry(ide_drive_t *drive) struct request *rq = drive->hwif->rq; unsigned long wait = 0; - debug_log("%s: rq->cmd[0]: 0x%x\n", __func__, rq->cmd[0]); + debug_log("%s: scsi_req(rq)->cmd[0]: 0x%x\n", __func__, scsi_req(rq)->cmd[0]); /* * Some commands are *slow* and normally take a long time to complete. @@ -463,7 +463,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - debug_log("[cmd %x]: check condition\n", rq->cmd[0]); + debug_log("[cmd %x]: check condition\n", scsi_req(rq)->cmd[0]); /* Retry operation */ ide_retry_pc(drive); @@ -531,7 +531,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) ide_pad_transfer(drive, write, bcount); debug_log("[cmd %x] transferred %d bytes, padded %d bytes, resid: %u\n", - rq->cmd[0], done, bcount, scsi_req(rq)->resid_len); + scsi_req(rq)->cmd[0], done, bcount, scsi_req(rq)->resid_len); /* And set the interrupt handler again */ ide_set_handler(drive, ide_pc_intr, timeout); -- cgit v1.2.1 From a04b5de5050ab8b891128eb2c47a0916fe8622e1 Mon Sep 17 00:00:00 2001 From: Martin Wilck Date: Thu, 28 Sep 2017 21:33:23 +0200 Subject: nvme: fix visibility of "uuid" ns attribute "uuid" must be invisible if both ns->uuid and ns->nguid are unset, not if either one is. Fixes: d934f9848a77 "nvme: provide UUID value to userspace" Signed-off-by: Martin Wilck [hch: rebased to the nvme-4.15 tree to help resolving a conflict] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 7b3bbc1a9ac4..993813ccdc0b 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2525,7 +2525,7 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj, struct nvme_ns_ids *ids = &dev_to_ns_head(dev)->ids; if (a == &dev_attr_uuid.attr) { - if (uuid_is_null(&ids->uuid) || + if (uuid_is_null(&ids->uuid) && !memchr_inv(ids->nguid, 0, sizeof(ids->nguid))) return 0; } -- cgit v1.2.1