author:    Linus Torvalds <torvalds@linux-foundation.org>  2018-08-14 10:23:25 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org>  2018-08-14 10:23:25 -0700
commit:    73ba2fb33c492916853dfe63e3b3163da0be661d
tree:      c2fda8ca1273744d2e884d24189a15ac1a7d63c2 /drivers/nvme/host/rdma.c
parent:    958f338e96f874a0d29442396d6adf9c1e17aa2d
parent:    b86d865cb1cae1e61527ea0b8977078bbf694328
Merge tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
"First pull request for this merge window, there will also be a
followup request with some stragglers.
This pull request contains:
- Fix for a thundering herd issue in the wbt block code (Anchal
Agarwal)
- A few NVMe pull requests:
* Improved tracepoints (Keith)
* Larger inline data support for RDMA (Steve Wise)
* RDMA setup/teardown fixes (Sagi)
* Effects log support for NVMe target (Chaitanya Kulkarni)
* Buffered IO support for NVMe target (Chaitanya Kulkarni)
* TP4004 (ANA) support (Christoph)
* Various NVMe fixes
- Block io-latency controller support. Much needed support for
properly containing block devices. (Josef)
- Series improving how we handle sense information on the stack
(Kees)
- Lightnvm fixes and updates/improvements (Mathias/Javier et al)
- Zoned device support for null_blk (Matias)
- AIX partition fixes (Mauricio Faria de Oliveira)
- DIF checksum code made generic (Max Gurtovoy)
- Add support for discard in iostats (Michael Callahan / Tejun)
- Set of updates for BFQ (Paolo)
- Removal of async write support for bsg (Christoph)
- Bio page dirtying and clone fixups (Christoph)
- Set of bcache fix/changes (via Coly)
- Series improving blk-mq queue setup/teardown speed (Ming)
- Series improving merging performance on blk-mq (Ming)
- Lots of other fixes and cleanups from a slew of folks"
* tag 'for-4.19/block-20180812' of git://git.kernel.dk/linux-block: (190 commits)
blkcg: Make blkg_root_lookup() work for queues in bypass mode
bcache: fix error setting writeback_rate through sysfs interface
null_blk: add lock drop/acquire annotation
Blk-throttle: reduce tail io latency when iops limit is enforced
block: paride: pd: mark expected switch fall-throughs
block: Ensure that a request queue is dissociated from the cgroup controller
block: Introduce blk_exit_queue()
blkcg: Introduce blkg_root_lookup()
block: Remove two superfluous #include directives
blk-mq: count the hctx as active before allocating tag
block: bvec_nr_vecs() returns value for wrong slab
bcache: trivial - remove trailing backslash in macro BTREE_FLAG
bcache: make the pr_err statement used for ENOENT only in sysfs_attach section
bcache: set max writeback rate when I/O request is idle
bcache: add code comments for bset.c
bcache: fix mistaken comments in request.c
bcache: fix mistaken code comments in bcache.h
bcache: add a comment in super.c
bcache: avoid unnecessary cache prefetch in bch_btree_node_get()
bcache: display rate debug parameters to 0 when writeback is not running
...
Diffstat (limited to 'drivers/nvme/host/rdma.c')
-rw-r--r--	drivers/nvme/host/rdma.c	234
1 file changed, 111 insertions(+), 123 deletions(-)
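
The rdma.c diff below folds in two of the NVMe series called out in the merge message: Steve Wise's larger inline data support (NVME_RDMA_MAX_INLINE_SEGMENTS grows from 1 to 4, sized per device) and Sagi's setup/teardown consolidation. As a rough, self-contained sketch of the sizing rule, using hypothetical names rather than the driver's own API:

/* Hedged sketch of the inline-segment sizing in the diff below:
 * clamp the compile-time cap to what the device's send SGEs allow,
 * keeping one SGE reserved for the NVMe command itself.
 * Illustrative only; not the driver's code. */
#include <stdio.h>

#define MAX_INLINE_SEGMENTS 4   /* mirrors NVME_RDMA_MAX_INLINE_SEGMENTS */

static int num_inline_segments(int max_send_sge)
{
        int avail = max_send_sge - 1;   /* one SGE goes to the command */

        return avail < MAX_INLINE_SEGMENTS ? avail : MAX_INLINE_SEGMENTS;
}

int main(void)
{
        printf("%d\n", num_inline_segments(3));  /* device with 3 SGEs -> 2 */
        printf("%d\n", num_inline_segments(32)); /* -> 4, capped */
        return 0;
}

Writes small enough to fit in the inline data area travel in the command capsule itself, sparing the target a separate RDMA read.
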
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 66ec5985c9f3..0805fa6215ee 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -40,13 +40,14 @@
 #define NVME_RDMA_MAX_SEGMENTS		256
 
-#define NVME_RDMA_MAX_INLINE_SEGMENTS	1
+#define NVME_RDMA_MAX_INLINE_SEGMENTS	4
 
 struct nvme_rdma_device {
 	struct ib_device	*dev;
 	struct ib_pd		*pd;
 	struct kref		ref;
 	struct list_head	entry;
+	unsigned int		num_inline_segments;
 };
 
 struct nvme_rdma_qe {
@@ -117,6 +118,7 @@ struct nvme_rdma_ctrl {
 	struct sockaddr_storage src_addr;
 
 	struct nvme_ctrl	ctrl;
+	bool			use_inline_data;
 };
 
 static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
@@ -249,7 +251,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
 	/* +1 for drain */
 	init_attr.cap.max_recv_wr = queue->queue_size + 1;
 	init_attr.cap.max_recv_sge = 1;
-	init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
+	init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
 	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 	init_attr.qp_type = IB_QPT_RC;
 	init_attr.send_cq = queue->ib_cq;
@@ -286,6 +288,7 @@ static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
 	struct ib_device *ibdev = dev->dev;
 	int ret;
 
+	nvme_req(rq)->ctrl = &ctrl->ctrl;
 	ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
 			DMA_TO_DEVICE);
 	if (ret)
@@ -374,6 +377,8 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
 		goto out_free_pd;
 	}
 
+	ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
+					ndev->dev->attrs.max_sge - 1);
 	list_add(&ndev->entry, &device_list);
 
 out_unlock:
 	mutex_unlock(&device_list_mutex);
@@ -868,6 +873,31 @@ out_free_io_queues:
 	return ret;
 }
 
+static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
+		bool remove)
+{
+	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
+	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_cancel_request,
+			&ctrl->ctrl);
+	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+	nvme_rdma_destroy_admin_queue(ctrl, remove);
+}
+
+static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
+		bool remove)
+{
+	if (ctrl->ctrl.queue_count > 1) {
+		nvme_stop_queues(&ctrl->ctrl);
+		nvme_rdma_stop_io_queues(ctrl);
+		blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_cancel_request,
+				&ctrl->ctrl);
+		if (remove)
+			nvme_start_queues(&ctrl->ctrl);
+		nvme_rdma_destroy_io_queues(ctrl, remove);
+	}
+}
+
 static void nvme_rdma_stop_ctrl(struct nvme_ctrl *nctrl)
 {
 	struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
@@ -912,21 +942,44 @@ static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
 	}
 }
 
-static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
 {
-	struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
-			struct nvme_rdma_ctrl, reconnect_work);
+	int ret = -EINVAL;
 	bool changed;
-	int ret;
 
-	++ctrl->ctrl.nr_reconnects;
-
-	ret = nvme_rdma_configure_admin_queue(ctrl, false);
+	ret = nvme_rdma_configure_admin_queue(ctrl, new);
 	if (ret)
-		goto requeue;
+		return ret;
+
+	if (ctrl->ctrl.icdoff) {
+		dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
+		goto destroy_admin;
+	}
+
+	if (!(ctrl->ctrl.sgls & (1 << 2))) {
+		dev_err(ctrl->ctrl.device,
+			"Mandatory keyed sgls are not supported!\n");
+		goto destroy_admin;
+	}
+
+	if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
+		dev_warn(ctrl->ctrl.device,
+			"queue_size %zu > ctrl sqsize %u, clamping down\n",
+			ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
+	}
+
+	if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
+		dev_warn(ctrl->ctrl.device,
+			"sqsize %u > ctrl maxcmd %u, clamping down\n",
+			ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
+		ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
+	}
+
+	if (ctrl->ctrl.sgls & (1 << 20))
+		ctrl->use_inline_data = true;
 
 	if (ctrl->ctrl.queue_count > 1) {
-		ret = nvme_rdma_configure_io_queues(ctrl, false);
+		ret = nvme_rdma_configure_io_queues(ctrl, new);
 		if (ret)
 			goto destroy_admin;
 	}
@@ -935,10 +988,31 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 	if (!changed) {
 		/* state change failure is ok if we're in DELETING state */
 		WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
-		return;
+		ret = -EINVAL;
+		goto destroy_io;
 	}
 
 	nvme_start_ctrl(&ctrl->ctrl);
+	return 0;
+
+destroy_io:
+	if (ctrl->ctrl.queue_count > 1)
+		nvme_rdma_destroy_io_queues(ctrl, new);
+destroy_admin:
+	nvme_rdma_stop_queue(&ctrl->queues[0]);
+	nvme_rdma_destroy_admin_queue(ctrl, new);
+	return ret;
+}
+
+static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
+{
+	struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
+			struct nvme_rdma_ctrl, reconnect_work);
+
+	++ctrl->ctrl.nr_reconnects;
+
+	if (nvme_rdma_setup_ctrl(ctrl, false))
+		goto requeue;
 
 	dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
 			ctrl->ctrl.nr_reconnects);
@@ -947,9 +1021,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 
 	return;
 
-destroy_admin:
-	nvme_rdma_stop_queue(&ctrl->queues[0]);
-	nvme_rdma_destroy_admin_queue(ctrl, false);
 requeue:
 	dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
 			ctrl->ctrl.nr_reconnects);
@@ -962,27 +1033,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 			struct nvme_rdma_ctrl, err_work);
 
 	nvme_stop_keep_alive(&ctrl->ctrl);
-
-	if (ctrl->ctrl.queue_count > 1) {
-		nvme_stop_queues(&ctrl->ctrl);
-		nvme_rdma_stop_io_queues(ctrl);
-		blk_mq_tagset_busy_iter(&ctrl->tag_set,
-					nvme_cancel_request, &ctrl->ctrl);
-		nvme_rdma_destroy_io_queues(ctrl, false);
-	}
-
-	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-	nvme_rdma_stop_queue(&ctrl->queues[0]);
-	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-				nvme_cancel_request, &ctrl->ctrl);
-	nvme_rdma_destroy_admin_queue(ctrl, false);
-
-	/*
-	 * queues are not a live anymore, so restart the queues to fail fast
-	 * new IO
-	 */
-	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+	nvme_rdma_teardown_io_queues(ctrl, false);
 	nvme_start_queues(&ctrl->ctrl);
+	nvme_rdma_teardown_admin_queue(ctrl, false);
 
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
 		/* state change failure is ok if we're in DELETING state */
@@ -1090,19 +1143,27 @@ static int nvme_rdma_set_sg_null(struct nvme_command *c)
 }
 
 static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
-		struct nvme_rdma_request *req, struct nvme_command *c)
+		struct nvme_rdma_request *req, struct nvme_command *c,
+		int count)
 {
 	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
+	struct scatterlist *sgl = req->sg_table.sgl;
+	struct ib_sge *sge = &req->sge[1];
+	u32 len = 0;
+	int i;
 
-	req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
-	req->sge[1].length = sg_dma_len(req->sg_table.sgl);
-	req->sge[1].lkey = queue->device->pd->local_dma_lkey;
+	for (i = 0; i < count; i++, sgl++, sge++) {
+		sge->addr = sg_dma_address(sgl);
+		sge->length = sg_dma_len(sgl);
+		sge->lkey = queue->device->pd->local_dma_lkey;
+		len += sge->length;
+	}
 
 	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
-	sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
+	sg->length = cpu_to_le32(len);
 	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
 
-	req->num_sge++;
+	req->num_sge += count;
 	return 0;
 }
 
@@ -1195,15 +1256,16 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 		goto out_free_table;
 	}
 
-	if (count == 1) {
+	if (count <= dev->num_inline_segments) {
 		if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
+		    queue->ctrl->use_inline_data &&
 		    blk_rq_payload_bytes(rq) <=
 				nvme_rdma_inline_data_size(queue)) {
-			ret = nvme_rdma_map_sg_inline(queue, req, c);
+			ret = nvme_rdma_map_sg_inline(queue, req, c, count);
 			goto out;
 		}
 
-		if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
+		if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
 			ret = nvme_rdma_map_sg_single(queue, req, c);
 			goto out;
 		}
 	}
@@ -1574,6 +1636,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
 	case RDMA_CM_EVENT_CONNECT_ERROR:
 	case RDMA_CM_EVENT_UNREACHABLE:
 		nvme_rdma_destroy_queue_ib(queue);
+		/* fall through */
 	case RDMA_CM_EVENT_ADDR_ERROR:
 		dev_dbg(queue->ctrl->ctrl.device,
 			"CM error event %d\n", ev->event);
@@ -1736,25 +1799,12 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 {
-	if (ctrl->ctrl.queue_count > 1) {
-		nvme_stop_queues(&ctrl->ctrl);
-		nvme_rdma_stop_io_queues(ctrl);
-		blk_mq_tagset_busy_iter(&ctrl->tag_set,
-					nvme_cancel_request, &ctrl->ctrl);
-		nvme_rdma_destroy_io_queues(ctrl, shutdown);
-	}
-
+	nvme_rdma_teardown_io_queues(ctrl, shutdown);
 	if (shutdown)
 		nvme_shutdown_ctrl(&ctrl->ctrl);
 	else
 		nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
-
-	blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
-	nvme_rdma_stop_queue(&ctrl->queues[0]);
-	blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
-				nvme_cancel_request, &ctrl->ctrl);
-	blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
-	nvme_rdma_destroy_admin_queue(ctrl, shutdown);
+	nvme_rdma_teardown_admin_queue(ctrl, shutdown);
 }
 
 static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
@@ -1766,8 +1816,6 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 {
 	struct nvme_rdma_ctrl *ctrl =
 		container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
-	int ret;
-	bool changed;
 
 	nvme_stop_ctrl(&ctrl->ctrl);
 	nvme_rdma_shutdown_ctrl(ctrl, false);
@@ -1778,25 +1826,9 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
 		return;
 	}
 
-	ret = nvme_rdma_configure_admin_queue(ctrl, false);
-	if (ret)
+	if (nvme_rdma_setup_ctrl(ctrl, false))
 		goto out_fail;
 
-	if (ctrl->ctrl.queue_count > 1) {
-		ret = nvme_rdma_configure_io_queues(ctrl, false);
-		if (ret)
-			goto out_fail;
-	}
-
-	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-	if (!changed) {
-		/* state change failure is ok if we're in DELETING state */
-		WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
-		return;
-	}
-
-	nvme_start_ctrl(&ctrl->ctrl);
-
 	return;
 
 out_fail:
@@ -1959,49 +1991,10 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
 	WARN_ON_ONCE(!changed);
 
-	ret = nvme_rdma_configure_admin_queue(ctrl, true);
+	ret = nvme_rdma_setup_ctrl(ctrl, true);
 	if (ret)
 		goto out_uninit_ctrl;
 
-	/* sanity check icdoff */
-	if (ctrl->ctrl.icdoff) {
-		dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
-		ret = -EINVAL;
-		goto out_remove_admin_queue;
-	}
-
-	/* sanity check keyed sgls */
-	if (!(ctrl->ctrl.sgls & (1 << 2))) {
-		dev_err(ctrl->ctrl.device,
-			"Mandatory keyed sgls are not supported!\n");
-		ret = -EINVAL;
-		goto out_remove_admin_queue;
-	}
-
-	/* only warn if argument is too large here, will clamp later */
-	if (opts->queue_size > ctrl->ctrl.sqsize + 1) {
-		dev_warn(ctrl->ctrl.device,
-			"queue_size %zu > ctrl sqsize %u, clamping down\n",
-			opts->queue_size, ctrl->ctrl.sqsize + 1);
-	}
-
-	/* warn if maxcmd is lower than sqsize+1 */
-	if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
-		dev_warn(ctrl->ctrl.device,
-			"sqsize %u > ctrl maxcmd %u, clamping down\n",
-			ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
-		ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
-	}
-
-	if (opts->nr_io_queues) {
-		ret = nvme_rdma_configure_io_queues(ctrl, true);
-		if (ret)
-			goto out_remove_admin_queue;
-	}
-
-	changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
-	WARN_ON_ONCE(!changed);
-
 	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
 		ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
 
@@ -2011,13 +2004,8 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 	list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
 	mutex_unlock(&nvme_rdma_ctrl_mutex);
 
-	nvme_start_ctrl(&ctrl->ctrl);
-
 	return &ctrl->ctrl;
 
-out_remove_admin_queue:
-	nvme_rdma_stop_queue(&ctrl->queues[0]);
-	nvme_rdma_destroy_admin_queue(ctrl, true);
 out_uninit_ctrl:
 	nvme_uninit_ctrl(&ctrl->ctrl);
 	nvme_put_ctrl(&ctrl->ctrl);
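
One worked detail from the nvme_rdma_setup_ctrl() checks above: since sqsize is zero-based, the usable queue depth comes out to roughly min(queue_size, sqsize + 1, maxcmd). A minimal model of that clamping, with hypothetical names, and the driver's warnings and in-place sqsize update omitted:

/* Illustrative-only model of the warn-and-clamp checks in
 * nvme_rdma_setup_ctrl(); the real code warns on each clamp and
 * stores sqsize = maxcmd - 1 rather than returning a depth. */
#include <stdio.h>

static unsigned int effective_depth(unsigned int queue_size, /* user request */
                                    unsigned int sqsize,     /* zero-based */
                                    unsigned int maxcmd)
{
        unsigned int depth = sqsize + 1;  /* entries the SQ can actually hold */

        if (depth > maxcmd)         /* "sqsize > ctrl maxcmd, clamping down" */
                depth = maxcmd;
        if (queue_size < depth)     /* never exceed what the user asked for */
                depth = queue_size;
        return depth;
}

int main(void)
{
        /* queue_size 128 vs. a controller with sqsize 63 (64 slots), maxcmd 32 */
        printf("%u\n", effective_depth(128, 63, 32));  /* prints 32 */
        return 0;
}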