author     Linus Torvalds <torvalds@linux-foundation.org>  2022-10-07 09:35:50 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2022-10-07 09:35:50 -0700
commit     7c989b1da3946e40bf71be00a0b401015235605a
tree       33de4ff984af6a301d6e80a05d40a893909388c9 /block
parent     513389809e138ae903b6ef43c1d5d2ffaf4dca17
parent     0e0abad2a71bcd7ba0f30e7975f5b4199ade4e60
download   linux-next-7c989b1da3946e40bf71be00a0b401015235605a.tar.gz
Merge tag 'for-6.1/passthrough-2022-10-04' of git://git.kernel.dk/linux
Pull passthrough updates from Jens Axboe:
"With these changes, passthrough NVMe support over io_uring now
performs at the same level as block device O_DIRECT, and in many cases
6-8% better.
This contains:
- Add support for fixed buffers for passthrough (Anuj, Kanchan)
- Enable batched allocations and freeing on passthrough, similarly to
what we support on the normal storage path (me)
- Fix from Geert fixing an issue with !CONFIG_IO_URING"
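
As a rough, unofficial illustration of the user-facing path this pull request speeds up, the sketch below issues a single NVMe read as an io_uring passthrough command backed by a registered ("fixed") buffer. It assumes liburing plus 6.1-era uapi headers (IORING_URING_CMD_FIXED, NVME_URING_CMD_IO); the device path /dev/ng0n1, namespace ID, LBA size, and transfer length are made-up values, and error handling is trimmed.

/* Hedged sketch, not part of the pull request: one fixed-buffer NVMe
 * passthrough read over io_uring.  Paths and NVMe parameters are assumed. */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>
#include <liburing.h>
#include <linux/nvme_ioctl.h>

#define BUF_LEN (128 * 1024)

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct nvme_uring_cmd *cmd;
	struct iovec iov;
	int fd, ret;

	/* NVMe passthrough needs the big SQE/CQE ring layout. */
	ret = io_uring_queue_init(8, &ring,
				  IORING_SETUP_SQE128 | IORING_SETUP_CQE32);
	if (ret < 0)
		return 1;

	fd = open("/dev/ng0n1", O_RDONLY);	/* assumed NVMe generic char device */
	if (fd < 0)
		return 1;

	/* Register the buffer once; its pages stay pinned across submissions. */
	iov.iov_base = aligned_alloc(4096, BUF_LEN);
	iov.iov_len = BUF_LEN;
	if (io_uring_register_buffers(&ring, &iov, 1))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, 2 * sizeof(*sqe));	/* 128-byte SQE */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = NVME_URING_CMD_IO;
	/* The new bit from this series: point the command at registered buffer 0. */
	sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
	sqe->buf_index = 0;

	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	cmd->opcode = 0x02;			/* NVMe read */
	cmd->nsid = 1;				/* assumed namespace */
	cmd->addr = (uintptr_t)iov.iov_base;
	cmd->data_len = BUF_LEN;
	cmd->cdw10 = 0;				/* starting LBA, low */
	cmd->cdw11 = 0;				/* starting LBA, high */
	cmd->cdw12 = BUF_LEN / 512 - 1;		/* 0-based block count, assuming 512B LBAs */

	io_uring_submit(&ring);
	if (!io_uring_wait_cqe(&ring, &cqe)) {
		printf("passthrough read: res=%d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}

Registering the buffer up front lets the kernel reuse the pinned pages instead of mapping the user buffer on every IO, which is the per-IO overhead the fixed-buffer support is meant to avoid.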
* tag 'for-6.1/passthrough-2022-10-04' of git://git.kernel.dk/linux:
io_uring: Add missing inline to io_uring_cmd_import_fixed() dummy
nvme: wire up fixed buffer support for nvme passthrough
nvme: pass ubuffer as an integer
block: extend functionality to map bvec iterator
block: factor out blk_rq_map_bio_alloc helper
block: rename bio_map_put to blk_mq_map_bio_put
nvme: refactor nvme_alloc_request
nvme: refactor nvme_add_user_metadata
nvme: Use blk_rq_map_user_io helper
scsi: Use blk_rq_map_user_io helper
block: add blk_rq_map_user_io
io_uring: introduce fixed buffer support for io_uring_cmd
io_uring: add io_uring_cmd_import_fixed
nvme: enable batched completions of passthrough IO
nvme: split out metadata vs non metadata end_io uring_cmd completions
block: allow end_io based requests in the completion batch handling
block: change request end_io handler to pass back a return value
block: enable batched allocation for blk_mq_alloc_request()
block: kill deprecated BUG_ON() in the flush handling
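
Beyond the diff below, several of the items above add small driver-facing APIs. For instance, blk_rq_map_user_io() (added in blk-map.c and adopted by the nvme and scsi callers listed above) folds the usual vectored-vs-flat user-buffer mapping branches into one call. The following is a hedged sketch of a caller, not code from the series; the mydrv_* name and the NULL rq_map_data are illustrative only.

#include <linux/blk-mq.h>

/*
 * Hedged sketch, not from the series: map a user buffer (or an array of
 * struct iovec when is_vectored is set) onto an already-allocated
 * passthrough request.  With vec=true and iov_count=0, blk_rq_map_user_io()
 * treats buf_len as the number of iovecs when importing them, mirroring
 * how the nvme caller uses the helper.
 */
static int mydrv_map_user_data(struct request *req, void __user *ubuf,
			       unsigned long buf_len, bool is_vectored)
{
	return blk_rq_map_user_io(req, NULL, ubuf, buf_len, GFP_KERNEL,
				  is_vectored, 0, false, rq_data_dir(req));
}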
Diffstat (limited to 'block')
 block/blk-flush.c |  11
 block/blk-map.c   | 150
 block/blk-mq.c    | 107
 3 files changed, 230 insertions(+), 38 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index d20a0c6b2c66..53202eff545e 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -205,7 +205,6 @@ static void blk_flush_complete_seq(struct request *rq,
		 * flush data request completion path. Restore @rq for
		 * normal completion and end it.
		 */
-		BUG_ON(!list_empty(&rq->queuelist));
		list_del_init(&rq->flush.list);
		blk_flush_restore_request(rq);
		blk_mq_end_request(rq, error);
@@ -218,7 +217,8 @@
	blk_kick_flush(q, fq, cmd_flags);
 }
 
-static void flush_end_io(struct request *flush_rq, blk_status_t error)
+static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
+				       blk_status_t error)
 {
	struct request_queue *q = flush_rq->q;
	struct list_head *running;
@@ -232,7 +232,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
	if (!req_ref_put_and_test(flush_rq)) {
		fq->rq_status = error;
		spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
-		return;
+		return RQ_END_IO_NONE;
	}
 
	blk_account_io_flush(flush_rq);
@@ -269,6 +269,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
	}
 
	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+	return RQ_END_IO_NONE;
 }
 
 bool is_flush_rq(struct request *rq)
@@ -354,7 +355,8 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
	blk_flush_queue_rq(flush_rq, false);
 }
 
-static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
+static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
+					       blk_status_t error)
 {
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
@@ -376,6 +378,7 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
	spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
 
	blk_mq_sched_restart(hctx);
+	return RQ_END_IO_NONE;
 }
 
 /**
diff --git a/block/blk-map.c b/block/blk-map.c
index 7693f8e3c454..34735626b00f 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -231,7 +231,7 @@ out_bmd:
	return ret;
 }
 
-static void bio_map_put(struct bio *bio)
+static void blk_mq_map_bio_put(struct bio *bio)
 {
	if (bio->bi_opf & REQ_ALLOC_CACHE) {
		bio_put(bio);
@@ -241,17 +241,10 @@ static void bio_map_put(struct bio *bio)
	}
 }
 
-static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
-		gfp_t gfp_mask)
+static struct bio *blk_rq_map_bio_alloc(struct request *rq,
+		unsigned int nr_vecs, gfp_t gfp_mask)
 {
-	unsigned int max_sectors = queue_max_hw_sectors(rq->q);
-	unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
	struct bio *bio;
-	int ret;
-	int j;
-
-	if (!iov_iter_count(iter))
-		return -EINVAL;
 
	if (rq->cmd_flags & REQ_POLLED) {
		blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE;
@@ -259,13 +252,31 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
		bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask,
					&fs_bio_set);
		if (!bio)
-			return -ENOMEM;
+			return NULL;
	} else {
		bio = bio_kmalloc(nr_vecs, gfp_mask);
		if (!bio)
-			return -ENOMEM;
+			return NULL;
		bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq));
	}
+	return bio;
+}
+
+static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
+		gfp_t gfp_mask)
+{
+	unsigned int max_sectors = queue_max_hw_sectors(rq->q);
+	unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
+	struct bio *bio;
+	int ret;
+	int j;
+
+	if (!iov_iter_count(iter))
+		return -EINVAL;
+
+	bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask);
+	if (bio == NULL)
+		return -ENOMEM;
 
	while (iov_iter_count(iter)) {
		struct page **pages, *stack_pages[UIO_FASTIOV];
@@ -331,7 +342,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
 
 out_unmap:
	bio_release_pages(bio, false);
-	bio_map_put(bio);
+	blk_mq_map_bio_put(bio);
	return ret;
 }
 
@@ -537,6 +548,62 @@ int blk_rq_append_bio(struct request *rq, struct bio *bio)
 }
 EXPORT_SYMBOL(blk_rq_append_bio);
 
+/* Prepare bio for passthrough IO given ITER_BVEC iter */
+static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
+{
+	struct request_queue *q = rq->q;
+	size_t nr_iter = iov_iter_count(iter);
+	size_t nr_segs = iter->nr_segs;
+	struct bio_vec *bvecs, *bvprvp = NULL;
+	struct queue_limits *lim = &q->limits;
+	unsigned int nsegs = 0, bytes = 0;
+	struct bio *bio;
+	size_t i;
+
+	if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
+		return -EINVAL;
+	if (nr_segs > queue_max_segments(q))
+		return -EINVAL;
+
+	/* no iovecs to alloc, as we already have a BVEC iterator */
+	bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL);
+	if (bio == NULL)
+		return -ENOMEM;
+
+	bio_iov_bvec_set(bio, (struct iov_iter *)iter);
+	blk_rq_bio_prep(rq, bio, nr_segs);
+
+	/* loop to perform a bunch of sanity checks */
+	bvecs = (struct bio_vec *)iter->bvec;
+	for (i = 0; i < nr_segs; i++) {
+		struct bio_vec *bv = &bvecs[i];
+
+		/*
+		 * If the queue doesn't support SG gaps and adding this
+		 * offset would create a gap, fallback to copy.
+		 */
+		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) {
+			blk_mq_map_bio_put(bio);
+			return -EREMOTEIO;
+		}
+		/* check full condition */
+		if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
+			goto put_bio;
+		if (bytes + bv->bv_len > nr_iter)
+			goto put_bio;
+		if (bv->bv_offset + bv->bv_len > PAGE_SIZE)
+			goto put_bio;
+
+		nsegs++;
+		bytes += bv->bv_len;
+		bvprvp = bv;
+	}
+	return 0;
+put_bio:
+	blk_mq_map_bio_put(bio);
+	return -EINVAL;
+}
+
 /**
  * blk_rq_map_user_iov - map user data to a request, for passthrough requests
  * @q:		request queue where request should be inserted
@@ -556,24 +623,35 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
		struct rq_map_data *map_data,
		const struct iov_iter *iter, gfp_t gfp_mask)
 {
-	bool copy = false;
+	bool copy = false, map_bvec = false;
	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
	struct bio *bio = NULL;
	struct iov_iter i;
	int ret = -EINVAL;
 
-	if (!iter_is_iovec(iter))
-		goto fail;
-
	if (map_data)
		copy = true;
	else if (blk_queue_may_bounce(q))
		copy = true;
	else if (iov_iter_alignment(iter) & align)
		copy = true;
+	else if (iov_iter_is_bvec(iter))
+		map_bvec = true;
+	else if (!iter_is_iovec(iter))
+		copy = true;
	else if (queue_virt_boundary(q))
		copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter);
 
+	if (map_bvec) {
+		ret = blk_rq_map_user_bvec(rq, iter);
+		if (!ret)
+			return 0;
+		if (ret != -EREMOTEIO)
+			goto fail;
+		/* fall back to copying the data on limits mismatches */
+		copy = true;
+	}
+
	i = *iter;
	do {
		if (copy)
@@ -611,6 +689,42 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 }
 EXPORT_SYMBOL(blk_rq_map_user);
 
+int blk_rq_map_user_io(struct request *req, struct rq_map_data *map_data,
+		void __user *ubuf, unsigned long buf_len, gfp_t gfp_mask,
+		bool vec, int iov_count, bool check_iter_count, int rw)
+{
+	int ret = 0;
+
+	if (vec) {
+		struct iovec fast_iov[UIO_FASTIOV];
+		struct iovec *iov = fast_iov;
+		struct iov_iter iter;
+
+		ret = import_iovec(rw, ubuf, iov_count ? iov_count : buf_len,
+				UIO_FASTIOV, &iov, &iter);
+		if (ret < 0)
+			return ret;
+
+		if (iov_count) {
+			/* SG_IO howto says that the shorter of the two wins */
+			iov_iter_truncate(&iter, buf_len);
+			if (check_iter_count && !iov_iter_count(&iter)) {
+				kfree(iov);
+				return -EINVAL;
+			}
+		}
+
+		ret = blk_rq_map_user_iov(req->q, req, map_data, &iter,
+				gfp_mask);
+		kfree(iov);
+	} else if (buf_len) {
+		ret = blk_rq_map_user(req->q, req, map_data, ubuf, buf_len,
+				gfp_mask);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(blk_rq_map_user_io);
+
 /**
  * blk_rq_unmap_user - unmap a request with user data
  * @bio:	start of bio list
@@ -636,7 +750,7 @@ int blk_rq_unmap_user(struct bio *bio)
 
		next_bio = bio;
		bio = bio->bi_next;
-		bio_map_put(next_bio);
+		blk_mq_map_bio_put(next_bio);
	}
 
	return ret;
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 83492d942348..8070b6c10e8d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -510,25 +510,87 @@ retry:
					alloc_time_ns);
 }
 
-struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
-		blk_mq_req_flags_t flags)
+static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
+					    struct blk_plug *plug,
+					    blk_opf_t opf,
+					    blk_mq_req_flags_t flags)
 {
	struct blk_mq_alloc_data data = {
		.q		= q,
		.flags		= flags,
		.cmd_flags	= opf,
-		.nr_tags	= 1,
+		.nr_tags	= plug->nr_ios,
+		.cached_rq	= &plug->cached_rq,
	};
	struct request *rq;
-	int ret;
 
-	ret = blk_queue_enter(q, flags);
-	if (ret)
-		return ERR_PTR(ret);
+	if (blk_queue_enter(q, flags))
+		return NULL;
+
+	plug->nr_ios = 1;
 
	rq = __blk_mq_alloc_requests(&data);
-	if (!rq)
-		goto out_queue_exit;
+	if (unlikely(!rq))
+		blk_queue_exit(q);
+	return rq;
+}
+
+static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
+						   blk_opf_t opf,
+						   blk_mq_req_flags_t flags)
+{
+	struct blk_plug *plug = current->plug;
+	struct request *rq;
+
+	if (!plug)
+		return NULL;
+	if (rq_list_empty(plug->cached_rq)) {
+		if (plug->nr_ios == 1)
+			return NULL;
+		rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
+		if (rq)
+			goto got_it;
+		return NULL;
+	}
+	rq = rq_list_peek(&plug->cached_rq);
+	if (!rq || rq->q != q)
+		return NULL;
+
+	if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
+		return NULL;
+	if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
+		return NULL;
+
+	plug->cached_rq = rq_list_next(rq);
+got_it:
+	rq->cmd_flags = opf;
+	INIT_LIST_HEAD(&rq->queuelist);
+	return rq;
+}
+
+struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
+		blk_mq_req_flags_t flags)
+{
+	struct request *rq;
+
+	rq = blk_mq_alloc_cached_request(q, opf, flags);
+	if (!rq) {
+		struct blk_mq_alloc_data data = {
+			.q		= q,
+			.flags		= flags,
+			.cmd_flags	= opf,
+			.nr_tags	= 1,
+		};
+		int ret;
+
+		ret = blk_queue_enter(q, flags);
+		if (ret)
+			return ERR_PTR(ret);
+
+		rq = __blk_mq_alloc_requests(&data);
+		if (!rq)
+			goto out_queue_exit;
+	}
	rq->__data_len = 0;
	rq->__sector = (sector_t) -1;
	rq->bio = rq->biotail = NULL;
@@ -761,8 +823,10 @@ static void blk_complete_request(struct request *req)
	 * can find how many bytes remain in the request
	 * later.
	 */
-	req->bio = NULL;
-	req->__data_len = 0;
+	if (!req->end_io) {
+		req->bio = NULL;
+		req->__data_len = 0;
+	}
 }
 
 /**
@@ -939,7 +1003,8 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
 
	if (rq->end_io) {
		rq_qos_done(rq->q, rq);
-		rq->end_io(rq, error);
+		if (rq->end_io(rq, error) == RQ_END_IO_FREE)
+			blk_mq_free_request(rq);
	} else {
		blk_mq_free_request(rq);
	}
@@ -992,6 +1057,13 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
 
		rq_qos_done(rq->q, rq);
 
+		/*
+		 * If end_io handler returns NONE, then it still has
+		 * ownership of the request.
+		 */
+		if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
+			continue;
+
		WRITE_ONCE(rq->state, MQ_RQ_IDLE);
		if (!req_ref_put_and_test(rq))
			continue;
@@ -1233,12 +1305,13 @@ struct blk_rq_wait {
	blk_status_t ret;
 };
 
-static void blk_end_sync_rq(struct request *rq, blk_status_t ret)
+static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
 {
	struct blk_rq_wait *wait = rq->end_io_data;
 
	wait->ret = ret;
	complete(&wait->done);
+	return RQ_END_IO_NONE;
 }
 
 bool blk_rq_is_poll(struct request *rq)
@@ -1472,10 +1545,12 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
 
 void blk_mq_put_rq_ref(struct request *rq)
 {
-	if (is_flush_rq(rq))
-		rq->end_io(rq, 0);
-	else if (req_ref_put_and_test(rq))
+	if (is_flush_rq(rq)) {
+		if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
+			blk_mq_free_request(rq);
+	} else if (req_ref_put_and_test(rq)) {
		__blk_mq_free_request(rq);
+	}
 }
 
 static bool blk_mq_check_expired(struct request *rq, void *priv)
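
For driver code outside this diff, the end_io change above means a completion callback now tells the block layer what to do with the request instead of relying on implicit ownership rules. Below is a hedged sketch of a converted callback, not taken from the series; the mydrv/my_ctx names are invented and the two-argument blk_execute_rq_nowait() form is assumed from the 6.1-era API.

#include <linux/blk-mq.h>
#include <linux/completion.h>

struct my_ctx {
	struct completion done;
	blk_status_t status;
};

/*
 * Hedged sketch, not from the diff: an end_io handler using the new
 * return-value contract.  RQ_END_IO_FREE tells the block layer to free
 * the request (also honored by the batched completion path above);
 * RQ_END_IO_NONE means the handler keeps ownership, e.g. when it will
 * free or re-queue the request itself from another context.
 */
static enum rq_end_io_ret mydrv_end_io(struct request *rq, blk_status_t err)
{
	struct my_ctx *ctx = rq->end_io_data;

	ctx->status = err;
	complete(&ctx->done);
	return RQ_END_IO_FREE;
}

static void mydrv_submit(struct request *rq, struct my_ctx *ctx)
{
	init_completion(&ctx->done);
	rq->end_io_data = ctx;
	rq->end_io = mydrv_end_io;
	blk_execute_rq_nowait(rq, false);	/* at_head == false */
}

Returning RQ_END_IO_FREE keeps the old "block layer frees it" behaviour and lets such requests take part in the batched completion path enabled above; RQ_END_IO_NONE is for handlers that retain the request, as the flush machinery in this diff does.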