Diffstat (limited to 'block')
-rw-r--r--  block/bfq-cgroup.c      |    8
-rw-r--r--  block/bfq-iosched.c     |    6
-rw-r--r--  block/bfq-wf2q.c        |    5
-rw-r--r--  block/bio.c             |  134
-rw-r--r--  block/blk-cgroup.c      |    1
-rw-r--r--  block/blk-core.c        |   17
-rw-r--r--  block/blk-ia-ranges.c   |    1
-rw-r--r--  block/blk-ioc.c         |    2
-rw-r--r--  block/blk-iocost.c      |    2
-rw-r--r--  block/blk-ioprio.c      |   57
-rw-r--r--  block/blk-ioprio.h      |    9
-rw-r--r--  block/blk-merge.c       |   69
-rw-r--r--  block/blk-mq-debugfs.c  |   29
-rw-r--r--  block/blk-mq-debugfs.h  |   10
-rw-r--r--  block/blk-mq-sched.c    |   12
-rw-r--r--  block/blk-mq-tag.c      |   18
-rw-r--r--  block/blk-mq-tag.h      |   10
-rw-r--r--  block/blk-mq.c          |   59
-rw-r--r--  block/blk-mq.h          |   12
-rw-r--r--  block/blk-rq-qos.c      |    2
-rw-r--r--  block/blk-rq-qos.h      |    7
-rw-r--r--  block/blk-sysfs.c       |   39
-rw-r--r--  block/blk.h             |   13
-rw-r--r--  block/bounce.c          |   13
-rw-r--r--  block/fops.c            |   16
-rw-r--r--  block/genhd.c           |   42
-rw-r--r--  block/holder.c          |    4
-rw-r--r--  block/ioprio.c          |   58
-rw-r--r--  block/kyber-iosched.c   |    3
-rw-r--r--  block/mq-deadline.c     |    3
30 files changed, 331 insertions(+), 330 deletions(-)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 09574af83566..9fc605791b1e 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -706,10 +706,10 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, } /** - * __bfq_bic_change_cgroup - move @bic to @cgroup. + * __bfq_bic_change_cgroup - move @bic to @bfqg. * @bfqd: the queue descriptor. * @bic: the bic to move. - * @blkcg: the blk-cgroup to move to. + * @bfqg: the group to move to. * * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see @@ -863,6 +863,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) * @bfqd: the device data structure with the root group. * @entity: the entity to move, if entity is a leaf; or the parent entity * of an active leaf entity to move, if entity is not a leaf. + * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_entity *entity, @@ -892,6 +893,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. * @st: the service tree to start the search from. + * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_active_queues(struct bfq_data *bfqd, struct bfq_group *bfqg, @@ -1471,8 +1473,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq) return bfqq->bfqd->root_group; } -void bfqg_and_blkg_get(struct bfq_group *bfqg) {} - void bfqg_and_blkg_put(struct bfq_group *bfqg) {} struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 0d46cb728bbf..e6d7e6b01a05 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7046,6 +7046,7 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_unlock_irq(&bfqd->lock); #endif + blk_stat_disable_accounting(bfqd->queue); wbt_enable_default(bfqd->queue); kfree(bfqd); @@ -7188,7 +7189,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + wbt_disable_default(q); + blk_stat_enable_accounting(q); + return 0; out_free: diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index f8eb340381cf..983413cdefad 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1360,6 +1360,8 @@ left: /** * __bfq_lookup_next_entity - return the first eligible entity in @st. * @st: the service tree. + * @in_service: whether or not there is an in-service entity for the sched_data + * this active tree belongs to. 
* * If there is no in-service entity for the sched_data st belongs to, * then return the entity that will be set in service if: @@ -1472,9 +1474,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, break; } - if (!entity) - return NULL; - return entity; } diff --git a/block/bio.c b/block/bio.c index f92d0223247b..933ea3210954 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1159,6 +1159,37 @@ static void bio_put_pages(struct page **pages, size_t size, size_t off) put_page(pages[i]); } +static int bio_iov_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + bool same_page = false; + + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (WARN_ON_ONCE(bio_full(bio, len))) + return -EINVAL; + __bio_add_page(bio, page, len, offset); + return 0; + } + + if (same_page) + put_page(page); + return 0; +} + +static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + bool same_page = false; + + if (bio_add_hw_page(q, bio, page, len, offset, + queue_max_zone_append_sectors(q), &same_page) != len) + return -EINVAL; + if (same_page) + put_page(page); + return 0; +} + #define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) /** @@ -1177,7 +1208,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; - bool same_page = false; ssize_t size, left; unsigned len, i; size_t offset; @@ -1186,82 +1216,43 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * Move page array up in the allocated memory for the bio vecs as far as * possible so that we can start filling biovecs from the beginning * without overwriting the temporary page array. - */ + */ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); - if (unlikely(size <= 0)) - return size ? size : -EFAULT; - - for (left = size, i = 0; left > 0; left -= len, i++) { - struct page *page = pages[i]; - - len = min_t(size_t, PAGE_SIZE - offset, left); - - if (__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (same_page) - put_page(page); - } else { - if (WARN_ON_ONCE(bio_full(bio, len))) { - bio_put_pages(pages + i, left, offset); - return -EINVAL; - } - __bio_add_page(bio, page, len, offset); - } - offset = 0; - } - - iov_iter_advance(iter, size); - return 0; -} - -static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) -{ - unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; - unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bdev_get_queue(bio->bi_bdev); - unsigned int max_append_sectors = queue_max_zone_append_sectors(q); - struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; - struct page **pages = (struct page **)bv; - ssize_t size, left; - unsigned len, i; - size_t offset; - int ret = 0; - - if (WARN_ON_ONCE(!max_append_sectors)) - return 0; - /* - * Move page array up in the allocated memory for the bio vecs as far as - * possible so that we can start filling biovecs from the beginning - * without overwriting the temporary page array. + * Each segment in the iov is required to be a block size multiple. 
+ * However, we may not be able to get the entire segment if it spans + * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the + * result to ensure the bio's total size is correct. The remainder of + * the iov data will be picked up in the next bio iteration. */ - BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); - pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (size > 0) + size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev)); if (unlikely(size <= 0)) return size ? size : -EFAULT; for (left = size, i = 0; left > 0; left -= len, i++) { struct page *page = pages[i]; - bool same_page = false; + int ret; len = min_t(size_t, PAGE_SIZE - offset, left); - if (bio_add_hw_page(q, bio, page, len, offset, - max_append_sectors, &same_page) != len) { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + ret = bio_iov_add_zone_append_page(bio, page, len, + offset); + else + ret = bio_iov_add_page(bio, page, len, offset); + + if (ret) { bio_put_pages(pages + i, left, offset); - ret = -EINVAL; - break; + return ret; } - if (same_page) - put_page(page); offset = 0; } - iov_iter_advance(iter, size - left); - return ret; + iov_iter_advance(iter, size); + return 0; } /** @@ -1298,10 +1289,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) } do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - ret = __bio_iov_append_get_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); + ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); /* don't account direct I/O as memory stall */ @@ -1747,26 +1735,6 @@ bad: } EXPORT_SYMBOL(bioset_init); -/* - * Initialize and setup a new bio_set, based on the settings from - * another bio_set. - */ -int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) -{ - int flags; - - flags = 0; - if (src->bvec_pool.min_nr) - flags |= BIOSET_NEED_BVECS; - if (src->rescue_workqueue) - flags |= BIOSET_NEED_RESCUER; - if (src->cache) - flags |= BIOSET_PERCPU_CACHE; - - return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags); -} -EXPORT_SYMBOL(bioset_init_from_src); - static int __init init_bio(void) { int i; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 764e740b0c0f..6906981563f8 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1299,6 +1299,7 @@ int blkcg_init_queue(struct request_queue *q) ret = blk_iolatency_init(q); if (ret) { blk_throtl_exit(q); + blk_ioprio_exit(q); goto err_destroy_all; } diff --git a/block/blk-core.c b/block/blk-core.c index 06ff5bbfe8f6..c2cec402d01c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -322,19 +322,6 @@ void blk_cleanup_queue(struct request_queue *q) blk_mq_exit_queue(q); } - /* - * In theory, request pool of sched_tags belongs to request queue. - * However, the current implementation requires tag_set for freeing - * requests, so free the pool now. - * - * Queue has become frozen, there can't be any in-queue requests, so - * it is safe to free requests now. 
- */ - mutex_lock(&q->sysfs_lock); - if (q->elevator) - blk_mq_sched_free_rqs(q); - mutex_unlock(&q->sysfs_lock); - /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } @@ -448,7 +435,7 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) q->last_merge = NULL; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); + q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) goto fail_srcu; @@ -498,7 +485,7 @@ fail_stats: fail_split: bioset_exit(&q->bio_split); fail_id: - ida_simple_remove(&blk_queue_ida, q->id); + ida_free(&blk_queue_ida, q->id); fail_srcu: if (alloc_srcu) cleanup_srcu_struct(q->srcu); diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c index 56ed48d2954e..47c89e65b57f 100644 --- a/block/blk-ia-ranges.c +++ b/block/blk-ia-ranges.c @@ -144,7 +144,6 @@ int disk_register_independent_access_ranges(struct gendisk *disk, } for (i = 0; i < iars->nr_ia_ranges; i++) { - iars->ia_range[i].queue = q; ret = kobject_init_and_add(&iars->ia_range[i].kobj, &blk_ia_range_ktype, &iars->kobj, "%d", i); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index df9cfe4ca532..63fc02042408 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -247,6 +247,8 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); #endif + ioc->ioprio = IOPRIO_DEFAULT; + return ioc; } diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 33a11ba971ea..b7082f2aed9c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2769,7 +2769,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) return; - switch (req_op(rq) & REQ_OP_MASK) { + switch (req_op(rq)) { case REQ_OP_READ: pidx = QOS_RLAT; rw = READ; diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 79e797f5d194..c00060a02c6e 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -62,7 +62,6 @@ struct ioprio_blkg { struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; - bool prio_set; }; static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) @@ -113,7 +112,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, if (ret < 0) return ret; blkcg->prio_policy = ret; - blkcg->prio_set = true; return nbytes; } @@ -183,26 +181,20 @@ static struct blkcg_policy ioprio_policy = { .pd_free_fn = ioprio_free_pd, }; -struct blk_ioprio { - struct rq_qos rqos; -}; - -static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, - struct bio *bio) +void blkcg_set_ioprio(struct bio *bio) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); u16 prio; - if (!blkcg->prio_set) + if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects * the lower priority of bi_ioprio and the cgroup I/O priority class. - * If the cgroup policy has been set to POLICY_NO_CHANGE == 0, the - * bio I/O priority is not modified. If the bio I/O priority equals - * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio. + * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O + * priority is assigned to the bio. 
*/ prio = max_t(u16, bio->bi_ioprio, IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); @@ -210,49 +202,14 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, bio->bi_ioprio = prio; } -static void blkcg_ioprio_exit(struct rq_qos *rqos) +void blk_ioprio_exit(struct request_queue *q) { - struct blk_ioprio *blkioprio_blkg = - container_of(rqos, typeof(*blkioprio_blkg), rqos); - - blkcg_deactivate_policy(rqos->q, &ioprio_policy); - kfree(blkioprio_blkg); + blkcg_deactivate_policy(q, &ioprio_policy); } -static struct rq_qos_ops blkcg_ioprio_ops = { - .track = blkcg_ioprio_track, - .exit = blkcg_ioprio_exit, -}; - int blk_ioprio_init(struct request_queue *q) { - struct blk_ioprio *blkioprio_blkg; - struct rq_qos *rqos; - int ret; - - blkioprio_blkg = kzalloc(sizeof(*blkioprio_blkg), GFP_KERNEL); - if (!blkioprio_blkg) - return -ENOMEM; - - ret = blkcg_activate_policy(q, &ioprio_policy); - if (ret) { - kfree(blkioprio_blkg); - return ret; - } - - rqos = &blkioprio_blkg->rqos; - rqos->id = RQ_QOS_IOPRIO; - rqos->ops = &blkcg_ioprio_ops; - rqos->q = q; - - /* - * Registering the rq-qos policy after activating the blk-cgroup - * policy guarantees that ioprio_blkcg_from_bio(bio) != NULL in the - * rq-qos callbacks. - */ - rq_qos_add(q, rqos); - - return 0; + return blkcg_activate_policy(q, &ioprio_policy); } static int __init ioprio_init(void) diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h index a7785c2f1aea..5a1eb550e178 100644 --- a/block/blk-ioprio.h +++ b/block/blk-ioprio.h @@ -6,14 +6,23 @@ #include <linux/kconfig.h> struct request_queue; +struct bio; #ifdef CONFIG_BLK_CGROUP_IOPRIO int blk_ioprio_init(struct request_queue *q); +void blk_ioprio_exit(struct request_queue *q); +void blkcg_set_ioprio(struct bio *bio); #else static inline int blk_ioprio_init(struct request_queue *q) { return 0; } +static inline void blk_ioprio_exit(struct request_queue *q) +{ +} +static inline void blkcg_set_ioprio(struct bio *bio) +{ +} #endif #endif /* _BLK_IOPRIO_H_ */ diff --git a/block/blk-merge.c b/block/blk-merge.c index 7771dacc99cb..0f5f42ebd0bb 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -164,18 +164,21 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, static inline unsigned get_max_io_size(struct request_queue *q, struct bio *bio) { - unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); - unsigned max_sectors = sectors; unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; - unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1); + unsigned max_sectors = queue_max_sectors(q), start, end; - max_sectors += start_offset; - max_sectors &= ~(pbs - 1); - if (max_sectors > start_offset) - return max_sectors - start_offset; + if (q->limits.chunk_sectors) { + max_sectors = min(max_sectors, + blk_chunk_sectors_left(bio->bi_iter.bi_sector, + q->limits.chunk_sectors)); + } - return sectors & ~(lbs - 1); + start = bio->bi_iter.bi_sector & (pbs - 1); + end = (start + max_sectors) & ~(pbs - 1); + if (end > start) + return end - start; + return max_sectors & ~(lbs - 1); } static inline unsigned get_max_segment_size(const struct request_queue *q, @@ -201,11 +204,11 @@ static inline unsigned get_max_segment_size(const struct request_queue *q, * @nsegs: [in,out] Number of segments in the bio being built. Incremented * by the number of segments from @bv that may be appended to that * bio without exceeding @max_segs - * @sectors: [in,out] Number of sectors in the bio being built. 
Incremented - * by the number of sectors from @bv that may be appended to that - * bio without exceeding @max_sectors + * @bytes: [in,out] Number of bytes in the bio being built. Incremented + * by the number of bytes from @bv that may be appended to that + * bio without exceeding @max_bytes * @max_segs: [in] upper bound for *@nsegs - * @max_sectors: [in] upper bound for *@sectors + * @max_bytes: [in] upper bound for *@bytes * * When splitting a bio, it can happen that a bvec is encountered that is too * big to fit in a single segment and hence that it has to be split in the @@ -216,10 +219,10 @@ static inline unsigned get_max_segment_size(const struct request_queue *q, */ static bool bvec_split_segs(const struct request_queue *q, const struct bio_vec *bv, unsigned *nsegs, - unsigned *sectors, unsigned max_segs, - unsigned max_sectors) + unsigned *bytes, unsigned max_segs, + unsigned max_bytes) { - unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9; + unsigned max_len = min(max_bytes, UINT_MAX) - *bytes; unsigned len = min(bv->bv_len, max_len); unsigned total_len = 0; unsigned seg_size = 0; @@ -237,7 +240,7 @@ static bool bvec_split_segs(const struct request_queue *q, break; } - *sectors += total_len >> 9; + *bytes += total_len; /* tell the caller to split the bvec if it is too big to fit */ return len > 0 || bv->bv_len > max_len; @@ -269,8 +272,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; - unsigned nsegs = 0, sectors = 0; - const unsigned max_sectors = get_max_io_size(q, bio); + unsigned nsegs = 0, bytes = 0; + const unsigned max_bytes = get_max_io_size(q, bio) << 9; const unsigned max_segs = queue_max_segments(q); bio_for_each_bvec(bv, bio, iter) { @@ -282,12 +285,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, goto split; if (nsegs < max_segs && - sectors + (bv.bv_len >> 9) <= max_sectors && + bytes + bv.bv_len <= max_bytes && bv.bv_offset + bv.bv_len <= PAGE_SIZE) { nsegs++; - sectors += bv.bv_len >> 9; - } else if (bvec_split_segs(q, &bv, &nsegs, §ors, max_segs, - max_sectors)) { + bytes += bv.bv_len; + } else if (bvec_split_segs(q, &bv, &nsegs, &bytes, max_segs, + max_bytes)) { goto split; } @@ -301,12 +304,19 @@ split: *segs = nsegs; /* + * Individual bvecs might not be logical block aligned. Round down the + * split size so that each bio is properly block size aligned, even if + * we do not use the full hardware limits. + */ + bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q)); + + /* * Bio splitting may cause subtle trouble such as hang when doing sync * iopoll in direct IO routine. Given performance gain of iopoll for * big IO can be trival, disable iopoll when split needed. 
*/ bio_clear_polled(bio); - return bio_split(bio, sectors, GFP_NOIO, bs); + return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); } /** @@ -375,7 +385,7 @@ EXPORT_SYMBOL(blk_queue_split); unsigned int blk_recalc_rq_segments(struct request *rq) { unsigned int nr_phys_segs = 0; - unsigned int nr_sectors = 0; + unsigned int bytes = 0; struct req_iterator iter; struct bio_vec bv; @@ -398,7 +408,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq) } rq_for_each_bvec(bv, rq, iter) - bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors, + bvec_split_segs(rq->q, &bv, &nr_phys_segs, &bytes, UINT_MAX, UINT_MAX); return nr_phys_segs; } @@ -559,17 +569,18 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { struct request_queue *q = rq->q; + unsigned int max_sectors; if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; + max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); if (!q->limits.chunk_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) - return blk_queue_get_max_sectors(q, req_op(rq)); - - return min(blk_max_size_offset(q, offset, 0), - blk_queue_get_max_sectors(q, req_op(rq))); + return max_sectors; + return min(max_sectors, + blk_chunk_sectors_left(offset, q->limits.chunk_sectors)); } static inline int ll_new_hw_segment(struct request *req, struct bio *bio, diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7e4136a60e1c..4d1ce9ef4318 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -711,11 +711,6 @@ void blk_mq_debugfs_register(struct request_queue *q) } } -void blk_mq_debugfs_unregister(struct request_queue *q) -{ - q->sched_debugfs_dir = NULL; -} - static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { @@ -746,6 +741,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { + if (!hctx->queue->debugfs_dir) + return; debugfs_remove_recursive(hctx->debugfs_dir); hctx->sched_debugfs_dir = NULL; hctx->debugfs_dir = NULL; @@ -773,6 +770,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; + lockdep_assert_held(&q->debugfs_mutex); + /* * If the parent directory has not been created yet, return, we will be * called again later on and the directory/files will be created then. 
@@ -790,6 +789,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) void blk_mq_debugfs_unregister_sched(struct request_queue *q) { + lockdep_assert_held(&q->debugfs_mutex); + debugfs_remove_recursive(q->sched_debugfs_dir); q->sched_debugfs_dir = NULL; } @@ -811,6 +812,10 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id) void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { + lockdep_assert_held(&rqos->q->debugfs_mutex); + + if (!rqos->q->debugfs_dir) + return; debugfs_remove_recursive(rqos->debugfs_dir); rqos->debugfs_dir = NULL; } @@ -820,6 +825,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) struct request_queue *q = rqos->q; const char *dir_name = rq_qos_id_to_name(rqos->id); + lockdep_assert_held(&q->debugfs_mutex); + if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) return; @@ -833,17 +840,13 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); } -void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) -{ - debugfs_remove_recursive(q->rqos_debugfs_dir); - q->rqos_debugfs_dir = NULL; -} - void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { struct elevator_type *e = q->elevator->type; + lockdep_assert_held(&q->debugfs_mutex); + /* * If the parent debugfs directory has not been created yet, return; * We will be called again later on with appropriate parent debugfs @@ -863,6 +866,10 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { + lockdep_assert_held(&hctx->queue->debugfs_mutex); + + if (!hctx->queue->debugfs_dir) + return; debugfs_remove_recursive(hctx->sched_debugfs_dir); hctx->sched_debugfs_dir = NULL; } diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 69918f4170d6..9c7d4b6117d4 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -21,7 +21,6 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v); void blk_mq_debugfs_register(struct request_queue *q); -void blk_mq_debugfs_unregister(struct request_queue *q); void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx); @@ -36,16 +35,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_register_rqos(struct rq_qos *rqos); void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); -void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q); #else static inline void blk_mq_debugfs_register(struct request_queue *q) { } -static inline void blk_mq_debugfs_unregister(struct request_queue *q) -{ -} - static inline void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { @@ -87,10 +81,6 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { } - -static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) -{ -} #endif #ifdef CONFIG_BLK_DEBUG_FS_ZONED diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 9e56a69422b6..a4f7c101b53b 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -564,6 +564,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) int ret; if (!e) { + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); q->elevator = NULL; q->nr_requests = 
q->tag_set->queue_depth; return 0; @@ -593,7 +594,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) if (ret) goto err_free_map_and_rqs; + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched(q); + mutex_unlock(&q->debugfs_mutex); queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { @@ -606,7 +609,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return ret; } } + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched_hctx(q, hctx); + mutex_unlock(&q->debugfs_mutex); } return 0; @@ -647,14 +652,21 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) unsigned int flags = 0; queue_for_each_hw_ctx(q, hctx, i) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched_hctx(hctx); + mutex_unlock(&q->debugfs_mutex); + if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } flags = hctx->flags; } + + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched(q); + mutex_unlock(&q->debugfs_mutex); + if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q, flags); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 2dcd738c6952..3cfffef1feb3 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -37,29 +37,25 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, * to get tag when first time, the other shared-tag users could reserve * budget for it. */ -bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; - if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || - test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) { - return true; - } + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) + return; + set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags); } else { - if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || - test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) { - return true; - } + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state); } users = atomic_inc_return(&hctx->tags->active_queues); blk_mq_update_wake_batch(hctx->tags, users); - - return true; } /* diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 5668e28be0b7..91ff37e3b43d 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -47,15 +47,13 @@ enum { BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, }; -extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +extern void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); -static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { - if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) - return false; - - return __blk_mq_tag_busy(hctx); + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_tag_busy(hctx); } static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq.c b/block/blk-mq.c index e9bf950983c7..92aae03103b7 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -42,6 +42,7 @@ #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); @@ -579,6 +580,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, 
cpu_online_mask); + if (cpu >= nr_cpu_ids) + goto out_queue_exit; data.ctx = __blk_mq_get_ctx(q, cpu); if (!q->elevator) @@ -2083,14 +2086,10 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { - int cpu = get_cpu(); - if (cpumask_test_cpu(cpu, hctx->cpumask)) { + if (cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); - put_cpu(); return; } - - put_cpu(); } kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, @@ -2141,20 +2140,6 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) EXPORT_SYMBOL(blk_mq_run_hw_queue); /* - * Is the request queue handled by an IO scheduler that does not respect - * hardware queues when dispatching? - */ -static bool blk_mq_has_sqsched(struct request_queue *q) -{ - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.dispatch_request && - !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE)) - return true; - return false; -} - -/* * Return prefered queue to dispatch from (if any) for non-mq aware IO * scheduler. */ @@ -2168,7 +2153,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) * just causes lock contention inside the scheduler and pointless cache * bouncing. */ - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx); + struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; if (!blk_mq_hctx_stopped(hctx)) return hctx; @@ -2186,7 +2171,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async) unsigned long i; sq_hctx = NULL; - if (blk_mq_has_sqsched(q)) + if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) @@ -2214,7 +2199,7 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) unsigned long i; sq_hctx = NULL; - if (blk_mq_has_sqsched(q)) + if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) @@ -2777,19 +2762,32 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, return NULL; } - rq_qos_throttle(q, *bio); - if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) return NULL; - rq->cmd_flags = (*bio)->bi_opf; + /* + * If any qos ->throttle() end up blocking, we will have flushed the + * plug and hence killed the cached_rq list as well. Pop this entry + * before we throttle. + */ plug->cached_rq = rq_list_next(rq); + rq_qos_throttle(q, *bio); + + rq->cmd_flags = (*bio)->bi_opf; INIT_LIST_HEAD(&rq->queuelist); return rq; } +static void bio_set_ioprio(struct bio *bio) +{ + /* Nobody set ioprio so far? Initialize it based on task's nice value */ + if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) + bio->bi_ioprio = get_current_ioprio(); + blkcg_set_ioprio(bio); +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. 
@@ -2819,6 +2817,8 @@ void blk_mq_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return; + bio_set_ioprio(bio); + rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); if (!rq) { if (!bio) @@ -3443,8 +3443,9 @@ static void blk_mq_exit_hctx(struct request_queue *q, if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); - blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], - set->queue_depth, flush_rq); + if (blk_queue_init_done(q)) + blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], + set->queue_depth, flush_rq); if (set->ops->exit_request) set->ops->exit_request(set, flush_rq, hctx_idx); @@ -4438,12 +4439,14 @@ static bool blk_mq_elv_switch_none(struct list_head *head, if (!qe) return false; + /* q->elevator needs protection from ->sysfs_lock */ + mutex_lock(&q->sysfs_lock); + INIT_LIST_HEAD(&qe->node); qe->q = q; qe->type = q->elevator->type; list_add(&qe->node, head); - mutex_lock(&q->sysfs_lock); /* * After elevator_switch_mq, the previous elevator_queue will be * released by elevator_release. The reference of the io scheduler diff --git a/block/blk-mq.h b/block/blk-mq.h index 2615bd58bad3..e4c6fe2c8ac8 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -86,16 +86,16 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } -static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags) +static inline enum hctx_type blk_mq_get_hctx_type(unsigned int opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* * The caller ensure that if REQ_POLLED, poll must be enabled. */ - if (flags & REQ_POLLED) + if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; - else if ((flags & REQ_OP_MASK) == REQ_OP_READ) + else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; return type; } @@ -103,14 +103,14 @@ static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags) /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue - * @flags: request command flags + * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). 
* @ctx: software queue cpu ctx */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, - unsigned int flags, + unsigned int opf, struct blk_mq_ctx *ctx) { - return ctx->hctxs[blk_mq_get_hctx_type(flags)]; + return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index e83af7bc7591..d3a75693adbf 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -294,8 +294,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, void rq_qos_exit(struct request_queue *q) { - blk_mq_debugfs_unregister_queue_rqos(q); - while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 68267007da1c..0e46052b018a 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -104,8 +104,11 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) blk_mq_unfreeze_queue(q); - if (rqos->ops->debugfs_attrs) + if (rqos->ops->debugfs_attrs) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); + } } static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) @@ -129,7 +132,9 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) blk_mq_unfreeze_queue(q); + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); } typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 88bd41d4cb59..69e53d1a4f0e 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -274,6 +274,11 @@ static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page return queue_var_show(q->limits.virt_boundary_mask, page); } +static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_dma_alignment(q), page); +} + #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ static ssize_t \ queue_##name##_show(struct request_queue *q, char *page) \ @@ -606,6 +611,7 @@ QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); +QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); @@ -667,6 +673,7 @@ static struct attribute *queue_attrs[] = { &blk_throtl_sample_time_entry.attr, #endif &queue_virt_boundary_mask_entry.attr, + &queue_dma_alignment_entry.attr, NULL, }; @@ -779,20 +786,12 @@ static void blk_release_queue(struct kobject *kobj) if (queue_is_mq(q)) blk_mq_release(q); - blk_trace_shutdown(q); - mutex_lock(&q->debugfs_mutex); - debugfs_remove_recursive(q->debugfs_dir); - mutex_unlock(&q->debugfs_mutex); - - if (queue_is_mq(q)) - blk_mq_debugfs_unregister(q); - bioset_exit(&q->bio_split); if (blk_queue_has_srcu(q)) cleanup_srcu_struct(q->srcu); - ida_simple_remove(&blk_queue_ida, q->id); + ida_free(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } @@ -836,17 +835,16 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } + if (queue_is_mq(q)) + __blk_mq_register_dev(dev, q); + mutex_lock(&q->sysfs_lock); + mutex_lock(&q->debugfs_mutex); q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), blk_debugfs_root); - mutex_unlock(&q->debugfs_mutex); - - if (queue_is_mq(q)) { - __blk_mq_register_dev(dev, q); + if (queue_is_mq(q)) 
blk_mq_debugfs_register(q); - } - - mutex_lock(&q->sysfs_lock); + mutex_unlock(&q->debugfs_mutex); ret = disk_register_independent_access_ranges(disk, NULL); if (ret) @@ -948,8 +946,15 @@ void blk_unregister_queue(struct gendisk *disk) /* Now that we've deleted all child objects, we can delete the queue. */ kobject_uevent(&q->kobj, KOBJ_REMOVE); kobject_del(&q->kobj); - mutex_unlock(&q->sysfs_dir_lock); + mutex_lock(&q->debugfs_mutex); + blk_trace_shutdown(q); + debugfs_remove_recursive(q->debugfs_dir); + q->debugfs_dir = NULL; + q->sched_debugfs_dir = NULL; + q->rqos_debugfs_dir = NULL; + mutex_unlock(&q->debugfs_mutex); + kobject_put(&disk_to_dev(disk)->kobj); } diff --git a/block/blk.h b/block/blk.h index 434017701403..8e79296ee97a 100644 --- a/block/blk.h +++ b/block/blk.h @@ -159,6 +159,19 @@ static inline bool blk_discard_mergable(struct request *req) return false; } +static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, + int op) +{ + if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) + return min(q->limits.max_discard_sectors, + UINT_MAX >> SECTOR_SHIFT); + + if (unlikely(op == REQ_OP_WRITE_ZEROES)) + return q->limits.max_write_zeroes_sectors; + + return q->limits.max_sectors; +} + #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); diff --git a/block/bounce.c b/block/bounce.c index 8f7b6fe3b4db..c8f487af7be3 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -205,19 +205,26 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) int rw = bio_data_dir(*bio_orig); struct bio_vec *to, from; struct bvec_iter iter; - unsigned i = 0; + unsigned i = 0, bytes = 0; bool bounce = false; - int sectors = 0; + int sectors; bio_for_each_segment(from, *bio_orig, iter) { if (i++ < BIO_MAX_VECS) - sectors += from.bv_len >> 9; + bytes += from.bv_len; if (PageHighMem(from.bv_page)) bounce = true; } if (!bounce) return; + /* + * Individual bvecs might not be logical block aligned. Round down + * the split size so that each bio is properly block size aligned, + * even if we do not use the full hardware limits. 
+ */ + sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> + SECTOR_SHIFT; if (sectors < bio_sectors(*bio_orig)) { bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); bio_chain(bio, *bio_orig); diff --git a/block/fops.c b/block/fops.c index d6b3276a6c68..86d3cab9bf93 100644 --- a/block/fops.c +++ b/block/fops.c @@ -42,6 +42,13 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb) return op; } +static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, + struct iov_iter *iter) +{ + return pos & (bdev_logical_block_size(bdev) - 1) || + !bdev_iter_is_aligned(bdev, iter); +} + #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, @@ -54,8 +61,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct bio bio; ssize_t ret; - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) + if (blkdev_dio_unaligned(bdev, pos, iter)) return -EINVAL; if (nr_pages <= DIO_INLINE_BIO_VECS) @@ -173,8 +179,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; int ret = 0; - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) + if (blkdev_dio_unaligned(bdev, pos, iter)) return -EINVAL; if (iocb->ki_flags & IOCB_ALLOC_CACHE) @@ -298,8 +303,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, loff_t pos = iocb->ki_pos; int ret = 0; - if ((pos | iov_iter_alignment(iter)) & - (bdev_logical_block_size(bdev) - 1)) + if (blkdev_dio_unaligned(bdev, pos, iter)) return -EINVAL; if (iocb->ki_flags & IOCB_ALLOC_CACHE) diff --git a/block/genhd.c b/block/genhd.c index 27205ae47d59..278227ba1d53 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -623,6 +623,7 @@ void del_gendisk(struct gendisk *disk) * Prevent new I/O from crossing bio_queue_enter(). */ blk_queue_start_drain(q); + blk_mq_freeze_queue_wait(q); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -646,12 +647,21 @@ void del_gendisk(struct gendisk *disk) pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); - blk_mq_freeze_queue_wait(q); - blk_throtl_cancel_bios(disk->queue); blk_sync_queue(q); blk_flush_integrity(); + blk_mq_cancel_work_sync(q); + + blk_mq_quiesce_queue(q); + if (q->elevator) { + mutex_lock(&q->sysfs_lock); + elevator_exit(q); + mutex_unlock(&q->sysfs_lock); + } + rq_qos_exit(q); + blk_mq_unquiesce_queue(q); + /* * Allow using passthrough request again after the queue is torn down. */ @@ -1120,31 +1130,6 @@ static const struct attribute_group *disk_attr_groups[] = { NULL }; -static void disk_release_mq(struct request_queue *q) -{ - blk_mq_cancel_work_sync(q); - - /* - * There can't be any non non-passthrough bios in flight here, but - * requests stay around longer, including passthrough ones so we - * still need to freeze the queue here. - */ - blk_mq_freeze_queue(q); - - /* - * Since the I/O scheduler exit code may access cgroup information, - * perform I/O scheduler exit before disassociating from the block - * cgroup controller. 
- */ - if (q->elevator) { - mutex_lock(&q->sysfs_lock); - elevator_exit(q); - mutex_unlock(&q->sysfs_lock); - } - rq_qos_exit(q); - __blk_mq_unfreeze_queue(q, true); -} - /** * disk_release - releases all allocated resources of the gendisk * @dev: the device representing this disk @@ -1166,9 +1151,6 @@ static void disk_release(struct device *dev) might_sleep(); WARN_ON_ONCE(disk_live(disk)); - if (queue_is_mq(disk->queue)) - disk_release_mq(disk->queue); - blkcg_exit_queue(disk->queue); disk_release_events(disk); diff --git a/block/holder.c b/block/holder.c index 8d750281a1cd..5283bc804cc1 100644 --- a/block/holder.c +++ b/block/holder.c @@ -79,10 +79,6 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) WARN_ON_ONCE(!bdev->bd_holder); - /* FIXME: remove the following once add_disk() handles errors */ - if (WARN_ON(!bdev->bd_holder_dir)) - goto out_unlock; - holder = bd_find_holder_disk(bdev, disk); if (holder) { holder->refcnt++; diff --git a/block/ioprio.c b/block/ioprio.c index 2fe068fcaad5..32a456b45804 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -138,6 +138,32 @@ out: return ret; } +/* + * If the task has set an I/O priority, use that. Otherwise, return + * the default I/O priority. + * + * Expected to be called for current task or with task_lock() held to keep + * io_context stable. + */ +int __get_task_ioprio(struct task_struct *p) +{ + struct io_context *ioc = p->io_context; + int prio; + + if (p != current) + lockdep_assert_held(&p->alloc_lock); + if (ioc) + prio = ioc->ioprio; + else + prio = IOPRIO_DEFAULT; + + if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) + prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), + task_nice_ioprio(p)); + return prio; +} +EXPORT_SYMBOL_GPL(__get_task_ioprio); + static int get_task_ioprio(struct task_struct *p) { int ret; @@ -145,22 +171,38 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = IOPRIO_DEFAULT; + task_lock(p); + ret = __get_task_ioprio(p); + task_unlock(p); +out: + return ret; +} + +/* + * Return raw IO priority value as set by userspace. We use this for + * ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and + * also so that userspace can distinguish unset IO priority (which just gets + * overriden based on task's nice value) from IO priority set to some value. 
+ */ +static int get_task_raw_ioprio(struct task_struct *p) +{ + int ret; + + ret = security_task_getioprio(p); + if (ret) + goto out; task_lock(p); if (p->io_context) ret = p->io_context->ioprio; + else + ret = IOPRIO_DEFAULT; task_unlock(p); out: return ret; } -int ioprio_best(unsigned short aprio, unsigned short bprio) +static int ioprio_best(unsigned short aprio, unsigned short bprio) { - if (!ioprio_valid(aprio)) - aprio = IOPRIO_DEFAULT; - if (!ioprio_valid(bprio)) - bprio = IOPRIO_DEFAULT; - return min(aprio, bprio); } @@ -181,7 +223,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) else p = find_task_by_vpid(who); if (p) - ret = get_task_ioprio(p); + ret = get_task_raw_ioprio(p); break; case IOPRIO_WHO_PGRP: if (!who) diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index 70ff2a599ef6..8f7c745b4a57 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -421,6 +421,8 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) blk_stat_enable_accounting(q); + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); + eq->elevator_data = kqd; q->elevator = eq; @@ -1033,7 +1035,6 @@ static struct elevator_type kyber_sched = { #endif .elevator_attrs = kyber_sched_attrs, .elevator_name = "kyber", - .elevator_features = ELEVATOR_F_MQ_AWARE, .elevator_owner = THIS_MODULE, }; diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 6ed602b2f80a..1a9e835e816c 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -642,6 +642,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + q->elevator = eq; return 0; |