Diffstat (limited to 'block')
-rw-r--r--  block/bfq-cgroup.c       |   8
-rw-r--r--  block/bfq-iosched.c      |   6
-rw-r--r--  block/bfq-wf2q.c         |   5
-rw-r--r--  block/bio.c              | 134
-rw-r--r--  block/blk-cgroup.c       |   1
-rw-r--r--  block/blk-core.c         |  17
-rw-r--r--  block/blk-ia-ranges.c    |   1
-rw-r--r--  block/blk-ioc.c          |   2
-rw-r--r--  block/blk-iocost.c       |   2
-rw-r--r--  block/blk-ioprio.c       |  57
-rw-r--r--  block/blk-ioprio.h       |   9
-rw-r--r--  block/blk-merge.c        |  69
-rw-r--r--  block/blk-mq-debugfs.c   |  29
-rw-r--r--  block/blk-mq-debugfs.h   |  10
-rw-r--r--  block/blk-mq-sched.c     |  12
-rw-r--r--  block/blk-mq-tag.c       |  18
-rw-r--r--  block/blk-mq-tag.h       |  10
-rw-r--r--  block/blk-mq.c           |  59
-rw-r--r--  block/blk-mq.h           |  12
-rw-r--r--  block/blk-rq-qos.c       |   2
-rw-r--r--  block/blk-rq-qos.h       |   7
-rw-r--r--  block/blk-sysfs.c        |  39
-rw-r--r--  block/blk.h              |  13
-rw-r--r--  block/bounce.c           |  13
-rw-r--r--  block/fops.c             |  16
-rw-r--r--  block/genhd.c            |  42
-rw-r--r--  block/holder.c           |   4
-rw-r--r--  block/ioprio.c           |  58
-rw-r--r--  block/kyber-iosched.c    |   3
-rw-r--r--  block/mq-deadline.c      |   3
30 files changed, 331 insertions, 330 deletions
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 09574af83566..9fc605791b1e 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -706,10 +706,10 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
}
/**
- * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * __bfq_bic_change_cgroup - move @bic to @bfqg.
* @bfqd: the queue descriptor.
* @bic: the bic to move.
- * @blkcg: the blk-cgroup to move to.
+ * @bfqg: the group to move to.
*
* Move bic to blkcg, assuming that bfqd->lock is held; which makes
* sure that the reference to cgroup is valid across the call (see
@@ -863,6 +863,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st)
* @bfqd: the device data structure with the root group.
* @entity: the entity to move, if entity is a leaf; or the parent entity
* of an active leaf entity to move, if entity is not a leaf.
+ * @ioprio_class: I/O priority class to reparent.
*/
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
struct bfq_entity *entity,
@@ -892,6 +893,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
* @bfqd: the device data structure with the root group.
* @bfqg: the group to move from.
* @st: the service tree to start the search from.
+ * @ioprio_class: I/O priority class to reparent.
*/
static void bfq_reparent_active_queues(struct bfq_data *bfqd,
struct bfq_group *bfqg,
@@ -1471,8 +1473,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
return bfqq->bfqd->root_group;
}
-void bfqg_and_blkg_get(struct bfq_group *bfqg) {}
-
void bfqg_and_blkg_put(struct bfq_group *bfqg) {}
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 0d46cb728bbf..e6d7e6b01a05 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -7046,6 +7046,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
spin_unlock_irq(&bfqd->lock);
#endif
+ blk_stat_disable_accounting(bfqd->queue);
wbt_enable_default(bfqd->queue);
kfree(bfqd);
@@ -7188,7 +7189,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfq_init_root_group(bfqd->root_group, bfqd);
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+ /* We dispatch from request queue wide instead of hw queue */
+ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
+
wbt_disable_default(q);
+ blk_stat_enable_accounting(q);
+
return 0;
out_free:
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index f8eb340381cf..983413cdefad 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -1360,6 +1360,8 @@ left:
/**
* __bfq_lookup_next_entity - return the first eligible entity in @st.
* @st: the service tree.
+ * @in_service: whether or not there is an in-service entity for the sched_data
+ * this active tree belongs to.
*
* If there is no in-service entity for the sched_data st belongs to,
* then return the entity that will be set in service if:
@@ -1472,9 +1474,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
break;
}
- if (!entity)
- return NULL;
-
return entity;
}
diff --git a/block/bio.c b/block/bio.c
index f92d0223247b..933ea3210954 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1159,6 +1159,37 @@ static void bio_put_pages(struct page **pages, size_t size, size_t off)
put_page(pages[i]);
}
+static int bio_iov_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ bool same_page = false;
+
+ if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
+ if (WARN_ON_ONCE(bio_full(bio, len)))
+ return -EINVAL;
+ __bio_add_page(bio, page, len, offset);
+ return 0;
+ }
+
+ if (same_page)
+ put_page(page);
+ return 0;
+}
+
+static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ bool same_page = false;
+
+ if (bio_add_hw_page(q, bio, page, len, offset,
+ queue_max_zone_append_sectors(q), &same_page) != len)
+ return -EINVAL;
+ if (same_page)
+ put_page(page);
+ return 0;
+}
+
#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *))
/**
@@ -1177,7 +1208,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
- bool same_page = false;
ssize_t size, left;
unsigned len, i;
size_t offset;
@@ -1186,82 +1216,43 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
* Move page array up in the allocated memory for the bio vecs as far as
* possible so that we can start filling biovecs from the beginning
* without overwriting the temporary page array.
- */
+ */
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
- size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
- if (unlikely(size <= 0))
- return size ? size : -EFAULT;
-
- for (left = size, i = 0; left > 0; left -= len, i++) {
- struct page *page = pages[i];
-
- len = min_t(size_t, PAGE_SIZE - offset, left);
-
- if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
- if (same_page)
- put_page(page);
- } else {
- if (WARN_ON_ONCE(bio_full(bio, len))) {
- bio_put_pages(pages + i, left, offset);
- return -EINVAL;
- }
- __bio_add_page(bio, page, len, offset);
- }
- offset = 0;
- }
-
- iov_iter_advance(iter, size);
- return 0;
-}
-
-static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
-{
- unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
- unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
- struct request_queue *q = bdev_get_queue(bio->bi_bdev);
- unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
- struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
- struct page **pages = (struct page **)bv;
- ssize_t size, left;
- unsigned len, i;
- size_t offset;
- int ret = 0;
-
- if (WARN_ON_ONCE(!max_append_sectors))
- return 0;
-
/*
- * Move page array up in the allocated memory for the bio vecs as far as
- * possible so that we can start filling biovecs from the beginning
- * without overwriting the temporary page array.
+ * Each segment in the iov is required to be a block size multiple.
+ * However, we may not be able to get the entire segment if it spans
+ * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
+ * result to ensure the bio's total size is correct. The remainder of
+ * the iov data will be picked up in the next bio iteration.
*/
- BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
- pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+ if (size > 0)
+ size = ALIGN_DOWN(size, bdev_logical_block_size(bio->bi_bdev));
if (unlikely(size <= 0))
return size ? size : -EFAULT;
for (left = size, i = 0; left > 0; left -= len, i++) {
struct page *page = pages[i];
- bool same_page = false;
+ int ret;
len = min_t(size_t, PAGE_SIZE - offset, left);
- if (bio_add_hw_page(q, bio, page, len, offset,
- max_append_sectors, &same_page) != len) {
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND)
+ ret = bio_iov_add_zone_append_page(bio, page, len,
+ offset);
+ else
+ ret = bio_iov_add_page(bio, page, len, offset);
+
+ if (ret) {
bio_put_pages(pages + i, left, offset);
- ret = -EINVAL;
- break;
+ return ret;
}
- if (same_page)
- put_page(page);
offset = 0;
}
- iov_iter_advance(iter, size - left);
- return ret;
+ iov_iter_advance(iter, size);
+ return 0;
}
/**
@@ -1298,10 +1289,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
}
do {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND)
- ret = __bio_iov_append_get_pages(bio, iter);
- else
- ret = __bio_iov_iter_get_pages(bio, iter);
+ ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
/* don't account direct I/O as memory stall */
@@ -1747,26 +1735,6 @@ bad:
}
EXPORT_SYMBOL(bioset_init);
-/*
- * Initialize and setup a new bio_set, based on the settings from
- * another bio_set.
- */
-int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
-{
- int flags;
-
- flags = 0;
- if (src->bvec_pool.min_nr)
- flags |= BIOSET_NEED_BVECS;
- if (src->rescue_workqueue)
- flags |= BIOSET_NEED_RESCUER;
- if (src->cache)
- flags |= BIOSET_PERCPU_CACHE;
-
- return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags);
-}
-EXPORT_SYMBOL(bioset_init_from_src);
-
static int __init init_bio(void)
{
int i;
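
The rounding added to __bio_iov_iter_get_pages() above is plain power-of-two arithmetic: whatever iov_iter_get_pages() returned is trimmed down to a logical-block multiple, and the remainder stays in the iterator for the next bio. A minimal userspace sketch of that step, using made-up sizes:

/*
 * Sketch of the ALIGN_DOWN step in __bio_iov_iter_get_pages(); the byte
 * counts are made-up example values.  ALIGN_DOWN here assumes a
 * power-of-two alignment, which logical block sizes always are.
 */
#include <stdio.h>

#define ALIGN_DOWN(x, a)	((x) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long got = 5000;	/* bytes iov_iter_get_pages() handed back */
	unsigned long lbs = 512;	/* logical block size of the bdev */

	unsigned long use = ALIGN_DOWN(got, lbs);	/* 4608: goes into this bio */
	unsigned long rest = got - use;			/* 392: left for the next bio */

	printf("add %lu bytes now, leave %lu in the iterator\n", use, rest);
	return 0;
}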
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 764e740b0c0f..6906981563f8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1299,6 +1299,7 @@ int blkcg_init_queue(struct request_queue *q)
ret = blk_iolatency_init(q);
if (ret) {
blk_throtl_exit(q);
+ blk_ioprio_exit(q);
goto err_destroy_all;
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 06ff5bbfe8f6..c2cec402d01c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -322,19 +322,6 @@ void blk_cleanup_queue(struct request_queue *q)
blk_mq_exit_queue(q);
}
- /*
- * In theory, request pool of sched_tags belongs to request queue.
- * However, the current implementation requires tag_set for freeing
- * requests, so free the pool now.
- *
- * Queue has become frozen, there can't be any in-queue requests, so
- * it is safe to free requests now.
- */
- mutex_lock(&q->sysfs_lock);
- if (q->elevator)
- blk_mq_sched_free_rqs(q);
- mutex_unlock(&q->sysfs_lock);
-
/* @q is and will stay empty, shutdown and put */
blk_put_queue(q);
}
@@ -448,7 +435,7 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
q->last_merge = NULL;
- q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
+ q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
if (q->id < 0)
goto fail_srcu;
@@ -498,7 +485,7 @@ fail_stats:
fail_split:
bioset_exit(&q->bio_split);
fail_id:
- ida_simple_remove(&blk_queue_ida, q->id);
+ ida_free(&blk_queue_ida, q->id);
fail_srcu:
if (alloc_srcu)
cleanup_srcu_struct(q->srcu);
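
The ida_simple_get()/ida_simple_remove() pair above is replaced by the plain ida_alloc()/ida_free() API, which hands out the smallest free ID starting at 0 with no upper bound, exactly what the old call with min=0, max=0 did. A small kernel-style usage sketch (the "example_ida" and example_get_id() names are made up for illustration):

/*
 * Kernel-style sketch of the ida_simple_get() -> ida_alloc() conversion.
 */
#include <linux/idr.h>
#include <linux/gfp.h>

static DEFINE_IDA(example_ida);

static int example_get_id(void)
{
	int id = ida_alloc(&example_ida, GFP_KERNEL);	/* smallest free ID >= 0 */

	if (id < 0)
		return id;		/* e.g. -ENOMEM */
	/* ... use id ... */
	ida_free(&example_ida, id);	/* was ida_simple_remove() */
	return 0;
}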
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
index 56ed48d2954e..47c89e65b57f 100644
--- a/block/blk-ia-ranges.c
+++ b/block/blk-ia-ranges.c
@@ -144,7 +144,6 @@ int disk_register_independent_access_ranges(struct gendisk *disk,
}
for (i = 0; i < iars->nr_ia_ranges; i++) {
- iars->ia_range[i].queue = q;
ret = kobject_init_and_add(&iars->ia_range[i].kobj,
&blk_ia_range_ktype, &iars->kobj,
"%d", i);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index df9cfe4ca532..63fc02042408 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -247,6 +247,8 @@ static struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
#endif
+ ioc->ioprio = IOPRIO_DEFAULT;
+
return ioc;
}
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 33a11ba971ea..b7082f2aed9c 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -2769,7 +2769,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
return;
- switch (req_op(rq) & REQ_OP_MASK) {
+ switch (req_op(rq)) {
case REQ_OP_READ:
pidx = QOS_RLAT;
rw = READ;
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
index 79e797f5d194..c00060a02c6e 100644
--- a/block/blk-ioprio.c
+++ b/block/blk-ioprio.c
@@ -62,7 +62,6 @@ struct ioprio_blkg {
struct ioprio_blkcg {
struct blkcg_policy_data cpd;
enum prio_policy prio_policy;
- bool prio_set;
};
static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
@@ -113,7 +112,6 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
if (ret < 0)
return ret;
blkcg->prio_policy = ret;
- blkcg->prio_set = true;
return nbytes;
}
@@ -183,26 +181,20 @@ static struct blkcg_policy ioprio_policy = {
.pd_free_fn = ioprio_free_pd,
};
-struct blk_ioprio {
- struct rq_qos rqos;
-};
-
-static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
- struct bio *bio)
+void blkcg_set_ioprio(struct bio *bio)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
u16 prio;
- if (!blkcg->prio_set)
+ if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
return;
/*
* Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
* correspond to a lower priority. Hence, the max_t() below selects
* the lower priority of bi_ioprio and the cgroup I/O priority class.
- * If the cgroup policy has been set to POLICY_NO_CHANGE == 0, the
- * bio I/O priority is not modified. If the bio I/O priority equals
- * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio.
+ * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O
+ * priority is assigned to the bio.
*/
prio = max_t(u16, bio->bi_ioprio,
IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
@@ -210,49 +202,14 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
bio->bi_ioprio = prio;
}
-static void blkcg_ioprio_exit(struct rq_qos *rqos)
+void blk_ioprio_exit(struct request_queue *q)
{
- struct blk_ioprio *blkioprio_blkg =
- container_of(rqos, typeof(*blkioprio_blkg), rqos);
-
- blkcg_deactivate_policy(rqos->q, &ioprio_policy);
- kfree(blkioprio_blkg);
+ blkcg_deactivate_policy(q, &ioprio_policy);
}
-static struct rq_qos_ops blkcg_ioprio_ops = {
- .track = blkcg_ioprio_track,
- .exit = blkcg_ioprio_exit,
-};
-
int blk_ioprio_init(struct request_queue *q)
{
- struct blk_ioprio *blkioprio_blkg;
- struct rq_qos *rqos;
- int ret;
-
- blkioprio_blkg = kzalloc(sizeof(*blkioprio_blkg), GFP_KERNEL);
- if (!blkioprio_blkg)
- return -ENOMEM;
-
- ret = blkcg_activate_policy(q, &ioprio_policy);
- if (ret) {
- kfree(blkioprio_blkg);
- return ret;
- }
-
- rqos = &blkioprio_blkg->rqos;
- rqos->id = RQ_QOS_IOPRIO;
- rqos->ops = &blkcg_ioprio_ops;
- rqos->q = q;
-
- /*
- * Registering the rq-qos policy after activating the blk-cgroup
- * policy guarantees that ioprio_blkcg_from_bio(bio) != NULL in the
- * rq-qos callbacks.
- */
- rq_qos_add(q, rqos);
-
- return 0;
+ return blkcg_activate_policy(q, &ioprio_policy);
}
static int __init ioprio_init(void)
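
blkcg_set_ioprio() above relies on the numeric layout of I/O priorities: the class sits in the top bits of the 16-bit value and a numerically larger value means a lower priority, so max_t() picks the more restrictive of the bio's and the cgroup's settings, and a bio left at IOPRIO_CLASS_NONE (0) simply inherits the cgroup class. A userspace sketch of that selection, with made-up sample priorities (the macros mirror the uapi definitions, IOPRIO_CLASS_SHIFT == 13):

/*
 * Userspace sketch of the priority selection in blkcg_set_ioprio().
 */
#include <stdio.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_PRIO_CLASS(val)		((val) >> IOPRIO_CLASS_SHIFT)

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };

int main(void)
{
	unsigned short bio_prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
	unsigned short cgrp_prio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0);

	/* Larger value == lower priority, so max() is the more restrictive one. */
	unsigned short prio = bio_prio > cgrp_prio ? bio_prio : cgrp_prio;

	printf("effective class %d\n", IOPRIO_PRIO_CLASS(prio));	/* 3 (idle) */
	return 0;
}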
diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h
index a7785c2f1aea..5a1eb550e178 100644
--- a/block/blk-ioprio.h
+++ b/block/blk-ioprio.h
@@ -6,14 +6,23 @@
#include <linux/kconfig.h>
struct request_queue;
+struct bio;
#ifdef CONFIG_BLK_CGROUP_IOPRIO
int blk_ioprio_init(struct request_queue *q);
+void blk_ioprio_exit(struct request_queue *q);
+void blkcg_set_ioprio(struct bio *bio);
#else
static inline int blk_ioprio_init(struct request_queue *q)
{
return 0;
}
+static inline void blk_ioprio_exit(struct request_queue *q)
+{
+}
+static inline void blkcg_set_ioprio(struct bio *bio)
+{
+}
#endif
#endif /* _BLK_IOPRIO_H_ */
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7771dacc99cb..0f5f42ebd0bb 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -164,18 +164,21 @@ static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
static inline unsigned get_max_io_size(struct request_queue *q,
struct bio *bio)
{
- unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0);
- unsigned max_sectors = sectors;
unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
- unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1);
+ unsigned max_sectors = queue_max_sectors(q), start, end;
- max_sectors += start_offset;
- max_sectors &= ~(pbs - 1);
- if (max_sectors > start_offset)
- return max_sectors - start_offset;
+ if (q->limits.chunk_sectors) {
+ max_sectors = min(max_sectors,
+ blk_chunk_sectors_left(bio->bi_iter.bi_sector,
+ q->limits.chunk_sectors));
+ }
- return sectors & ~(lbs - 1);
+ start = bio->bi_iter.bi_sector & (pbs - 1);
+ end = (start + max_sectors) & ~(pbs - 1);
+ if (end > start)
+ return end - start;
+ return max_sectors & ~(lbs - 1);
}
static inline unsigned get_max_segment_size(const struct request_queue *q,
@@ -201,11 +204,11 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
* @nsegs: [in,out] Number of segments in the bio being built. Incremented
* by the number of segments from @bv that may be appended to that
* bio without exceeding @max_segs
- * @sectors: [in,out] Number of sectors in the bio being built. Incremented
- * by the number of sectors from @bv that may be appended to that
- * bio without exceeding @max_sectors
+ * @bytes: [in,out] Number of bytes in the bio being built. Incremented
+ * by the number of bytes from @bv that may be appended to that
+ * bio without exceeding @max_bytes
* @max_segs: [in] upper bound for *@nsegs
- * @max_sectors: [in] upper bound for *@sectors
+ * @max_bytes: [in] upper bound for *@bytes
*
* When splitting a bio, it can happen that a bvec is encountered that is too
* big to fit in a single segment and hence that it has to be split in the
@@ -216,10 +219,10 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
*/
static bool bvec_split_segs(const struct request_queue *q,
const struct bio_vec *bv, unsigned *nsegs,
- unsigned *sectors, unsigned max_segs,
- unsigned max_sectors)
+ unsigned *bytes, unsigned max_segs,
+ unsigned max_bytes)
{
- unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
+ unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
unsigned len = min(bv->bv_len, max_len);
unsigned total_len = 0;
unsigned seg_size = 0;
@@ -237,7 +240,7 @@ static bool bvec_split_segs(const struct request_queue *q,
break;
}
- *sectors += total_len >> 9;
+ *bytes += total_len;
/* tell the caller to split the bvec if it is too big to fit */
return len > 0 || bv->bv_len > max_len;
@@ -269,8 +272,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
- unsigned nsegs = 0, sectors = 0;
- const unsigned max_sectors = get_max_io_size(q, bio);
+ unsigned nsegs = 0, bytes = 0;
+ const unsigned max_bytes = get_max_io_size(q, bio) << 9;
const unsigned max_segs = queue_max_segments(q);
bio_for_each_bvec(bv, bio, iter) {
@@ -282,12 +285,12 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
goto split;
if (nsegs < max_segs &&
- sectors + (bv.bv_len >> 9) <= max_sectors &&
+ bytes + bv.bv_len <= max_bytes &&
bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
nsegs++;
- sectors += bv.bv_len >> 9;
- } else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
- max_sectors)) {
+ bytes += bv.bv_len;
+ } else if (bvec_split_segs(q, &bv, &nsegs, &bytes, max_segs,
+ max_bytes)) {
goto split;
}
@@ -301,12 +304,19 @@ split:
*segs = nsegs;
/*
+ * Individual bvecs might not be logical block aligned. Round down the
+ * split size so that each bio is properly block size aligned, even if
+ * we do not use the full hardware limits.
+ */
+ bytes = ALIGN_DOWN(bytes, queue_logical_block_size(q));
+
+ /*
* Bio splitting may cause subtle trouble such as hang when doing sync
* iopoll in direct IO routine. Given performance gain of iopoll for
* big IO can be trival, disable iopoll when split needed.
*/
bio_clear_polled(bio);
- return bio_split(bio, sectors, GFP_NOIO, bs);
+ return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
}
/**
@@ -375,7 +385,7 @@ EXPORT_SYMBOL(blk_queue_split);
unsigned int blk_recalc_rq_segments(struct request *rq)
{
unsigned int nr_phys_segs = 0;
- unsigned int nr_sectors = 0;
+ unsigned int bytes = 0;
struct req_iterator iter;
struct bio_vec bv;
@@ -398,7 +408,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
}
rq_for_each_bvec(bv, rq, iter)
- bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
+ bvec_split_segs(rq->q, &bv, &nr_phys_segs, &bytes,
UINT_MAX, UINT_MAX);
return nr_phys_segs;
}
@@ -559,17 +569,18 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
struct request_queue *q = rq->q;
+ unsigned int max_sectors;
if (blk_rq_is_passthrough(rq))
return q->limits.max_hw_sectors;
+ max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
if (!q->limits.chunk_sectors ||
req_op(rq) == REQ_OP_DISCARD ||
req_op(rq) == REQ_OP_SECURE_ERASE)
- return blk_queue_get_max_sectors(q, req_op(rq));
-
- return min(blk_max_size_offset(q, offset, 0),
- blk_queue_get_max_sectors(q, req_op(rq)));
+ return max_sectors;
+ return min(max_sectors,
+ blk_chunk_sectors_left(offset, q->limits.chunk_sectors));
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
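
The reworked get_max_io_size() above first caps the I/O at whatever is left in the current chunk and then rounds the end down to a physical-block boundary. For the common power-of-two case the chunk arithmetic reduces to the small helper in this userspace sketch; all device limits here are made-up example values:

/*
 * Userspace sketch of the get_max_io_size() arithmetic (power-of-two
 * chunk_sectors case only).
 */
#include <stdio.h>

typedef unsigned long long sector_t;

static unsigned int chunk_sectors_left(sector_t offset, unsigned int chunk)
{
	return chunk - (offset & (chunk - 1));	/* sectors left in this chunk */
}

int main(void)
{
	sector_t sector = 1000;		/* bio start sector */
	unsigned int max_sectors = 256;	/* queue_max_sectors() */
	unsigned int chunk = 128;	/* q->limits.chunk_sectors */
	unsigned int pbs = 8;		/* physical block size, in sectors */

	unsigned int left = chunk_sectors_left(sector, chunk);	/* 24 */
	if (left < max_sectors)
		max_sectors = left;

	/* Round the end of the I/O down to a physical block boundary. */
	unsigned int start = sector & (pbs - 1);		/* 0 */
	unsigned int end = (start + max_sectors) & ~(pbs - 1);	/* 24 */

	printf("split at %u sectors\n", end > start ? end - start : max_sectors);
	return 0;
}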
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7e4136a60e1c..4d1ce9ef4318 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -711,11 +711,6 @@ void blk_mq_debugfs_register(struct request_queue *q)
}
}
-void blk_mq_debugfs_unregister(struct request_queue *q)
-{
- q->sched_debugfs_dir = NULL;
-}
-
static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
@@ -746,6 +741,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
{
+ if (!hctx->queue->debugfs_dir)
+ return;
debugfs_remove_recursive(hctx->debugfs_dir);
hctx->sched_debugfs_dir = NULL;
hctx->debugfs_dir = NULL;
@@ -773,6 +770,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
{
struct elevator_type *e = q->elevator->type;
+ lockdep_assert_held(&q->debugfs_mutex);
+
/*
* If the parent directory has not been created yet, return, we will be
* called again later on and the directory/files will be created then.
@@ -790,6 +789,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
void blk_mq_debugfs_unregister_sched(struct request_queue *q)
{
+ lockdep_assert_held(&q->debugfs_mutex);
+
debugfs_remove_recursive(q->sched_debugfs_dir);
q->sched_debugfs_dir = NULL;
}
@@ -811,6 +812,10 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
{
+ lockdep_assert_held(&rqos->q->debugfs_mutex);
+
+ if (!rqos->q->debugfs_dir)
+ return;
debugfs_remove_recursive(rqos->debugfs_dir);
rqos->debugfs_dir = NULL;
}
@@ -820,6 +825,8 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
struct request_queue *q = rqos->q;
const char *dir_name = rq_qos_id_to_name(rqos->id);
+ lockdep_assert_held(&q->debugfs_mutex);
+
if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
return;
@@ -833,17 +840,13 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
}
-void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
-{
- debugfs_remove_recursive(q->rqos_debugfs_dir);
- q->rqos_debugfs_dir = NULL;
-}
-
void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx)
{
struct elevator_type *e = q->elevator->type;
+ lockdep_assert_held(&q->debugfs_mutex);
+
/*
* If the parent debugfs directory has not been created yet, return;
* We will be called again later on with appropriate parent debugfs
@@ -863,6 +866,10 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
{
+ lockdep_assert_held(&hctx->queue->debugfs_mutex);
+
+ if (!hctx->queue->debugfs_dir)
+ return;
debugfs_remove_recursive(hctx->sched_debugfs_dir);
hctx->sched_debugfs_dir = NULL;
}
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 69918f4170d6..9c7d4b6117d4 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -21,7 +21,6 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
void blk_mq_debugfs_register(struct request_queue *q);
-void blk_mq_debugfs_unregister(struct request_queue *q);
void blk_mq_debugfs_register_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx);
void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx);
@@ -36,16 +35,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
-void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q);
#else
static inline void blk_mq_debugfs_register(struct request_queue *q)
{
}
-static inline void blk_mq_debugfs_unregister(struct request_queue *q)
-{
-}
-
static inline void blk_mq_debugfs_register_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx)
{
@@ -87,10 +81,6 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
{
}
-
-static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
-{
-}
#endif
#ifdef CONFIG_BLK_DEBUG_FS_ZONED
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 9e56a69422b6..a4f7c101b53b 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -564,6 +564,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
int ret;
if (!e) {
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = NULL;
q->nr_requests = q->tag_set->queue_depth;
return 0;
@@ -593,7 +594,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
if (ret)
goto err_free_map_and_rqs;
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_sched(q);
+ mutex_unlock(&q->debugfs_mutex);
queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) {
@@ -606,7 +609,9 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return ret;
}
}
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_sched_hctx(q, hctx);
+ mutex_unlock(&q->debugfs_mutex);
}
return 0;
@@ -647,14 +652,21 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_sched_hctx(hctx);
+ mutex_unlock(&q->debugfs_mutex);
+
if (e->type->ops.exit_hctx && hctx->sched_data) {
e->type->ops.exit_hctx(hctx, i);
hctx->sched_data = NULL;
}
flags = hctx->flags;
}
+
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_sched(q);
+ mutex_unlock(&q->debugfs_mutex);
+
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q, flags);
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 2dcd738c6952..3cfffef1feb3 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -37,29 +37,25 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
* to get tag when first time, the other shared-tag users could reserve
* budget for it.
*/
-bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
- if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
- test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
- return true;
- }
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
+ return;
+ set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags);
} else {
- if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
- test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
- return true;
- }
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+ return;
+ set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state);
}
users = atomic_inc_return(&hctx->tags->active_queues);
blk_mq_update_wake_batch(hctx->tags, users);
-
- return true;
}
/*
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 5668e28be0b7..91ff37e3b43d 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -47,15 +47,13 @@ enum {
BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
};
-extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+extern void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
-static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
- if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
- return false;
-
- return __blk_mq_tag_busy(hctx);
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_tag_busy(hctx);
}
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index e9bf950983c7..92aae03103b7 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -42,6 +42,7 @@
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+#include "blk-ioprio.h"
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
@@ -579,6 +580,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ goto out_queue_exit;
data.ctx = __blk_mq_get_ctx(q, cpu);
if (!q->elevator)
@@ -2083,14 +2086,10 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
return;
if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
- int cpu = get_cpu();
- if (cpumask_test_cpu(cpu, hctx->cpumask)) {
+ if (cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
__blk_mq_run_hw_queue(hctx);
- put_cpu();
return;
}
-
- put_cpu();
}
kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
@@ -2141,20 +2140,6 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
EXPORT_SYMBOL(blk_mq_run_hw_queue);
/*
- * Is the request queue handled by an IO scheduler that does not respect
- * hardware queues when dispatching?
- */
-static bool blk_mq_has_sqsched(struct request_queue *q)
-{
- struct elevator_queue *e = q->elevator;
-
- if (e && e->type->ops.dispatch_request &&
- !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE))
- return true;
- return false;
-}
-
-/*
* Return prefered queue to dispatch from (if any) for non-mq aware IO
* scheduler.
*/
@@ -2168,7 +2153,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
* just causes lock contention inside the scheduler and pointless cache
* bouncing.
*/
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, 0, ctx);
+ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];
if (!blk_mq_hctx_stopped(hctx))
return hctx;
@@ -2186,7 +2171,7 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
unsigned long i;
sq_hctx = NULL;
- if (blk_mq_has_sqsched(q))
+ if (blk_queue_sq_sched(q))
sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
@@ -2214,7 +2199,7 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
unsigned long i;
sq_hctx = NULL;
- if (blk_mq_has_sqsched(q))
+ if (blk_queue_sq_sched(q))
sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
@@ -2777,19 +2762,32 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q,
return NULL;
}
- rq_qos_throttle(q, *bio);
-
if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type)
return NULL;
if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf))
return NULL;
- rq->cmd_flags = (*bio)->bi_opf;
+ /*
+ * If any qos ->throttle() end up blocking, we will have flushed the
+ * plug and hence killed the cached_rq list as well. Pop this entry
+ * before we throttle.
+ */
plug->cached_rq = rq_list_next(rq);
+ rq_qos_throttle(q, *bio);
+
+ rq->cmd_flags = (*bio)->bi_opf;
INIT_LIST_HEAD(&rq->queuelist);
return rq;
}
+static void bio_set_ioprio(struct bio *bio)
+{
+ /* Nobody set ioprio so far? Initialize it based on task's nice value */
+ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
+ bio->bi_ioprio = get_current_ioprio();
+ blkcg_set_ioprio(bio);
+}
+
/**
* blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
@@ -2819,6 +2817,8 @@ void blk_mq_submit_bio(struct bio *bio)
if (!bio_integrity_prep(bio))
return;
+ bio_set_ioprio(bio);
+
rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs);
if (!rq) {
if (!bio)
@@ -3443,8 +3443,9 @@ static void blk_mq_exit_hctx(struct request_queue *q,
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
- blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
- set->queue_depth, flush_rq);
+ if (blk_queue_init_done(q))
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
+ set->queue_depth, flush_rq);
if (set->ops->exit_request)
set->ops->exit_request(set, flush_rq, hctx_idx);
@@ -4438,12 +4439,14 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
if (!qe)
return false;
+ /* q->elevator needs protection from ->sysfs_lock */
+ mutex_lock(&q->sysfs_lock);
+
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
list_add(&qe->node, head);
- mutex_lock(&q->sysfs_lock);
/*
* After elevator_switch_mq, the previous elevator_queue will be
* released by elevator_release. The reference of the io scheduler
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 2615bd58bad3..e4c6fe2c8ac8 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -86,16 +86,16 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
}
-static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
+static inline enum hctx_type blk_mq_get_hctx_type(unsigned int opf)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
* The caller ensure that if REQ_POLLED, poll must be enabled.
*/
- if (flags & REQ_POLLED)
+ if (opf & REQ_POLLED)
type = HCTX_TYPE_POLL;
- else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
+ else if ((opf & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return type;
}
@@ -103,14 +103,14 @@ static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
/*
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue
- * @flags: request command flags
+ * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED).
* @ctx: software queue cpu ctx
*/
static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
- unsigned int flags,
+ unsigned int opf,
struct blk_mq_ctx *ctx)
{
- return ctx->hctxs[blk_mq_get_hctx_type(flags)];
+ return ctx->hctxs[blk_mq_get_hctx_type(opf)];
}
/*
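
The renamed opf argument above carries both the REQ_OP_* code (low bits) and modifier flags such as REQ_POLLED, and blk_mq_get_hctx_type() uses them to pick the hardware-queue map. A userspace sketch of that selection; the REQ_POLLED bit position below is illustrative only, while the op encoding (REQ_OP_READ == 0, 8-bit mask) matches blk_types.h:

/*
 * Userspace sketch of blk_mq_get_hctx_type().
 */
#include <stdio.h>

#define REQ_OP_MASK	0xff
#define REQ_OP_READ	0
#define REQ_OP_WRITE	1
#define REQ_POLLED	(1u << 26)	/* illustrative bit position */

enum hctx_type { HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL };

static enum hctx_type get_hctx_type(unsigned int opf)
{
	if (opf & REQ_POLLED)
		return HCTX_TYPE_POLL;		/* polled I/O gets its own map */
	if ((opf & REQ_OP_MASK) == REQ_OP_READ)
		return HCTX_TYPE_READ;
	return HCTX_TYPE_DEFAULT;
}

int main(void)
{
	printf("%d %d %d\n",
	       get_hctx_type(REQ_OP_READ),			/* 1: read map */
	       get_hctx_type(REQ_OP_WRITE),			/* 0: default  */
	       get_hctx_type(REQ_OP_READ | REQ_POLLED));	/* 2: poll map */
	return 0;
}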
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index e83af7bc7591..d3a75693adbf 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -294,8 +294,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
void rq_qos_exit(struct request_queue *q)
{
- blk_mq_debugfs_unregister_queue_rqos(q);
-
while (q->rq_qos) {
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 68267007da1c..0e46052b018a 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -104,8 +104,11 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
blk_mq_unfreeze_queue(q);
- if (rqos->ops->debugfs_attrs)
+ if (rqos->ops->debugfs_attrs) {
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_rqos(rqos);
+ mutex_unlock(&q->debugfs_mutex);
+ }
}
static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
@@ -129,7 +132,9 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
blk_mq_unfreeze_queue(q);
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_rqos(rqos);
+ mutex_unlock(&q->debugfs_mutex);
}
typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 88bd41d4cb59..69e53d1a4f0e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -274,6 +274,11 @@ static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page
return queue_var_show(q->limits.virt_boundary_mask, page);
}
+static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(queue_dma_alignment(q), page);
+}
+
#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \
static ssize_t \
queue_##name##_show(struct request_queue *q, char *page) \
@@ -606,6 +611,7 @@ QUEUE_RO_ENTRY(queue_dax, "dax");
QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
+QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
@@ -667,6 +673,7 @@ static struct attribute *queue_attrs[] = {
&blk_throtl_sample_time_entry.attr,
#endif
&queue_virt_boundary_mask_entry.attr,
+ &queue_dma_alignment_entry.attr,
NULL,
};
@@ -779,20 +786,12 @@ static void blk_release_queue(struct kobject *kobj)
if (queue_is_mq(q))
blk_mq_release(q);
- blk_trace_shutdown(q);
- mutex_lock(&q->debugfs_mutex);
- debugfs_remove_recursive(q->debugfs_dir);
- mutex_unlock(&q->debugfs_mutex);
-
- if (queue_is_mq(q))
- blk_mq_debugfs_unregister(q);
-
bioset_exit(&q->bio_split);
if (blk_queue_has_srcu(q))
cleanup_srcu_struct(q->srcu);
- ida_simple_remove(&blk_queue_ida, q->id);
+ ida_free(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
@@ -836,17 +835,16 @@ int blk_register_queue(struct gendisk *disk)
goto unlock;
}
+ if (queue_is_mq(q))
+ __blk_mq_register_dev(dev, q);
+ mutex_lock(&q->sysfs_lock);
+
mutex_lock(&q->debugfs_mutex);
q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
blk_debugfs_root);
- mutex_unlock(&q->debugfs_mutex);
-
- if (queue_is_mq(q)) {
- __blk_mq_register_dev(dev, q);
+ if (queue_is_mq(q))
blk_mq_debugfs_register(q);
- }
-
- mutex_lock(&q->sysfs_lock);
+ mutex_unlock(&q->debugfs_mutex);
ret = disk_register_independent_access_ranges(disk, NULL);
if (ret)
@@ -948,8 +946,15 @@ void blk_unregister_queue(struct gendisk *disk)
/* Now that we've deleted all child objects, we can delete the queue. */
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
-
mutex_unlock(&q->sysfs_dir_lock);
+ mutex_lock(&q->debugfs_mutex);
+ blk_trace_shutdown(q);
+ debugfs_remove_recursive(q->debugfs_dir);
+ q->debugfs_dir = NULL;
+ q->sched_debugfs_dir = NULL;
+ q->rqos_debugfs_dir = NULL;
+ mutex_unlock(&q->debugfs_mutex);
+
kobject_put(&disk_to_dev(disk)->kobj);
}
diff --git a/block/blk.h b/block/blk.h
index 434017701403..8e79296ee97a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -159,6 +159,19 @@ static inline bool blk_discard_mergable(struct request *req)
return false;
}
+static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
+ int op)
+{
+ if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
+ return min(q->limits.max_discard_sectors,
+ UINT_MAX >> SECTOR_SHIFT);
+
+ if (unlikely(op == REQ_OP_WRITE_ZEROES))
+ return q->limits.max_write_zeroes_sectors;
+
+ return q->limits.max_sectors;
+}
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
diff --git a/block/bounce.c b/block/bounce.c
index 8f7b6fe3b4db..c8f487af7be3 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -205,19 +205,26 @@ void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
int rw = bio_data_dir(*bio_orig);
struct bio_vec *to, from;
struct bvec_iter iter;
- unsigned i = 0;
+ unsigned i = 0, bytes = 0;
bool bounce = false;
- int sectors = 0;
+ int sectors;
bio_for_each_segment(from, *bio_orig, iter) {
if (i++ < BIO_MAX_VECS)
- sectors += from.bv_len >> 9;
+ bytes += from.bv_len;
if (PageHighMem(from.bv_page))
bounce = true;
}
if (!bounce)
return;
+ /*
+ * Individual bvecs might not be logical block aligned. Round down
+ * the split size so that each bio is properly block size aligned,
+ * even if we do not use the full hardware limits.
+ */
+ sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
+ SECTOR_SHIFT;
if (sectors < bio_sectors(*bio_orig)) {
bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
bio_chain(bio, *bio_orig);
diff --git a/block/fops.c b/block/fops.c
index d6b3276a6c68..86d3cab9bf93 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -42,6 +42,13 @@ static unsigned int dio_bio_write_op(struct kiocb *iocb)
return op;
}
+static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
+ struct iov_iter *iter)
+{
+ return pos & (bdev_logical_block_size(bdev) - 1) ||
+ !bdev_iter_is_aligned(bdev, iter);
+}
+
#define DIO_INLINE_BIO_VECS 4
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
@@ -54,8 +61,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct bio bio;
ssize_t ret;
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
+ if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (nr_pages <= DIO_INLINE_BIO_VECS)
@@ -173,8 +179,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
+ if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
@@ -298,8 +303,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
loff_t pos = iocb->ki_pos;
int ret = 0;
- if ((pos | iov_iter_alignment(iter)) &
- (bdev_logical_block_size(bdev) - 1))
+ if (blkdev_dio_unaligned(bdev, pos, iter))
return -EINVAL;
if (iocb->ki_flags & IOCB_ALLOC_CACHE)
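
The new blkdev_dio_unaligned() helper above rejects direct I/O whose file offset is not logical-block aligned or whose iovec fails bdev_iter_is_aligned(), which also takes the queue's DMA alignment into account (the limit now exported through the dma_alignment sysfs attribute added in blk-sysfs.c). A userspace sketch of just the offset/length part of that check, with made-up values:

/*
 * Userspace sketch of the offset/length alignment test behind
 * blkdev_dio_unaligned().
 */
#include <stdbool.h>
#include <stdio.h>

static bool dio_unaligned(unsigned long long pos, unsigned long len,
			  unsigned int lbs)
{
	/* Both the file offset and the total length must be lbs-aligned. */
	return (pos | len) & (lbs - 1);
}

int main(void)
{
	printf("%d\n", dio_unaligned(4096, 8192, 512));	/* 0: accepted */
	printf("%d\n", dio_unaligned(4100, 8192, 512));	/* 1: rejected */
	return 0;
}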
diff --git a/block/genhd.c b/block/genhd.c
index 27205ae47d59..278227ba1d53 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -623,6 +623,7 @@ void del_gendisk(struct gendisk *disk)
* Prevent new I/O from crossing bio_queue_enter().
*/
blk_queue_start_drain(q);
+ blk_mq_freeze_queue_wait(q);
if (!(disk->flags & GENHD_FL_HIDDEN)) {
sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
@@ -646,12 +647,21 @@ void del_gendisk(struct gendisk *disk)
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
device_del(disk_to_dev(disk));
- blk_mq_freeze_queue_wait(q);
-
blk_throtl_cancel_bios(disk->queue);
blk_sync_queue(q);
blk_flush_integrity();
+ blk_mq_cancel_work_sync(q);
+
+ blk_mq_quiesce_queue(q);
+ if (q->elevator) {
+ mutex_lock(&q->sysfs_lock);
+ elevator_exit(q);
+ mutex_unlock(&q->sysfs_lock);
+ }
+ rq_qos_exit(q);
+ blk_mq_unquiesce_queue(q);
+
/*
* Allow using passthrough request again after the queue is torn down.
*/
@@ -1120,31 +1130,6 @@ static const struct attribute_group *disk_attr_groups[] = {
NULL
};
-static void disk_release_mq(struct request_queue *q)
-{
- blk_mq_cancel_work_sync(q);
-
- /*
- * There can't be any non non-passthrough bios in flight here, but
- * requests stay around longer, including passthrough ones so we
- * still need to freeze the queue here.
- */
- blk_mq_freeze_queue(q);
-
- /*
- * Since the I/O scheduler exit code may access cgroup information,
- * perform I/O scheduler exit before disassociating from the block
- * cgroup controller.
- */
- if (q->elevator) {
- mutex_lock(&q->sysfs_lock);
- elevator_exit(q);
- mutex_unlock(&q->sysfs_lock);
- }
- rq_qos_exit(q);
- __blk_mq_unfreeze_queue(q, true);
-}
-
/**
* disk_release - releases all allocated resources of the gendisk
* @dev: the device representing this disk
@@ -1166,9 +1151,6 @@ static void disk_release(struct device *dev)
might_sleep();
WARN_ON_ONCE(disk_live(disk));
- if (queue_is_mq(disk->queue))
- disk_release_mq(disk->queue);
-
blkcg_exit_queue(disk->queue);
disk_release_events(disk);
diff --git a/block/holder.c b/block/holder.c
index 8d750281a1cd..5283bc804cc1 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -79,10 +79,6 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
WARN_ON_ONCE(!bdev->bd_holder);
- /* FIXME: remove the following once add_disk() handles errors */
- if (WARN_ON(!bdev->bd_holder_dir))
- goto out_unlock;
-
holder = bd_find_holder_disk(bdev, disk);
if (holder) {
holder->refcnt++;
diff --git a/block/ioprio.c b/block/ioprio.c
index 2fe068fcaad5..32a456b45804 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -138,6 +138,32 @@ out:
return ret;
}
+/*
+ * If the task has set an I/O priority, use that. Otherwise, return
+ * the default I/O priority.
+ *
+ * Expected to be called for current task or with task_lock() held to keep
+ * io_context stable.
+ */
+int __get_task_ioprio(struct task_struct *p)
+{
+ struct io_context *ioc = p->io_context;
+ int prio;
+
+ if (p != current)
+ lockdep_assert_held(&p->alloc_lock);
+ if (ioc)
+ prio = ioc->ioprio;
+ else
+ prio = IOPRIO_DEFAULT;
+
+ if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
+ prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
+ task_nice_ioprio(p));
+ return prio;
+}
+EXPORT_SYMBOL_GPL(__get_task_ioprio);
+
static int get_task_ioprio(struct task_struct *p)
{
int ret;
@@ -145,22 +171,38 @@ static int get_task_ioprio(struct task_struct *p)
ret = security_task_getioprio(p);
if (ret)
goto out;
- ret = IOPRIO_DEFAULT;
+ task_lock(p);
+ ret = __get_task_ioprio(p);
+ task_unlock(p);
+out:
+ return ret;
+}
+
+/*
+ * Return raw IO priority value as set by userspace. We use this for
+ * ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and
+ * also so that userspace can distinguish unset IO priority (which just gets
+ * overriden based on task's nice value) from IO priority set to some value.
+ */
+static int get_task_raw_ioprio(struct task_struct *p)
+{
+ int ret;
+
+ ret = security_task_getioprio(p);
+ if (ret)
+ goto out;
task_lock(p);
if (p->io_context)
ret = p->io_context->ioprio;
+ else
+ ret = IOPRIO_DEFAULT;
task_unlock(p);
out:
return ret;
}
-int ioprio_best(unsigned short aprio, unsigned short bprio)
+static int ioprio_best(unsigned short aprio, unsigned short bprio)
{
- if (!ioprio_valid(aprio))
- aprio = IOPRIO_DEFAULT;
- if (!ioprio_valid(bprio))
- bprio = IOPRIO_DEFAULT;
-
return min(aprio, bprio);
}
@@ -181,7 +223,7 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
else
p = find_task_by_vpid(who);
if (p)
- ret = get_task_ioprio(p);
+ ret = get_task_raw_ioprio(p);
break;
case IOPRIO_WHO_PGRP:
if (!who)
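
__get_task_ioprio() above falls back to the task's nice value when no I/O priority was set explicitly: ordinary CFS tasks get IOPRIO_CLASS_BE, and the nice range -20..19 maps onto best-effort levels 0..7. A userspace sketch of that mapping:

/*
 * Userspace sketch of the nice-value fallback used by __get_task_ioprio().
 */
#include <stdio.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_BE		2
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))

static int nice_to_ioprio(int nice)
{
	/* nice -20..19  ->  best-effort level 0 (highest) .. 7 (lowest) */
	return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, (nice + 20) / 5);
}

int main(void)
{
	printf("nice 0   -> ioprio 0x%x (be,4)\n", nice_to_ioprio(0));
	printf("nice -20 -> ioprio 0x%x (be,0)\n", nice_to_ioprio(-20));
	printf("nice 19  -> ioprio 0x%x (be,7)\n", nice_to_ioprio(19));
	return 0;
}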
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index 70ff2a599ef6..8f7c745b4a57 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -421,6 +421,8 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
blk_stat_enable_accounting(q);
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
+
eq->elevator_data = kqd;
q->elevator = eq;
@@ -1033,7 +1035,6 @@ static struct elevator_type kyber_sched = {
#endif
.elevator_attrs = kyber_sched_attrs,
.elevator_name = "kyber",
- .elevator_features = ELEVATOR_F_MQ_AWARE,
.elevator_owner = THIS_MODULE,
};
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 6ed602b2f80a..1a9e835e816c 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -642,6 +642,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
+ /* We dispatch from request queue wide instead of hw queue */
+ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
+
q->elevator = eq;
return 0;